xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp (revision 5e801ac66d24704442eba426ed13c3effb8a34e7)
1 //===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass lowers the pseudo control flow instructions to real
11 /// machine instructions.
12 ///
13 /// All control flow is handled using predicated instructions and
14 /// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
15 /// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
16 /// by writing to the 64-bit EXEC register (each bit corresponds to a
17 /// single Vector ALU).  Typically, for predicates, a Vector ALU will write
18 /// to its bit of the VCC register (like EXEC, VCC is 64 bits, one bit per
19 /// Vector ALU) and then the Scalar ALU will AND the VCC register with
20 /// EXEC to update the predicates.
21 ///
22 /// For example:
23 /// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2
24 /// %sgpr0 = SI_IF %vcc
25 ///   %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
26 /// %sgpr0 = SI_ELSE %sgpr0
27 ///   %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
28 /// SI_END_CF %sgpr0
29 ///
30 /// becomes:
31 ///
32 /// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc  // Save and update the exec mask
33 /// %sgpr0 = S_XOR_B64 %sgpr0, %exec  // Clear live bits from saved exec mask
34 /// S_CBRANCH_EXECZ label0            // This instruction is an optional
35 ///                                   // optimization which allows us to
36 ///                                   // branch if all the bits of
37 ///                                   // EXEC are zero.
38 /// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
39 ///
40 /// label0:
41 /// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0  // Restore the exec mask for the ELSE
42 ///                                    // block
43 /// %exec = S_XOR_B64 %sgpr0, %exec    // Update the exec mask
44 /// S_CBRANCH_EXECZ label1             // Use our branch optimization
45 ///                                    // instruction again.
46 /// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0  // Do the ELSE block
47 /// label1:
48 /// %exec = S_OR_B64 %exec, %sgpr0     // Re-enable saved exec mask bits
49 //===----------------------------------------------------------------------===//
50 
51 #include "AMDGPU.h"
52 #include "GCNSubtarget.h"
53 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
54 #include "llvm/ADT/SmallSet.h"
55 #include "llvm/CodeGen/LiveIntervals.h"
56 #include "llvm/CodeGen/LiveVariables.h"
57 #include "llvm/CodeGen/MachineDominators.h"
58 #include "llvm/CodeGen/MachineFunctionPass.h"
59 
60 using namespace llvm;
61 
62 #define DEBUG_TYPE "si-lower-control-flow"
63 
64 static cl::opt<bool>
65 RemoveRedundantEndcf("amdgpu-remove-redundant-endcf",
66     cl::init(true), cl::ReallyHidden);
67 
68 namespace {
69 
70 class SILowerControlFlow : public MachineFunctionPass {
71 private:
72   const SIRegisterInfo *TRI = nullptr;
73   const SIInstrInfo *TII = nullptr;
74   LiveIntervals *LIS = nullptr;
75   LiveVariables *LV = nullptr;
76   MachineDominatorTree *MDT = nullptr;
77   MachineRegisterInfo *MRI = nullptr;
78   SetVector<MachineInstr*> LoweredEndCf;
79   DenseSet<Register> LoweredIf;
80   SmallSet<MachineBasicBlock *, 4> KillBlocks;
81 
82   const TargetRegisterClass *BoolRC = nullptr;
83   unsigned AndOpc;
84   unsigned OrOpc;
85   unsigned XorOpc;
86   unsigned MovTermOpc;
87   unsigned Andn2TermOpc;
88   unsigned XorTermOpc;
89   unsigned OrTermOpc;
90   unsigned OrSaveExecOpc;
91   unsigned Exec;
92 
93   bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);
94 
95   void emitIf(MachineInstr &MI);
96   void emitElse(MachineInstr &MI);
97   void emitIfBreak(MachineInstr &MI);
98   void emitLoop(MachineInstr &MI);
99 
100   MachineBasicBlock *emitEndCf(MachineInstr &MI);
101 
102   void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
103 
104   void findMaskOperands(MachineInstr &MI, unsigned OpNo,
105                         SmallVectorImpl<MachineOperand> &Src) const;
106 
107   void combineMasks(MachineInstr &MI);
108 
109   bool removeMBBifRedundant(MachineBasicBlock &MBB);
110 
111   MachineBasicBlock *process(MachineInstr &MI);
112 
113   // Skip to the next instruction, ignoring debug instructions, and trivial
114   // block boundaries (blocks that have one (typically fallthrough) successor,
115   // and the successor has one predecessor).
116   MachineBasicBlock::iterator
117   skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
118                                  MachineBasicBlock::iterator It) const;
119 
120   /// Find the insertion point for a new conditional branch.
121   MachineBasicBlock::iterator
122   skipToUncondBrOrEnd(MachineBasicBlock &MBB,
123                       MachineBasicBlock::iterator I) const {
124     assert(I->isTerminator());
125 
126     // FIXME: What if we had multiple pre-existing conditional branches?
127     MachineBasicBlock::iterator End = MBB.end();
128     while (I != End && !I->isUnconditionalBranch())
129       ++I;
130     return I;
131   }
132 
133   // Remove redundant SI_END_CF instructions.
134   void optimizeEndCf();
135 
136 public:
137   static char ID;
138 
139   SILowerControlFlow() : MachineFunctionPass(ID) {}
140 
141   bool runOnMachineFunction(MachineFunction &MF) override;
142 
143   StringRef getPassName() const override {
144     return "SI Lower control flow pseudo instructions";
145   }
146 
147   void getAnalysisUsage(AnalysisUsage &AU) const override {
148     // Should preserve the same set that TwoAddressInstructions does.
149     AU.addPreserved<MachineDominatorTree>();
150     AU.addPreserved<SlotIndexes>();
151     AU.addPreserved<LiveIntervals>();
152     AU.addPreservedID(LiveVariablesID);
153     MachineFunctionPass::getAnalysisUsage(AU);
154   }
155 };
156 
157 } // end anonymous namespace
158 
159 char SILowerControlFlow::ID = 0;
160 
161 INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
162                "SI lower control flow", false, false)
163 
164 static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
165   MachineOperand &ImpDefSCC = MI.getOperand(3);
166   assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
167 
168   ImpDefSCC.setIsDead(IsDead);
169 }
170 
171 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
172 
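// Return true if any block reachable from Begin's successors, without passing
// through End, contains a kill terminator or demote (as recorded in
// KillBlocks).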
173 bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin,
174                                  const MachineBasicBlock *End) {
175   DenseSet<const MachineBasicBlock*> Visited;
176   SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());
177 
178   while (!Worklist.empty()) {
179     MachineBasicBlock *MBB = Worklist.pop_back_val();
180 
181     if (MBB == End || !Visited.insert(MBB).second)
182       continue;
183     if (KillBlocks.contains(MBB))
184       return true;
185 
186     Worklist.append(MBB->succ_begin(), MBB->succ_end());
187   }
188 
189   return false;
190 }
191 
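// An SI_IF is "simple" when the only non-debug use of its saved-exec result
// is the matching SI_END_CF; in that case the full saved exec mask can be
// returned instead of only the cleared bits.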
192 static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
193   Register SaveExecReg = MI.getOperand(0).getReg();
194   auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
195 
196   if (U == MRI->use_instr_nodbg_end() ||
197       std::next(U) != MRI->use_instr_nodbg_end() ||
198       U->getOpcode() != AMDGPU::SI_END_CF)
199     return false;
200 
201   return true;
202 }
203 
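// SI_IF %dst, %cond, %target is rewritten into an explicit exec-mask update
// followed by a conditional skip. Roughly, for wave64 (register names are
// illustrative; the wave32 path uses the *_B32 opcodes and $exec_lo):
//
//   %copy = COPY $exec
//   %tmp  = S_AND_B64 %copy, %cond
//   %dst  = S_XOR_B64 %tmp, %copy    ; omitted for a "simple" if (see above)
//   $exec = S_MOV_B64_term killed %tmp
//   S_CBRANCH_EXECZ %target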
204 void SILowerControlFlow::emitIf(MachineInstr &MI) {
205   MachineBasicBlock &MBB = *MI.getParent();
206   const DebugLoc &DL = MI.getDebugLoc();
207   MachineBasicBlock::iterator I(&MI);
208   Register SaveExecReg = MI.getOperand(0).getReg();
209   MachineOperand& Cond = MI.getOperand(1);
210   assert(Cond.getSubReg() == AMDGPU::NoSubRegister);
211 
212   MachineOperand &ImpDefSCC = MI.getOperand(4);
213   assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
214 
215   // If there is only one use of the save exec register and that use is
216   // SI_END_CF, we can optimize SI_IF by returning the full saved exec mask
217   // instead of just the cleared bits.
218   bool SimpleIf = isSimpleIf(MI, MRI);
219 
220   if (SimpleIf) {
221     // Check for SI_KILL_*_TERMINATOR on any path from the if to the endif.
222     // If there is any such terminator, the simplification is not safe.
223     auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
224     SimpleIf = !hasKill(MI.getParent(), UseMI->getParent());
225   }
226 
227   // Add an implicit def of exec to discourage scheduling VALU instructions
228   // after this, which would interfere with forming s_and_saveexec_b64 later.
229   Register CopyReg = SimpleIf ? SaveExecReg
230                        : MRI->createVirtualRegister(BoolRC);
231   MachineInstr *CopyExec =
232     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
233     .addReg(Exec)
234     .addReg(Exec, RegState::ImplicitDefine);
235   LoweredIf.insert(CopyReg);
236 
237   Register Tmp = MRI->createVirtualRegister(BoolRC);
238 
239   MachineInstr *And =
240     BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
241     .addReg(CopyReg)
242     .add(Cond);
243   if (LV)
244     LV->replaceKillInstruction(Cond.getReg(), MI, *And);
245 
246   setImpSCCDefDead(*And, true);
247 
248   MachineInstr *Xor = nullptr;
249   if (!SimpleIf) {
250     Xor =
251       BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg)
252       .addReg(Tmp)
253       .addReg(CopyReg);
254     setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
255   }
256 
257   // Use a copy that is a terminator to get correct spill code placement with
258   // the fast register allocator.
259   MachineInstr *SetExec =
260     BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
261     .addReg(Tmp, RegState::Kill);
262   if (LV)
263     LV->getVarInfo(Tmp).Kills.push_back(SetExec);
264 
265   // Skip ahead to the unconditional branch in case there are other terminators
266   // present.
267   I = skipToUncondBrOrEnd(MBB, I);
268 
269   // Insert the S_CBRANCH_EXECZ instruction which will be optimized later
270   // during SIRemoveShortExecBranches.
271   MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
272                             .add(MI.getOperand(2));
273 
274   if (!LIS) {
275     MI.eraseFromParent();
276     return;
277   }
278 
279   LIS->InsertMachineInstrInMaps(*CopyExec);
280 
281   // Replace MI with the AND so we don't need to fix the live interval for the
282   // condition register.
283   LIS->ReplaceMachineInstrInMaps(MI, *And);
284 
285   if (!SimpleIf)
286     LIS->InsertMachineInstrInMaps(*Xor);
287   LIS->InsertMachineInstrInMaps(*SetExec);
288   LIS->InsertMachineInstrInMaps(*NewBr);
289 
290   LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
291   MI.eraseFromParent();
292 
293   // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
294   // hard to add another def here but I'm not sure how to correctly update the
295   // valno.
296   LIS->removeInterval(SaveExecReg);
297   LIS->createAndComputeVirtRegInterval(SaveExecReg);
298   LIS->createAndComputeVirtRegInterval(Tmp);
299   if (!SimpleIf)
300     LIS->createAndComputeVirtRegInterval(CopyReg);
301 }
302 
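// SI_ELSE %dst, %saved, %target inverts the active lane set relative to the
// saved mask. Roughly, for wave64 (illustrative register names):
//
//   %tmp  = S_OR_SAVEEXEC_B64 %saved   ; at the block start, before any phis
//   %dst  = S_AND_B64 $exec, %tmp
//   $exec = S_XOR_B64_term $exec, %dst
//   S_CBRANCH_EXECZ %target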
303 void SILowerControlFlow::emitElse(MachineInstr &MI) {
304   MachineBasicBlock &MBB = *MI.getParent();
305   const DebugLoc &DL = MI.getDebugLoc();
306 
307   Register DstReg = MI.getOperand(0).getReg();
308 
309   MachineBasicBlock::iterator Start = MBB.begin();
310 
311   // This must be inserted before phis and any spill code inserted before the
312   // else.
313   Register SaveReg = MRI->createVirtualRegister(BoolRC);
314   MachineInstr *OrSaveExec =
315     BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
316     .add(MI.getOperand(1)); // Saved EXEC
317   if (LV)
318     LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *OrSaveExec);
319 
320   MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
321 
322   MachineBasicBlock::iterator ElsePt(MI);
323 
324   // This accounts for any modification of the EXEC mask within the block and
325   // can be optimized out pre-RA when not required.
326   MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
327                           .addReg(Exec)
328                           .addReg(SaveReg);
329 
330   if (LIS)
331     LIS->InsertMachineInstrInMaps(*And);
332 
333   MachineInstr *Xor =
334     BuildMI(MBB, ElsePt, DL, TII->get(XorTermOpc), Exec)
335     .addReg(Exec)
336     .addReg(DstReg);
337 
338   // Skip ahead to the unconditional branch in case there are other terminators
339   // present.
340   ElsePt = skipToUncondBrOrEnd(MBB, ElsePt);
341 
342   MachineInstr *Branch =
343       BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
344           .addMBB(DestBB);
345 
346   if (!LIS) {
347     MI.eraseFromParent();
348     return;
349   }
350 
351   LIS->RemoveMachineInstrFromMaps(MI);
352   MI.eraseFromParent();
353 
354   LIS->InsertMachineInstrInMaps(*OrSaveExec);
355 
356   LIS->InsertMachineInstrInMaps(*Xor);
357   LIS->InsertMachineInstrInMaps(*Branch);
358 
359   LIS->removeInterval(DstReg);
360   LIS->createAndComputeVirtRegInterval(DstReg);
361   LIS->createAndComputeVirtRegInterval(SaveReg);
362 
363   // Let this be recomputed.
364   LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
365 }
366 
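// SI_IF_BREAK %dst, %break_cond, %loop_mask accumulates the lanes that want
// to leave the loop. Roughly, for wave64 (illustrative names):
//
//   %tmp = S_AND_B64 $exec, %break_cond
//   %dst = S_OR_B64 %tmp, %loop_mask
//
// The AND is skipped when %break_cond is already masked by exec (see below).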
367 void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
368   MachineBasicBlock &MBB = *MI.getParent();
369   const DebugLoc &DL = MI.getDebugLoc();
370   auto Dst = MI.getOperand(0).getReg();
371 
372   // Skip ANDing with exec if the break condition is already masked by exec
373   // because it is a V_CMP in the same basic block. (We know the break
374   // condition operand was an i1 in IR, so if it is a VALU instruction it must
375   // be one with a carry-out.)
376   bool SkipAnding = false;
377   if (MI.getOperand(1).isReg()) {
378     if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
379       SkipAnding = Def->getParent() == MI.getParent()
380           && SIInstrInfo::isVALU(*Def);
381     }
382   }
383 
384   // AND the break condition operand with exec, then OR that into the "loop
385   // exit" mask.
386   MachineInstr *And = nullptr, *Or = nullptr;
387   if (!SkipAnding) {
388     Register AndReg = MRI->createVirtualRegister(BoolRC);
389     And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg)
390              .addReg(Exec)
391              .add(MI.getOperand(1));
392     if (LV)
393       LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *And);
394     Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
395              .addReg(AndReg)
396              .add(MI.getOperand(2));
397     if (LIS)
398       LIS->createAndComputeVirtRegInterval(AndReg);
399   } else {
400     Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
401              .add(MI.getOperand(1))
402              .add(MI.getOperand(2));
403     if (LV)
404       LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *Or);
405   }
406   if (LV)
407     LV->replaceKillInstruction(MI.getOperand(2).getReg(), MI, *Or);
408 
409   if (LIS) {
410     if (And)
411       LIS->InsertMachineInstrInMaps(*And);
412     LIS->ReplaceMachineInstrInMaps(MI, *Or);
413   }
414 
415   MI.eraseFromParent();
416 }
417 
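// SI_LOOP %loop_mask, %header disables the lanes that have finished the loop
// and branches back while any lane remains active. Roughly, for wave64:
//
//   $exec = S_ANDN2_B64_term $exec, %loop_mask
//   S_CBRANCH_EXECNZ %header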
418 void SILowerControlFlow::emitLoop(MachineInstr &MI) {
419   MachineBasicBlock &MBB = *MI.getParent();
420   const DebugLoc &DL = MI.getDebugLoc();
421 
422   MachineInstr *AndN2 =
423       BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec)
424           .addReg(Exec)
425           .add(MI.getOperand(0));
426 
427   auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator());
428   MachineInstr *Branch =
429       BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
430           .add(MI.getOperand(1));
431 
432   if (LIS) {
433     LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
434     LIS->InsertMachineInstrInMaps(*Branch);
435   }
436 
437   MI.eraseFromParent();
438 }
439 
440 MachineBasicBlock::iterator
441 SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
442   MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
443 
444   SmallSet<const MachineBasicBlock *, 4> Visited;
445   MachineBasicBlock *B = &MBB;
446   do {
447     if (!Visited.insert(B).second)
448       return MBB.end();
449 
450     auto E = B->end();
451     for ( ; It != E; ++It) {
452       if (TII->mayReadEXEC(*MRI, *It))
453         break;
454     }
455 
456     if (It != E)
457       return It;
458 
459     if (B->succ_size() != 1)
460       return MBB.end();
461 
462     // If there is one trivial successor, advance to the next block.
463     MachineBasicBlock *Succ = *B->succ_begin();
464 
465     It = Succ->begin();
466     B = Succ;
467   } while (true);
468 }
469 
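// SI_END_CF %saved re-enables the lanes that were masked off for the
// conditional region. Roughly, for wave64:
//
//   $exec = S_OR_B64 $exec, %saved
//
// The OR is normally placed at the start of the block; if %saved is modified
// earlier in the block, the block is split and the terminator form
// S_OR_B64_term is used at the split point instead.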
470 MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
471   MachineBasicBlock &MBB = *MI.getParent();
472   const DebugLoc &DL = MI.getDebugLoc();
473 
474   MachineBasicBlock::iterator InsPt = MBB.begin();
475 
476   // If we have instructions that aren't prolog instructions, split the block
477   // and emit a terminator instruction. This ensures correct spill placement.
478   // FIXME: We should unconditionally split the block here.
479   bool NeedBlockSplit = false;
480   Register DataReg = MI.getOperand(0).getReg();
481   for (MachineBasicBlock::iterator I = InsPt, E = MI.getIterator();
482        I != E; ++I) {
483     if (I->modifiesRegister(DataReg, TRI)) {
484       NeedBlockSplit = true;
485       break;
486     }
487   }
488 
489   unsigned Opcode = OrOpc;
490   MachineBasicBlock *SplitBB = &MBB;
491   if (NeedBlockSplit) {
492     SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
493     if (MDT && SplitBB != &MBB) {
494       MachineDomTreeNode *MBBNode = (*MDT)[&MBB];
495       SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(),
496                                                  MBBNode->end());
497       MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB);
498       for (MachineDomTreeNode *Child : Children)
499         MDT->changeImmediateDominator(Child, SplitBBNode);
500     }
501     Opcode = OrTermOpc;
502     InsPt = MI;
503   }
504 
505   MachineInstr *NewMI =
506     BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec)
507     .addReg(Exec)
508     .add(MI.getOperand(0));
509   if (LV)
510     LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *NewMI);
511 
512   LoweredEndCf.insert(NewMI);
513 
514   if (LIS)
515     LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
516 
517   MI.eraseFromParent();
518 
519   if (LIS)
520     LIS->handleMove(*NewMI);
521   return SplitBB;
522 }
523 
524 // Returns the replacement operands for a logical operation: a single operand
525 // (exec or a copy of it), or two operands if the source was an equivalent op.
526 void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
527        SmallVectorImpl<MachineOperand> &Src) const {
528   MachineOperand &Op = MI.getOperand(OpNo);
529   if (!Op.isReg() || !Op.getReg().isVirtual()) {
530     Src.push_back(Op);
531     return;
532   }
533 
534   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
535   if (!Def || Def->getParent() != MI.getParent() ||
536       !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
537     return;
538 
539   // Make sure we do not modify exec between the def and the use.
540   // A copy with an implicitly defined exec inserted earlier is an exception;
541   // it does not really modify exec.
542   for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
543     if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
544         !(I->isCopy() && I->getOperand(0).getReg() != Exec))
545       return;
546 
547   for (const auto &SrcOp : Def->explicit_operands())
548     if (SrcOp.isReg() && SrcOp.isUse() &&
549         (SrcOp.getReg().isVirtual() || SrcOp.getReg() == Exec))
550       Src.push_back(SrcOp);
551 }
552 
553 // Search and combine pairs of equivalent instructions, like
554 // S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
555 // S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
556 // One of the operands is the exec mask.
557 void SILowerControlFlow::combineMasks(MachineInstr &MI) {
558   assert(MI.getNumExplicitOperands() == 3);
559   SmallVector<MachineOperand, 4> Ops;
560   unsigned OpToReplace = 1;
561   findMaskOperands(MI, 1, Ops);
562   if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
563   findMaskOperands(MI, 2, Ops);
564   if (Ops.size() != 3) return;
565 
566   unsigned UniqueOpndIdx;
567   if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
568   else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
569   else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
570   else return;
571 
572   Register Reg = MI.getOperand(OpToReplace).getReg();
573   MI.RemoveOperand(OpToReplace);
574   MI.addOperand(Ops[UniqueOpndIdx]);
575   if (MRI->use_empty(Reg))
576     MRI->getUniqueVRegDef(Reg)->eraseFromParent();
577 }
578 
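// When lowered END_CFs from nested SI_IFs become adjacent, the inner exec
// restore is redundant because the outer restore already re-enables those
// lanes, e.g. (wave64, illustrative):
//
//   $exec = S_OR_B64 $exec, %inner_saved   ; removable
//   $exec = S_OR_B64 $exec, %outer_saved   ; outer SI_IF mask covers it
//
// This only holds when the outer mask comes from an SI_IF; an SI_ELSE mask is
// inverted (see the check below).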
579 void SILowerControlFlow::optimizeEndCf() {
580   // If the only instruction immediately following this END_CF is another
581   // END_CF in the only successor, we can avoid restoring the exec mask here.
582   if (!RemoveRedundantEndcf)
583     return;
584 
585   for (MachineInstr *MI : LoweredEndCf) {
586     MachineBasicBlock &MBB = *MI->getParent();
587     auto Next =
588       skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
589     if (Next == MBB.end() || !LoweredEndCf.count(&*Next))
590       continue;
591     // Only skip the inner END_CF if the outer END_CF belongs to an SI_IF.
592     // If it belongs to an SI_ELSE then the saved mask has an inverted value.
593     Register SavedExec
594       = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg();
595     assert(SavedExec.isVirtual() && "Expected saved exec to be src1!");
596 
597     const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec);
598     if (Def && LoweredIf.count(SavedExec)) {
599       LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump());
600       if (LIS)
601         LIS->RemoveMachineInstrFromMaps(*MI);
602       Register Reg;
603       if (LV)
604         Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
605       MI->eraseFromParent();
606       if (LV)
607         LV->recomputeForSingleDefVirtReg(Reg);
608       removeMBBifRedundant(MBB);
609     }
610   }
611 }
612 
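// Lower a single control-flow pseudo and return the basic block that now
// contains the instructions following it (this differs from the original
// block when lowering had to split it). Afterwards, fold away redundant exec
// mask operations that the lowering may have created.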
613 MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
614   MachineBasicBlock &MBB = *MI.getParent();
615   MachineBasicBlock::iterator I(MI);
616   MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;
617 
618   MachineBasicBlock *SplitBB = &MBB;
619 
620   switch (MI.getOpcode()) {
621   case AMDGPU::SI_IF:
622     emitIf(MI);
623     break;
624 
625   case AMDGPU::SI_ELSE:
626     emitElse(MI);
627     break;
628 
629   case AMDGPU::SI_IF_BREAK:
630     emitIfBreak(MI);
631     break;
632 
633   case AMDGPU::SI_LOOP:
634     emitLoop(MI);
635     break;
636 
637   case AMDGPU::SI_WATERFALL_LOOP:
638     MI.setDesc(TII->get(AMDGPU::S_CBRANCH_EXECNZ));
639     break;
640 
641   case AMDGPU::SI_END_CF:
642     SplitBB = emitEndCf(MI);
643     break;
644 
645   default:
646     assert(false && "Attempt to process unsupported instruction");
647     break;
648   }
649 
650   MachineBasicBlock::iterator Next;
651   for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
652     Next = std::next(I);
653     MachineInstr &MaskMI = *I;
654     switch (MaskMI.getOpcode()) {
655     case AMDGPU::S_AND_B64:
656     case AMDGPU::S_OR_B64:
657     case AMDGPU::S_AND_B32:
658     case AMDGPU::S_OR_B32:
659       // Clean up bit manipulations on the exec mask
660       combineMasks(MaskMI);
661       break;
662     default:
663       I = MBB.end();
664       break;
665     }
666   }
667 
668   return SplitBB;
669 }
670 
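// Lower SI_INIT_EXEC (set exec to an immediate mask) and
// SI_INIT_EXEC_FROM_INPUT (enable the low N lanes, where N is taken from a
// bit-field of an SGPR argument) at the beginning of the block.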
671 void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
672                                        MachineInstr &MI) {
673   MachineFunction &MF = *MBB->getParent();
674   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
675   bool IsWave32 = ST.isWave32();
676 
677   if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
678     // This should be before all vector instructions.
679     BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
680             TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
681         .addImm(MI.getOperand(0).getImm());
682     if (LIS)
683       LIS->RemoveMachineInstrFromMaps(MI);
684     MI.eraseFromParent();
685     return;
686   }
687 
688   // Extract the thread count from an SGPR input and set EXEC accordingly.
689   // Since BFM can't shift by 64, handle that case with CMP + CMOV.
690   //
691   // S_BFE_U32 count, input, {shift, 7}
692   // S_BFM_B64 exec, count, 0
693   // S_CMP_EQ_U32 count, 64
694   // S_CMOV_B64 exec, -1
695   Register InputReg = MI.getOperand(0).getReg();
696   MachineInstr *FirstMI = &*MBB->begin();
697   if (InputReg.isVirtual()) {
698     MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
699     assert(DefInstr && DefInstr->isCopy());
700     if (DefInstr->getParent() == MBB) {
701       if (DefInstr != FirstMI) {
702         // If `InputReg` is defined in the current block, we also need to
703         // move that instruction to the beginning of the block.
704         DefInstr->removeFromParent();
705         MBB->insert(FirstMI, DefInstr);
706         if (LIS)
707           LIS->handleMove(*DefInstr);
708       } else {
709         // If the first instruction is the definition, move the pointer past it.
710         FirstMI = &*std::next(FirstMI->getIterator());
711       }
712     }
713   }
714 
715   // Insert instruction sequence at block beginning (before vector operations).
716   const DebugLoc DL = MI.getDebugLoc();
717   const unsigned WavefrontSize = ST.getWavefrontSize();
718   const unsigned Mask = (WavefrontSize << 1) - 1;
719   Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
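  // Note on the S_BFE_U32 immediate below: the low bits hold the bit offset
  // (taken from operand 1) and bits [22:16] hold the field width, so 0x70000
  // requests a 7-bit field, wide enough for a lane count of up to 64.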
720   auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
721                    .addReg(InputReg)
722                    .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
723   if (LV)
724     LV->recomputeForSingleDefVirtReg(InputReg);
725   auto BfmMI =
726       BuildMI(*MBB, FirstMI, DL,
727               TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
728           .addReg(CountReg)
729           .addImm(0);
730   auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
731                    .addReg(CountReg, RegState::Kill)
732                    .addImm(WavefrontSize);
733   if (LV)
734     LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
735   auto CmovMI =
736       BuildMI(*MBB, FirstMI, DL,
737               TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
738               Exec)
739           .addImm(-1);
740 
741   if (!LIS) {
742     MI.eraseFromParent();
743     return;
744   }
745 
746   LIS->RemoveMachineInstrFromMaps(MI);
747   MI.eraseFromParent();
748 
749   LIS->InsertMachineInstrInMaps(*BfeMI);
750   LIS->InsertMachineInstrInMaps(*BfmMI);
751   LIS->InsertMachineInstrInMaps(*CmpMI);
752   LIS->InsertMachineInstrInMaps(*CmovMI);
753 
754   LIS->removeInterval(InputReg);
755   LIS->createAndComputeVirtRegInterval(InputReg);
756   LIS->createAndComputeVirtRegInterval(CountReg);
757 }
758 
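// Erase MBB if it contains nothing but debug instructions and an
// unconditional branch: redirect its predecessors to its single successor,
// then patch up fallthrough, dominator-tree and live-interval information.
// Returns true if the block was removed.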
759 bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
760   for (auto &I : MBB.instrs()) {
761     if (!I.isDebugInstr() && !I.isUnconditionalBranch())
762       return false;
763   }
764 
765   assert(MBB.succ_size() == 1 && "MBB has more than one successor");
766 
767   MachineBasicBlock *Succ = *MBB.succ_begin();
768   MachineBasicBlock *FallThrough = nullptr;
769 
770   while (!MBB.predecessors().empty()) {
771     MachineBasicBlock *P = *MBB.pred_begin();
772     if (P->getFallThrough() == &MBB)
773       FallThrough = P;
774     P->ReplaceUsesOfBlockWith(&MBB, Succ);
775   }
776   MBB.removeSuccessor(Succ);
777   if (LIS) {
778     for (auto &I : MBB.instrs())
779       LIS->RemoveMachineInstrFromMaps(I);
780   }
781   if (MDT) {
782     // If Succ, the single successor of MBB, is dominated by MBB, MDT needs
783     // updating by changing Succ's idom to the one of MBB; otherwise, MBB must
784     // be a leaf node in MDT and could be erased directly.
785     if (MDT->dominates(&MBB, Succ))
786       MDT->changeImmediateDominator(MDT->getNode(Succ),
787                                     MDT->getNode(&MBB)->getIDom());
788     MDT->eraseNode(&MBB);
789   }
790   MBB.clear();
791   MBB.eraseFromParent();
792   if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
793     if (!Succ->canFallThrough()) {
794       MachineFunction *MF = FallThrough->getParent();
795       MachineFunction::iterator FallThroughPos(FallThrough);
796       MF->splice(std::next(FallThroughPos), Succ);
797     } else
798       BuildMI(*FallThrough, FallThrough->end(),
799               FallThrough->findBranchDebugLoc(), TII->get(AMDGPU::S_BRANCH))
800           .addMBB(Succ);
801   }
802 
803   return true;
804 }
805 
806 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
807   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
808   TII = ST.getInstrInfo();
809   TRI = &TII->getRegisterInfo();
810 
811   // This doesn't actually need LiveIntervals, but we can preserve them.
812   LIS = getAnalysisIfAvailable<LiveIntervals>();
813   // This doesn't actually need LiveVariables, but we can preserve them.
814   LV = getAnalysisIfAvailable<LiveVariables>();
815   MDT = getAnalysisIfAvailable<MachineDominatorTree>();
816   MRI = &MF.getRegInfo();
817   BoolRC = TRI->getBoolRC();
818 
819   if (ST.isWave32()) {
820     AndOpc = AMDGPU::S_AND_B32;
821     OrOpc = AMDGPU::S_OR_B32;
822     XorOpc = AMDGPU::S_XOR_B32;
823     MovTermOpc = AMDGPU::S_MOV_B32_term;
824     Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
825     XorTermOpc = AMDGPU::S_XOR_B32_term;
826     OrTermOpc = AMDGPU::S_OR_B32_term;
827     OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
828     Exec = AMDGPU::EXEC_LO;
829   } else {
830     AndOpc = AMDGPU::S_AND_B64;
831     OrOpc = AMDGPU::S_OR_B64;
832     XorOpc = AMDGPU::S_XOR_B64;
833     MovTermOpc = AMDGPU::S_MOV_B64_term;
834     Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
835     XorTermOpc = AMDGPU::S_XOR_B64_term;
836     OrTermOpc = AMDGPU::S_OR_B64_term;
837     OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
838     Exec = AMDGPU::EXEC;
839   }
840 
841   // Compute set of blocks with kills
842   const bool CanDemote =
843       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
844   for (auto &MBB : MF) {
845     bool IsKillBlock = false;
846     for (auto &Term : MBB.terminators()) {
847       if (TII->isKillTerminator(Term.getOpcode())) {
848         KillBlocks.insert(&MBB);
849         IsKillBlock = true;
850         break;
851       }
852     }
853     if (CanDemote && !IsKillBlock) {
854       for (auto &MI : MBB) {
855         if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
856           KillBlocks.insert(&MBB);
857           break;
858         }
859       }
860     }
861   }
862 
863   MachineFunction::iterator NextBB;
864   for (MachineFunction::iterator BI = MF.begin();
865        BI != MF.end(); BI = NextBB) {
866     NextBB = std::next(BI);
867     MachineBasicBlock *MBB = &*BI;
868 
869     MachineBasicBlock::iterator I, E, Next;
870     E = MBB->end();
871     for (I = MBB->begin(); I != E; I = Next) {
872       Next = std::next(I);
873       MachineInstr &MI = *I;
874       MachineBasicBlock *SplitMBB = MBB;
875 
876       switch (MI.getOpcode()) {
877       case AMDGPU::SI_IF:
878       case AMDGPU::SI_ELSE:
879       case AMDGPU::SI_IF_BREAK:
880       case AMDGPU::SI_WATERFALL_LOOP:
881       case AMDGPU::SI_LOOP:
882       case AMDGPU::SI_END_CF:
883         SplitMBB = process(MI);
884         break;
885 
886       // FIXME: find a better place for this
887       case AMDGPU::SI_INIT_EXEC:
888       case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
889         lowerInitExec(MBB, MI);
890         if (LIS)
891           LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
892         break;
893 
894       default:
895         break;
896       }
897 
898       if (SplitMBB != MBB) {
899         MBB = Next->getParent();
900         E = MBB->end();
901       }
902     }
903   }
904 
905   optimizeEndCf();
906 
907   LoweredEndCf.clear();
908   LoweredIf.clear();
909   KillBlocks.clear();
910 
911   return true;
912 }
913