xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp (revision 0e8011faf58b743cc652e3b2ad0f7671227610df)
1 //===- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass turns all control flow pseudo instructions into native ones,
11 /// computing their addresses on the fly; it also sets STACK_SIZE info.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "MCTargetDesc/R600MCTargetDesc.h"
16 #include "R600.h"
17 #include "R600MachineFunctionInfo.h"
18 #include "R600Subtarget.h"
19 #include "llvm/CodeGen/MachineFunctionPass.h"
20 #include <set>
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "r600cf"
25 
26 namespace {
27 
28 struct CFStack {
29   enum StackItem {
30     ENTRY = 0,
31     SUB_ENTRY = 1,
32     FIRST_NON_WQM_PUSH = 2,
33     FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
34   };
35 
36   const R600Subtarget *ST;
37   std::vector<StackItem> BranchStack;
38   std::vector<StackItem> LoopStack;
39   unsigned MaxStackSize;
40   unsigned CurrentEntries = 0;
41   unsigned CurrentSubEntries = 0;
42 
43   CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
44       // We need to reserve a stack entry for CALL_FS in vertex shaders.
45       MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0) {}
46 
47   unsigned getLoopDepth();
48   bool branchStackContains(CFStack::StackItem);
49   bool requiresWorkAroundForInst(unsigned Opcode);
50   unsigned getSubEntrySize(CFStack::StackItem Item);
51   void updateMaxStackSize();
52   void pushBranch(unsigned Opcode, bool isWQM = false);
53   void pushLoop();
54   void popBranch();
55   void popLoop();
56 };
57 
58 unsigned CFStack::getLoopDepth() {
59   return LoopStack.size();
60 }
61 
62 bool CFStack::branchStackContains(CFStack::StackItem Item) {
63   return llvm::is_contained(BranchStack, Item);
64 }
65 
66 bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
67   if (Opcode == R600::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
68       getLoopDepth() > 1)
69     return true;
70 
71   if (!ST->hasCFAluBug())
72     return false;
73 
74   switch(Opcode) {
75   default: return false;
76   case R600::CF_ALU_PUSH_BEFORE:
77   case R600::CF_ALU_ELSE_AFTER:
78   case R600::CF_ALU_BREAK:
79   case R600::CF_ALU_CONTINUE:
80     if (CurrentSubEntries == 0)
81       return false;
82     if (ST->getWavefrontSize() == 64) {
83       // We are being conservative here.  We only require this work-around if
84       // CurrentSubEntries > 3 &&
85       // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
86       //
87       // We have to be conservative, because we don't know for certain that
88       // our stack allocation algorithm for Evergreen/NI is correct.  Applying this
89       // work-around when CurrentSubEntries > 3 allows us to over-allocate stack
90       // resources without any problems.
91       return CurrentSubEntries > 3;
92     }
93     assert(ST->getWavefrontSize() == 32);
94     // We are being conservative here.  We only require the work-around if
95     // CurrentSubEntries > 7 &&
96     // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
97     // See the comment on the wavefront size == 64 case for why we are
98     // being conservative.
99     return CurrentSubEntries > 7;
100   }
101 }
102 
103 unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
104   switch(Item) {
105   default:
106     return 0;
107   case CFStack::FIRST_NON_WQM_PUSH:
108     assert(!ST->hasCaymanISA());
109     if (ST->getGeneration() <= AMDGPUSubtarget::R700) {
110       // +1 For the push operation.
111       // +2 Extra space required.
112       return 3;
113     }
114     // Some documentation says that this is not necessary on Evergreen,
115     // but experimentation has show that we need to allocate 1 extra
116     // sub-entry for the first non-WQM push.
117     // +1 For the push operation.
118     // +1 Extra space required.
119     return 2;
120   case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
121     assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
122     // +1 For the push operation.
123     // +1 Extra space required.
124     return 2;
125   case CFStack::SUB_ENTRY:
126     return 1;
127   }
128 }
129 
130 void CFStack::updateMaxStackSize() {
131   unsigned CurrentStackSize = CurrentEntries + divideCeil(CurrentSubEntries, 4);
132   MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
133 }
134 
135 void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
136   CFStack::StackItem Item = CFStack::ENTRY;
137   switch(Opcode) {
138   case R600::CF_PUSH_EG:
139   case R600::CF_ALU_PUSH_BEFORE:
140     if (!isWQM) {
141       if (!ST->hasCaymanISA() &&
142           !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
143         Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on Evergreen/NI
144                                              // See comment in
145                                              // CFStack::getSubEntrySize()
146       else if (CurrentEntries > 0 &&
147                ST->getGeneration() > AMDGPUSubtarget::EVERGREEN &&
148                !ST->hasCaymanISA() &&
149                !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
150         Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
151       else
152         Item = CFStack::SUB_ENTRY;
153     } else
154       Item = CFStack::ENTRY;
155     break;
156   }
157   BranchStack.push_back(Item);
158   if (Item == CFStack::ENTRY)
159     CurrentEntries++;
160   else
161     CurrentSubEntries += getSubEntrySize(Item);
162   updateMaxStackSize();
163 }
164 
165 void CFStack::pushLoop() {
166   LoopStack.push_back(CFStack::ENTRY);
167   CurrentEntries++;
168   updateMaxStackSize();
169 }
170 
171 void CFStack::popBranch() {
172   CFStack::StackItem Top = BranchStack.back();
173   if (Top == CFStack::ENTRY)
174     CurrentEntries--;
175   else
176     CurrentSubEntries-= getSubEntrySize(Top);
177   BranchStack.pop_back();
178 }
179 
180 void CFStack::popLoop() {
181   CurrentEntries--;
182   LoopStack.pop_back();
183 }
184 
185 class R600ControlFlowFinalizer : public MachineFunctionPass {
186 private:
187   using ClauseFile = std::pair<MachineInstr *, std::vector<MachineInstr *>>;
188 
189   enum ControlFlowInstruction {
190     CF_TC,
191     CF_VC,
192     CF_CALL_FS,
193     CF_WHILE_LOOP,
194     CF_END_LOOP,
195     CF_LOOP_BREAK,
196     CF_LOOP_CONTINUE,
197     CF_JUMP,
198     CF_ELSE,
199     CF_POP,
200     CF_END
201   };
202 
203   const R600InstrInfo *TII = nullptr;
204   const R600RegisterInfo *TRI = nullptr;
205   unsigned MaxFetchInst;
206   const R600Subtarget *ST = nullptr;
207 
208   bool IsTrivialInst(MachineInstr &MI) const {
209     switch (MI.getOpcode()) {
210     case R600::KILL:
211     case R600::RETURN:
212       return true;
213     default:
214       return false;
215     }
216   }
217 
218   const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
219     unsigned Opcode = 0;
220     bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
221     switch (CFI) {
222     case CF_TC:
223       Opcode = isEg ? R600::CF_TC_EG : R600::CF_TC_R600;
224       break;
225     case CF_VC:
226       Opcode = isEg ? R600::CF_VC_EG : R600::CF_VC_R600;
227       break;
228     case CF_CALL_FS:
229       Opcode = isEg ? R600::CF_CALL_FS_EG : R600::CF_CALL_FS_R600;
230       break;
231     case CF_WHILE_LOOP:
232       Opcode = isEg ? R600::WHILE_LOOP_EG : R600::WHILE_LOOP_R600;
233       break;
234     case CF_END_LOOP:
235       Opcode = isEg ? R600::END_LOOP_EG : R600::END_LOOP_R600;
236       break;
237     case CF_LOOP_BREAK:
238       Opcode = isEg ? R600::LOOP_BREAK_EG : R600::LOOP_BREAK_R600;
239       break;
240     case CF_LOOP_CONTINUE:
241       Opcode = isEg ? R600::CF_CONTINUE_EG : R600::CF_CONTINUE_R600;
242       break;
243     case CF_JUMP:
244       Opcode = isEg ? R600::CF_JUMP_EG : R600::CF_JUMP_R600;
245       break;
246     case CF_ELSE:
247       Opcode = isEg ? R600::CF_ELSE_EG : R600::CF_ELSE_R600;
248       break;
249     case CF_POP:
250       Opcode = isEg ? R600::POP_EG : R600::POP_R600;
251       break;
252     case CF_END:
253       if (ST->hasCaymanISA()) {
254         Opcode = R600::CF_END_CM;
255         break;
256       }
257       Opcode = isEg ? R600::CF_END_EG : R600::CF_END_R600;
258       break;
259     }
260     assert (Opcode && "No opcode selected");
261     return TII->get(Opcode);
262   }
263 
264   bool isCompatibleWithClause(const MachineInstr &MI,
265                               std::set<unsigned> &DstRegs) const {
266     unsigned DstMI, SrcMI;
267     for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
268                                           E = MI.operands_end();
269          I != E; ++I) {
270       const MachineOperand &MO = *I;
271       if (!MO.isReg())
272         continue;
273       if (MO.isDef()) {
274         Register Reg = MO.getReg();
275         if (R600::R600_Reg128RegClass.contains(Reg))
276           DstMI = Reg;
277         else
278           DstMI = TRI->getMatchingSuperReg(Reg,
279               R600RegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
280               &R600::R600_Reg128RegClass);
281       }
282       if (MO.isUse()) {
283         Register Reg = MO.getReg();
284         if (R600::R600_Reg128RegClass.contains(Reg))
285           SrcMI = Reg;
286         else
287           SrcMI = TRI->getMatchingSuperReg(Reg,
288               R600RegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
289               &R600::R600_Reg128RegClass);
290       }
291     }
292     if ((DstRegs.find(SrcMI) == DstRegs.end())) {
293       DstRegs.insert(DstMI);
294       return true;
295     }
296     return false;
297   }
298 
299   ClauseFile
300   MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
301       const {
302     MachineBasicBlock::iterator ClauseHead = I;
303     std::vector<MachineInstr *> ClauseContent;
304     unsigned AluInstCount = 0;
305     bool IsTex = TII->usesTextureCache(*ClauseHead);
306     std::set<unsigned> DstRegs;
307     for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
308       if (IsTrivialInst(*I))
309         continue;
310       if (AluInstCount >= MaxFetchInst)
311         break;
312       if ((IsTex && !TII->usesTextureCache(*I)) ||
313           (!IsTex && !TII->usesVertexCache(*I)))
314         break;
315       if (!isCompatibleWithClause(*I, DstRegs))
316         break;
317       AluInstCount ++;
318       ClauseContent.push_back(&*I);
319     }
320     MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
321         getHWInstrDesc(IsTex?CF_TC:CF_VC))
322         .addImm(0) // ADDR
323         .addImm(AluInstCount - 1); // COUNT
324     return ClauseFile(MIb, std::move(ClauseContent));
325   }
326 
327   void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
328     static const unsigned LiteralRegs[] = {
329       R600::ALU_LITERAL_X,
330       R600::ALU_LITERAL_Y,
331       R600::ALU_LITERAL_Z,
332       R600::ALU_LITERAL_W
333     };
334     const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
335         TII->getSrcs(MI);
336     for (const auto &Src:Srcs) {
337       if (Src.first->getReg() != R600::ALU_LITERAL_X)
338         continue;
339       int64_t Imm = Src.second;
340       std::vector<MachineOperand *>::iterator It =
341           llvm::find_if(Lits, [&](MachineOperand *val) {
342             return val->isImm() && (val->getImm() == Imm);
343           });
344 
345       // Get corresponding Operand
346       MachineOperand &Operand = MI.getOperand(
347           TII->getOperandIdx(MI.getOpcode(), R600::OpName::literal));
348 
349       if (It != Lits.end()) {
350         // Reuse existing literal reg
351         unsigned Index = It - Lits.begin();
352         Src.first->setReg(LiteralRegs[Index]);
353       } else {
354         // Allocate new literal reg
355         assert(Lits.size() < 4 && "Too many literals in Instruction Group");
356         Src.first->setReg(LiteralRegs[Lits.size()]);
357         Lits.push_back(&Operand);
358       }
359     }
360   }
361 
362   MachineBasicBlock::iterator insertLiterals(
363       MachineBasicBlock::iterator InsertPos,
364       const std::vector<unsigned> &Literals) const {
365     MachineBasicBlock *MBB = InsertPos->getParent();
366     for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
367       unsigned LiteralPair0 = Literals[i];
368       unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0;
369       InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
370           TII->get(R600::LITERALS))
371           .addImm(LiteralPair0)
372           .addImm(LiteralPair1);
373     }
374     return InsertPos;
375   }
376 
377   ClauseFile
378   MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
379       const {
380     MachineInstr &ClauseHead = *I;
381     std::vector<MachineInstr *> ClauseContent;
382     I++;
383     for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
384       if (IsTrivialInst(*I)) {
385         ++I;
386         continue;
387       }
388       if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
389         break;
390       std::vector<MachineOperand *>Literals;
391       if (I->isBundle()) {
392         MachineInstr &DeleteMI = *I;
393         MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
394         while (++BI != E && BI->isBundledWithPred()) {
395           BI->unbundleFromPred();
396           for (MachineOperand &MO : BI->operands()) {
397             if (MO.isReg() && MO.isInternalRead())
398               MO.setIsInternalRead(false);
399           }
400           getLiteral(*BI, Literals);
401           ClauseContent.push_back(&*BI);
402         }
403         I = BI;
404         DeleteMI.eraseFromParent();
405       } else {
406         getLiteral(*I, Literals);
407         ClauseContent.push_back(&*I);
408         I++;
409       }
410       for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
411         MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
412             TII->get(R600::LITERALS));
413         if (Literals[i]->isImm()) {
414             MILit.addImm(Literals[i]->getImm());
415         } else {
416             MILit.addGlobalAddress(Literals[i]->getGlobal(),
417                                    Literals[i]->getOffset());
418         }
419         if (i + 1 < e) {
420           if (Literals[i + 1]->isImm()) {
421             MILit.addImm(Literals[i + 1]->getImm());
422           } else {
423             MILit.addGlobalAddress(Literals[i + 1]->getGlobal(),
424                                    Literals[i + 1]->getOffset());
425           }
426         } else
427           MILit.addImm(0);
428         ClauseContent.push_back(MILit);
429       }
430     }
431     assert(ClauseContent.size() < 128 && "ALU clause is too big");
432     ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1);
433     return ClauseFile(&ClauseHead, std::move(ClauseContent));
434   }
435 
436   void EmitFetchClause(MachineBasicBlock::iterator InsertPos,
437                        const DebugLoc &DL, ClauseFile &Clause,
438                        unsigned &CfCount) {
439     CounterPropagateAddr(*Clause.first, CfCount);
440     MachineBasicBlock *BB = Clause.first->getParent();
441     BuildMI(BB, DL, TII->get(R600::FETCH_CLAUSE)).addImm(CfCount);
442     for (MachineInstr *MI : Clause.second)
443       BB->splice(InsertPos, BB, MI);
444     CfCount += 2 * Clause.second.size();
445   }
446 
447   void EmitALUClause(MachineBasicBlock::iterator InsertPos, const DebugLoc &DL,
448                      ClauseFile &Clause, unsigned &CfCount) {
449     Clause.first->getOperand(0).setImm(0);
450     CounterPropagateAddr(*Clause.first, CfCount);
451     MachineBasicBlock *BB = Clause.first->getParent();
452     BuildMI(BB, DL, TII->get(R600::ALU_CLAUSE)).addImm(CfCount);
453     for (MachineInstr *MI : Clause.second)
454       BB->splice(InsertPos, BB, MI);
455     CfCount += Clause.second.size();
456   }
457 
458   void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const {
459     MI.getOperand(0).setImm(Addr + MI.getOperand(0).getImm());
460   }
461   void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
462                             unsigned Addr) const {
463     for (MachineInstr *MI : MIs) {
464       CounterPropagateAddr(*MI, Addr);
465     }
466   }
467 
468 public:
469   static char ID;
470 
471   R600ControlFlowFinalizer() : MachineFunctionPass(ID) {}
472 
473   bool runOnMachineFunction(MachineFunction &MF) override {
474     ST = &MF.getSubtarget<R600Subtarget>();
475     MaxFetchInst = ST->getTexVTXClauseSize();
476     TII = ST->getInstrInfo();
477     TRI = ST->getRegisterInfo();
478 
479     R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
480 
481     CFStack CFStack(ST, MF.getFunction().getCallingConv());
482     for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
483         ++MB) {
484       MachineBasicBlock &MBB = *MB;
485       unsigned CfCount = 0;
486       std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack;
487       std::vector<MachineInstr * > IfThenElseStack;
488       if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_VS) {
489         BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
490             getHWInstrDesc(CF_CALL_FS));
491         CfCount++;
492       }
493       std::vector<ClauseFile> FetchClauses, AluClauses;
494       std::vector<MachineInstr *> LastAlu(1);
495       std::vector<MachineInstr *> ToPopAfter;
496 
497       for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
498           I != E;) {
499         if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
500           LLVM_DEBUG(dbgs() << CfCount << ":"; I->dump(););
501           FetchClauses.push_back(MakeFetchClause(MBB, I));
502           CfCount++;
503           LastAlu.back() = nullptr;
504           continue;
505         }
506 
507         MachineBasicBlock::iterator MI = I;
508         if (MI->getOpcode() != R600::ENDIF)
509           LastAlu.back() = nullptr;
510         if (MI->getOpcode() == R600::CF_ALU)
511           LastAlu.back() = &*MI;
512         I++;
513         bool RequiresWorkAround =
514             CFStack.requiresWorkAroundForInst(MI->getOpcode());
515         switch (MI->getOpcode()) {
516         case R600::CF_ALU_PUSH_BEFORE:
517           if (RequiresWorkAround) {
518             LLVM_DEBUG(dbgs()
519                        << "Applying bug work-around for ALU_PUSH_BEFORE\n");
520             BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(R600::CF_PUSH_EG))
521                 .addImm(CfCount + 1)
522                 .addImm(1);
523             MI->setDesc(TII->get(R600::CF_ALU));
524             CfCount++;
525             CFStack.pushBranch(R600::CF_PUSH_EG);
526           } else
527             CFStack.pushBranch(R600::CF_ALU_PUSH_BEFORE);
528           [[fallthrough]];
529         case R600::CF_ALU:
530           I = MI;
531           AluClauses.push_back(MakeALUClause(MBB, I));
532           LLVM_DEBUG(dbgs() << CfCount << ":"; MI->dump(););
533           CfCount++;
534           break;
535         case R600::WHILELOOP: {
536           CFStack.pushLoop();
537           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
538               getHWInstrDesc(CF_WHILE_LOOP))
539               .addImm(1);
540           std::pair<unsigned, std::set<MachineInstr *>> Pair(CfCount,
541               std::set<MachineInstr *>());
542           Pair.second.insert(MIb);
543           LoopStack.push_back(std::move(Pair));
544           MI->eraseFromParent();
545           CfCount++;
546           break;
547         }
548         case R600::ENDLOOP: {
549           CFStack.popLoop();
550           std::pair<unsigned, std::set<MachineInstr *>> Pair =
551               std::move(LoopStack.back());
552           LoopStack.pop_back();
553           CounterPropagateAddr(Pair.second, CfCount);
554           BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
555               .addImm(Pair.first + 1);
556           MI->eraseFromParent();
557           CfCount++;
558           break;
559         }
560         case R600::IF_PREDICATE_SET: {
561           LastAlu.push_back(nullptr);
562           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
563               getHWInstrDesc(CF_JUMP))
564               .addImm(0)
565               .addImm(0);
566           IfThenElseStack.push_back(MIb);
567           LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
568           MI->eraseFromParent();
569           CfCount++;
570           break;
571         }
572         case R600::ELSE: {
573           MachineInstr * JumpInst = IfThenElseStack.back();
574           IfThenElseStack.pop_back();
575           CounterPropagateAddr(*JumpInst, CfCount);
576           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
577               getHWInstrDesc(CF_ELSE))
578               .addImm(0)
579               .addImm(0);
580           LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
581           IfThenElseStack.push_back(MIb);
582           MI->eraseFromParent();
583           CfCount++;
584           break;
585         }
586         case R600::ENDIF: {
587           CFStack.popBranch();
588           if (LastAlu.back()) {
589             ToPopAfter.push_back(LastAlu.back());
590           } else {
591             MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
592                 getHWInstrDesc(CF_POP))
593                 .addImm(CfCount + 1)
594                 .addImm(1);
595             (void)MIb;
596             LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
597             CfCount++;
598           }
599 
600           MachineInstr *IfOrElseInst = IfThenElseStack.back();
601           IfThenElseStack.pop_back();
602           CounterPropagateAddr(*IfOrElseInst, CfCount);
603           IfOrElseInst->getOperand(1).setImm(1);
604           LastAlu.pop_back();
605           MI->eraseFromParent();
606           break;
607         }
608         case R600::BREAK: {
609           CfCount ++;
610           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
611               getHWInstrDesc(CF_LOOP_BREAK))
612               .addImm(0);
613           LoopStack.back().second.insert(MIb);
614           MI->eraseFromParent();
615           break;
616         }
617         case R600::CONTINUE: {
618           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
619               getHWInstrDesc(CF_LOOP_CONTINUE))
620               .addImm(0);
621           LoopStack.back().second.insert(MIb);
622           MI->eraseFromParent();
623           CfCount++;
624           break;
625         }
626         case R600::RETURN: {
627           DebugLoc DL = MBB.findDebugLoc(MI);
628           BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
629           CfCount++;
630           if (CfCount % 2) {
631             BuildMI(MBB, I, DL, TII->get(R600::PAD));
632             CfCount++;
633           }
634           MI->eraseFromParent();
635           for (ClauseFile &CF : FetchClauses)
636             EmitFetchClause(I, DL, CF, CfCount);
637           for (ClauseFile &CF : AluClauses)
638             EmitALUClause(I, DL, CF, CfCount);
639           break;
640         }
641         default:
642           if (TII->isExport(MI->getOpcode())) {
643             LLVM_DEBUG(dbgs() << CfCount << ":"; MI->dump(););
644             CfCount++;
645           }
646           break;
647         }
648       }
649       for (MachineInstr *Alu : ToPopAfter) {
650         BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
651             TII->get(R600::CF_ALU_POP_AFTER))
652             .addImm(Alu->getOperand(0).getImm())
653             .addImm(Alu->getOperand(1).getImm())
654             .addImm(Alu->getOperand(2).getImm())
655             .addImm(Alu->getOperand(3).getImm())
656             .addImm(Alu->getOperand(4).getImm())
657             .addImm(Alu->getOperand(5).getImm())
658             .addImm(Alu->getOperand(6).getImm())
659             .addImm(Alu->getOperand(7).getImm())
660             .addImm(Alu->getOperand(8).getImm());
661         Alu->eraseFromParent();
662       }
663       MFI->CFStackSize = CFStack.MaxStackSize;
664     }
665 
666     return false;
667   }
668 
669   StringRef getPassName() const override {
670     return "R600 Control Flow Finalizer Pass";
671   }
672 };
673 
674 } // end anonymous namespace
675 
676 INITIALIZE_PASS_BEGIN(R600ControlFlowFinalizer, DEBUG_TYPE,
677                      "R600 Control Flow Finalizer", false, false)
678 INITIALIZE_PASS_END(R600ControlFlowFinalizer, DEBUG_TYPE,
679                     "R600 Control Flow Finalizer", false, false)
680 
681 char R600ControlFlowFinalizer::ID = 0;
682 
683 char &llvm::R600ControlFlowFinalizerID = R600ControlFlowFinalizer::ID;
684 
685 FunctionPass *llvm::createR600ControlFlowFinalizer() {
686   return new R600ControlFlowFinalizer();
687 }
688