xref: /freebsd/contrib/llvm-project/llvm/lib/CodeGen/ModuloSchedule.cpp (revision 770cf0a5f02dc8983a89c6568d741fbc25baa999)
1 //===- ModuloSchedule.cpp - Software pipeline schedule expansion ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
#include "llvm/CodeGen/ModuloSchedule.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
21 
22 #define DEBUG_TYPE "pipeliner"
23 using namespace llvm;
24 
// Hidden command-line knob for the MVE (modulo variable expansion) expander:
// requests swapping the target blocks of a conditional branch. Off by
// default; the use site is elsewhere in this file — confirm there for the
// exact effect.
static cl::opt<bool> SwapBranchTargetsMVE(
    "pipeliner-swap-branch-targets-mve", cl::Hidden, cl::init(false),
    cl::desc("Swap target blocks of a conditional branch for MVE expander"));
28 
29 void ModuloSchedule::print(raw_ostream &OS) {
30   for (MachineInstr *MI : ScheduledInstrs)
31     OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI;
32 }
33 
34 //===----------------------------------------------------------------------===//
35 // ModuloScheduleExpander implementation
36 //===----------------------------------------------------------------------===//
37 
38 /// Return the register values for  the operands of a Phi instruction.
39 /// This function assume the instruction is a Phi.
40 static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
41                        Register &InitVal, Register &LoopVal) {
42   assert(Phi.isPHI() && "Expecting a Phi.");
43 
44   InitVal = Register();
45   LoopVal = Register();
46   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
47     if (Phi.getOperand(i + 1).getMBB() != Loop)
48       InitVal = Phi.getOperand(i).getReg();
49     else
50       LoopVal = Phi.getOperand(i).getReg();
51 
52   assert(InitVal && LoopVal && "Unexpected Phi structure.");
53 }
54 
55 /// Return the Phi register value that comes from the incoming block.
56 static Register getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
57   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
58     if (Phi.getOperand(i + 1).getMBB() != LoopBB)
59       return Phi.getOperand(i).getReg();
60   return Register();
61 }
62 
63 /// Return the Phi register value that comes the loop block.
64 static Register getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
65   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
66     if (Phi.getOperand(i + 1).getMBB() == LoopBB)
67       return Phi.getOperand(i).getReg();
68   return Register();
69 }
70 
/// Drive the expansion: locate the loop block and its preheader, precompute
/// the per-register (max stage difference, phi-swapped) info used during
/// renaming, then emit the pipelined prolog/kernel/epilog code.
void ModuloScheduleExpander::expand() {
  BB = Schedule.getLoop()->getTopBlock();
  // The loop block has (at least) two predecessors: itself via the back edge
  // and the preheader; pick whichever predecessor is not the loop itself.
  Preheader = *BB->pred_begin();
  if (Preheader == BB)
    Preheader = *std::next(BB->pred_begin());

  // Iterate over the definitions in each instruction, and compute the
  // stage difference for each use.  Keep the maximum value.
  for (MachineInstr *MI : Schedule.getInstructions()) {
    int DefStage = Schedule.getStage(MI);
    for (const MachineOperand &Op : MI->all_defs()) {
      Register Reg = Op.getReg();
      unsigned MaxDiff = 0;
      bool PhiIsSwapped = false;
      for (MachineOperand &UseOp : MRI.use_operands(Reg)) {
        MachineInstr *UseMI = UseOp.getParent();
        int UseStage = Schedule.getStage(UseMI);
        unsigned Diff = 0;
        // Unscheduled uses (stage -1) and uses in earlier stages contribute
        // a difference of 0.
        if (UseStage != -1 && UseStage >= DefStage)
          Diff = UseStage - DefStage;
        if (MI->isPHI()) {
          // A loop-carried Phi adds one extra stage of separation; a
          // non-carried Phi is instead recorded as "swapped".
          if (isLoopCarried(*MI))
            ++Diff;
          else
            PhiIsSwapped = true;
        }
        MaxDiff = std::max(Diff, MaxDiff);
      }
      RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
    }
  }

  generatePipelinedLoop();
}
105 
106 void ModuloScheduleExpander::generatePipelinedLoop() {
107   LoopInfo = TII->analyzeLoopForPipelining(BB);
108   assert(LoopInfo && "Must be able to analyze loop!");
109 
110   // Create a new basic block for the kernel and add it to the CFG.
111   MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
112 
113   unsigned MaxStageCount = Schedule.getNumStages() - 1;
114 
115   // Remember the registers that are used in different stages. The index is
116   // the iteration, or stage, that the instruction is scheduled in.  This is
117   // a map between register names in the original block and the names created
118   // in each stage of the pipelined loop.
119   ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
120 
121   // The renaming destination by Phis for the registers across stages.
122   // This map is updated during Phis generation to point to the most recent
123   // renaming destination.
124   ValueMapTy *VRMapPhi = new ValueMapTy[(MaxStageCount + 1) * 2];
125 
126   InstrMapTy InstrMap;
127 
128   SmallVector<MachineBasicBlock *, 4> PrologBBs;
129 
130   // Generate the prolog instructions that set up the pipeline.
131   generateProlog(MaxStageCount, KernelBB, VRMap, PrologBBs);
132   MF.insert(BB->getIterator(), KernelBB);
133   LIS.insertMBBInMaps(KernelBB);
134 
135   // Rearrange the instructions to generate the new, pipelined loop,
136   // and update register names as needed.
137   for (MachineInstr *CI : Schedule.getInstructions()) {
138     if (CI->isPHI())
139       continue;
140     unsigned StageNum = Schedule.getStage(CI);
141     MachineInstr *NewMI = cloneInstr(CI, MaxStageCount, StageNum);
142     updateInstruction(NewMI, false, MaxStageCount, StageNum, VRMap);
143     KernelBB->push_back(NewMI);
144     LIS.InsertMachineInstrInMaps(*NewMI);
145     InstrMap[NewMI] = CI;
146   }
147 
148   // Copy any terminator instructions to the new kernel, and update
149   // names as needed.
150   for (MachineInstr &MI : BB->terminators()) {
151     MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
152     updateInstruction(NewMI, false, MaxStageCount, 0, VRMap);
153     KernelBB->push_back(NewMI);
154     LIS.InsertMachineInstrInMaps(*NewMI);
155     InstrMap[NewMI] = &MI;
156   }
157 
158   NewKernel = KernelBB;
159   KernelBB->transferSuccessors(BB);
160   KernelBB->replaceSuccessor(BB, KernelBB);
161 
162   generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap,
163                        InstrMap, MaxStageCount, MaxStageCount, false);
164   generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap, VRMapPhi,
165                InstrMap, MaxStageCount, MaxStageCount, false);
166 
167   LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
168 
169   SmallVector<MachineBasicBlock *, 4> EpilogBBs;
170   // Generate the epilog instructions to complete the pipeline.
171   generateEpilog(MaxStageCount, KernelBB, BB, VRMap, VRMapPhi, EpilogBBs,
172                  PrologBBs);
173 
174   // We need this step because the register allocation doesn't handle some
175   // situations well, so we insert copies to help out.
176   splitLifetimes(KernelBB, EpilogBBs);
177 
178   // Remove dead instructions due to loop induction variables.
179   removeDeadInstructions(KernelBB, EpilogBBs);
180 
181   // Add branches between prolog and epilog blocks.
182   addBranches(*Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap);
183 
184   delete[] VRMap;
185   delete[] VRMapPhi;
186 }
187 
188 void ModuloScheduleExpander::cleanup() {
189   // Remove the original loop since it's no longer referenced.
190   for (auto &I : *BB)
191     LIS.RemoveMachineInstrFromMaps(I);
192   BB->clear();
193   BB->eraseFromParent();
194 }
195 
/// Generate the pipeline prolog code.
/// One prolog block is created per stage except the last (which lives in the
/// kernel); prolog block i receives clones of all non-Phi instructions
/// scheduled in stages 0..i, and VRMap records the renamed registers per
/// stage.
void ModuloScheduleExpander::generateProlog(unsigned LastStage,
                                            MachineBasicBlock *KernelBB,
                                            ValueMapTy *VRMap,
                                            MBBVectorTy &PrologBBs) {
  MachineBasicBlock *PredBB = Preheader;
  InstrMapTy InstrMap;

  // Generate a basic block for each stage, not including the last stage,
  // which will be generated in the kernel. Each basic block may contain
  // instructions from multiple stages/iterations.
  for (unsigned i = 0; i < LastStage; ++i) {
    // Create and insert the prolog basic block prior to the original loop
    // basic block.  The original loop is removed later.
    MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
    PrologBBs.push_back(NewBB);
    MF.insert(BB->getIterator(), NewBB);
    // Chain the new block after the previous prolog block (or preheader).
    NewBB->transferSuccessors(PredBB);
    PredBB->addSuccessor(NewBB);
    PredBB = NewBB;
    LIS.insertMBBInMaps(NewBB);

    // Generate instructions for each appropriate stage. Process instructions
    // in original program order.
    for (int StageNum = i; StageNum >= 0; --StageNum) {
      for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
                                       BBE = BB->getFirstTerminator();
           BBI != BBE; ++BBI) {
        if (Schedule.getStage(&*BBI) == StageNum) {
          // Phis are skipped here; their values are handled by
          // rewritePhiValues below.
          if (BBI->isPHI())
            continue;
          MachineInstr *NewMI =
              cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum);
          updateInstruction(NewMI, false, i, (unsigned)StageNum, VRMap);
          NewBB->push_back(NewMI);
          LIS.InsertMachineInstrInMaps(*NewMI);
          InstrMap[NewMI] = &*BBI;
        }
      }
    }
    rewritePhiValues(NewBB, i, VRMap, InstrMap);
    LLVM_DEBUG({
      dbgs() << "prolog:\n";
      NewBB->dump();
    });
  }

  // The last prolog block falls through to the kernel, not the original loop.
  PredBB->replaceSuccessor(BB, KernelBB);

  // Check if we need to remove the branch from the preheader to the original
  // loop, and replace it with a branch to the new loop.
  unsigned numBranches = TII->removeBranch(*Preheader);
  if (numBranches) {
    SmallVector<MachineOperand, 0> Cond;
    TII->insertBranch(*Preheader, PrologBBs[0], nullptr, Cond, DebugLoc());
  }
}
253 
/// Generate the pipeline epilog code. The epilog code finishes the iterations
/// that were started in either the prolog or the kernel.  We create a basic
/// block for each stage that needs to complete.
void ModuloScheduleExpander::generateEpilog(
    unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB,
    ValueMapTy *VRMap, ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
    MBBVectorTy &PrologBBs) {
  // We need to change the branch from the kernel to the first epilog block, so
  // this call to analyze branch uses the kernel rather than the original BB.
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
  assert(!checkBranch && "generateEpilog must be able to analyze the branch");
  if (checkBranch)
    return;

  // The loop exit is whichever kernel successor is not the kernel itself
  // (the self back edge).
  MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
  if (*LoopExitI == KernelBB)
    ++LoopExitI;
  assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
  MachineBasicBlock *LoopExitBB = *LoopExitI;

  MachineBasicBlock *PredBB = KernelBB;
  MachineBasicBlock *EpilogStart = LoopExitBB;
  InstrMapTy InstrMap;

  // Generate a basic block for each stage, not including the last stage,
  // which was generated for the kernel.  Each basic block may contain
  // instructions from multiple stages/iterations.
  int EpilogStage = LastStage + 1;
  for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
    MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
    EpilogBBs.push_back(NewBB);
    MF.insert(BB->getIterator(), NewBB);

    // Splice the new block between PredBB and the loop exit.
    PredBB->replaceSuccessor(LoopExitBB, NewBB);
    NewBB->addSuccessor(LoopExitBB);
    LIS.insertMBBInMaps(NewBB);

    // Remember the first epilog block created; the kernel branches to it.
    if (EpilogStart == LoopExitBB)
      EpilogStart = NewBB;

    // Add instructions to the epilog depending on the current block.
    // Process instructions in original program order.
    for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
      for (auto &BBI : *BB) {
        if (BBI.isPHI())
          continue;
        MachineInstr *In = &BBI;
        if ((unsigned)Schedule.getStage(In) == StageNum) {
          // Instructions with memoperands in the epilog are updated with
          // conservative values.
          MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
          updateInstruction(NewMI, i == 1, EpilogStage, 0, VRMap);
          NewBB->push_back(NewMI);
          LIS.InsertMachineInstrInMaps(*NewMI);
          InstrMap[NewMI] = In;
        }
      }
    }
    generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap,
                         InstrMap, LastStage, EpilogStage, i == 1);
    generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap, VRMapPhi,
                 InstrMap, LastStage, EpilogStage, i == 1);
    PredBB = NewBB;

    LLVM_DEBUG({
      dbgs() << "epilog:\n";
      NewBB->dump();
    });
  }

  // Fix any Phi nodes in the loop exit block.
  LoopExitBB->replacePhiUsesWith(BB, PredBB);

  // Create a branch to the new epilog from the kernel.
  // Remove the original branch and add a new branch to the epilog.
  TII->removeBranch(*KernelBB);
  assert((OrigBB == TBB || OrigBB == FBB) &&
         "Unable to determine looping branch direction");
  // Preserve the original branch polarity: the kernel keeps looping on the
  // same condition and falls out to the epilog on the other edge.
  if (OrigBB != TBB)
    TII->insertBranch(*KernelBB, EpilogStart, KernelBB, Cond, DebugLoc());
  else
    TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
  // Add a branch to the loop exit.
  if (EpilogBBs.size() > 0) {
    MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
    SmallVector<MachineOperand, 4> Cond1;
    TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
  }
}
345 
346 /// Replace all uses of FromReg that appear outside the specified
347 /// basic block with ToReg.
348 static void replaceRegUsesAfterLoop(Register FromReg, Register ToReg,
349                                     MachineBasicBlock *MBB,
350                                     MachineRegisterInfo &MRI) {
351   for (MachineOperand &O :
352        llvm::make_early_inc_range(MRI.use_operands(FromReg)))
353     if (O.getParent()->getParent() != MBB)
354       O.setReg(ToReg);
355 }
356 
357 /// Return true if the register has a use that occurs outside the
358 /// specified loop.
359 static bool hasUseAfterLoop(Register Reg, MachineBasicBlock *BB,
360                             MachineRegisterInfo &MRI) {
361   for (const MachineOperand &MO : MRI.use_operands(Reg))
362     if (MO.getParent()->getParent() != BB)
363       return true;
364   return false;
365 }
366 
/// Generate Phis for the specific block in the generated pipelined code.
/// This function looks at the Phis from the original code to guide the
/// creation of new Phis.
///
/// NewBB receives the Phis; BB1/BB2 supply the first/second incoming blocks
/// of each new Phi. VRMap holds the per-stage register renamings. When
/// CurStageNum == LastStageNum the kernel's Phis are generated, otherwise an
/// epilog block's. IsLast marks the final epilog block, where uses after the
/// loop must also be rewritten.
void ModuloScheduleExpander::generateExistingPhis(
    MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
    MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap,
    unsigned LastStageNum, unsigned CurStageNum, bool IsLast) {
  // Compute the stage number for the initial value of the Phi, which
  // comes from the prolog. The prolog to use depends on to which kernel/
  // epilog that we're adding the Phi.
  unsigned PrologStage = 0;
  unsigned PrevStage = 0;
  bool InKernel = (LastStageNum == CurStageNum);
  if (InKernel) {
    PrologStage = LastStageNum - 1;
    PrevStage = CurStageNum;
  } else {
    PrologStage = LastStageNum - (CurStageNum - LastStageNum);
    PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
  }

  // Walk only the Phi instructions of the original loop block.
  for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
                                   BBE = BB->getFirstNonPHI();
       BBI != BBE; ++BBI) {
    Register Def = BBI->getOperand(0).getReg();

    Register InitVal;
    Register LoopVal;
    getPhiRegs(*BBI, BB, InitVal, LoopVal);

    Register PhiOp1;
    // The Phi value from the loop body typically is defined in the loop, but
    // not always. So, we need to check if the value is defined in the loop.
    Register PhiOp2 = LoopVal;
    if (auto It = VRMap[LastStageNum].find(LoopVal);
        It != VRMap[LastStageNum].end())
      PhiOp2 = It->second;

    int StageScheduled = Schedule.getStage(&*BBI);
    int LoopValStage = Schedule.getStage(MRI.getVRegDef(LoopVal));
    unsigned NumStages = getStagesForReg(Def, CurStageNum);
    if (NumStages == 0) {
      // We don't need to generate a Phi anymore, but we need to rename any uses
      // of the Phi value.
      Register NewReg = VRMap[PrevStage][LoopVal];
      rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, 0, &*BBI, Def,
                            InitVal, NewReg);
      auto It = VRMap[CurStageNum].find(LoopVal);
      if (It != VRMap[CurStageNum].end()) {
        Register Reg = It->second;
        VRMap[CurStageNum][Def] = Reg;
      }
    }
    // Adjust the number of Phis needed depending on the number of prologs left,
    // and the distance from where the Phi is first scheduled. The number of
    // Phis cannot exceed the number of prolog stages. Each stage can
    // potentially define two values.
    unsigned MaxPhis = PrologStage + 2;
    if (!InKernel && (int)PrologStage <= LoopValStage)
      MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
    unsigned NumPhis = std::min(NumStages, MaxPhis);

    Register NewReg;
    unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
    // In the epilog, we may need to look back one stage to get the correct
    // Phi name, because the epilog and prolog blocks execute the same stage.
    // The correct name is from the previous block only when the Phi has
    // been completely scheduled prior to the epilog, and Phi value is not
    // needed in multiple stages.
    int StageDiff = 0;
    if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
        NumPhis == 1)
      StageDiff = 1;
    // Adjust the computations below when the phi and the loop definition
    // are scheduled in different stages.
    if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
      StageDiff = StageScheduled - LoopValStage;
    for (unsigned np = 0; np < NumPhis; ++np) {
      // If the Phi hasn't been scheduled, then use the initial Phi operand
      // value. Otherwise, use the scheduled version of the instruction. This
      // is a little complicated when a Phi references another Phi.
      if (np > PrologStage || StageScheduled >= (int)LastStageNum)
        PhiOp1 = InitVal;
      // Check if the Phi has already been scheduled in a prolog stage.
      else if (PrologStage >= AccessStage + StageDiff + np &&
               VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
        PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
      // Check if the Phi has already been scheduled, but the loop instruction
      // is either another Phi, or doesn't occur in the loop.
      else if (PrologStage >= AccessStage + StageDiff + np) {
        // If the Phi references another Phi, we need to examine the other
        // Phi to get the correct value.
        PhiOp1 = LoopVal;
        MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
        int Indirects = 1;
        // Follow the chain of Phis-referencing-Phis inside the loop block.
        while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
          int PhiStage = Schedule.getStage(InstOp1);
          if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
            PhiOp1 = getInitPhiReg(*InstOp1, BB);
          else
            PhiOp1 = getLoopPhiReg(*InstOp1, BB);
          InstOp1 = MRI.getVRegDef(PhiOp1);
          int PhiOpStage = Schedule.getStage(InstOp1);
          int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
          if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np) {
            auto &M = VRMap[PrologStage - StageAdj - Indirects - np];
            if (auto It = M.find(PhiOp1); It != M.end()) {
              PhiOp1 = It->second;
              break;
            }
          }
          ++Indirects;
        }
      } else
        PhiOp1 = InitVal;
      // If this references a generated Phi in the kernel, get the Phi operand
      // from the incoming block.
      if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
        if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
          PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);

      MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
      bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
      // In the epilog, a map lookup is needed to get the value from the
      // kernel, or previous epilog block. How this is done depends on
      // whether the instruction is scheduled in the previous block.
      if (!InKernel) {
        int StageDiffAdj = 0;
        if (LoopValStage != -1 && StageScheduled > LoopValStage)
          StageDiffAdj = StageScheduled - LoopValStage;
        // Use the loop value defined in the kernel, unless the kernel
        // contains the last definition of the Phi.
        if (np == 0 && PrevStage == LastStageNum &&
            (StageScheduled != 0 || LoopValStage != 0) &&
            VRMap[PrevStage - StageDiffAdj].count(LoopVal))
          PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
        // Use the value defined by the Phi. We add one because we switch
        // from looking at the loop value to the Phi definition.
        else if (np > 0 && PrevStage == LastStageNum &&
                 VRMap[PrevStage - np + 1].count(Def))
          PhiOp2 = VRMap[PrevStage - np + 1][Def];
        // Use the loop value defined in the kernel.
        else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
                 VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
          PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
        // Use the value defined by the Phi, unless we're generating the first
        // epilog and the Phi refers to a Phi in a different stage.
        else if (VRMap[PrevStage - np].count(Def) &&
                 (!LoopDefIsPhi || (PrevStage != LastStageNum) ||
                  (LoopValStage == StageScheduled)))
          PhiOp2 = VRMap[PrevStage - np][Def];
      }

      // Check if we can reuse an existing Phi. This occurs when a Phi
      // references another Phi, and the other Phi is scheduled in an
      // earlier stage. We can try to reuse an existing Phi up until the last
      // stage of the current Phi.
      if (LoopDefIsPhi) {
        if (static_cast<int>(PrologStage - np) >= StageScheduled) {
          int LVNumStages = getStagesForPhi(LoopVal);
          int StageDiff = (StageScheduled - LoopValStage);
          LVNumStages -= StageDiff;
          // Make sure the loop value Phi has been processed already.
          if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
            NewReg = PhiOp2;
            unsigned ReuseStage = CurStageNum;
            if (isLoopCarried(*PhiInst))
              ReuseStage -= LVNumStages;
            // Check if the Phi to reuse has been generated yet. If not, then
            // there is nothing to reuse.
            if (VRMap[ReuseStage - np].count(LoopVal)) {
              NewReg = VRMap[ReuseStage - np][LoopVal];

              rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI,
                                    Def, NewReg);
              // Update the map with the new Phi name.
              VRMap[CurStageNum - np][Def] = NewReg;
              PhiOp2 = NewReg;
              if (VRMap[LastStageNum - np - 1].count(LoopVal))
                PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];

              if (IsLast && np == NumPhis - 1)
                replaceRegUsesAfterLoop(Def, NewReg, BB, MRI);
              continue;
            }
          }
        }
        if (InKernel && StageDiff > 0 &&
            VRMap[CurStageNum - StageDiff - np].count(LoopVal))
          PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
      }

      const TargetRegisterClass *RC = MRI.getRegClass(Def);
      NewReg = MRI.createVirtualRegister(RC);

      MachineInstrBuilder NewPhi =
          BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
                  TII->get(TargetOpcode::PHI), NewReg);
      NewPhi.addReg(PhiOp1).addMBB(BB1);
      NewPhi.addReg(PhiOp2).addMBB(BB2);
      LIS.InsertMachineInstrInMaps(*NewPhi);
      if (np == 0)
        InstrMap[NewPhi] = &*BBI;

      // We define the Phis after creating the new pipelined code, so
      // we need to rename the Phi values in scheduled instructions.

      Register PrevReg;
      if (InKernel && VRMap[PrevStage - np].count(LoopVal))
        PrevReg = VRMap[PrevStage - np][LoopVal];
      rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
                            NewReg, PrevReg);
      // If the Phi has been scheduled, use the new name for rewriting.
      if (VRMap[CurStageNum - np].count(Def)) {
        Register R = VRMap[CurStageNum - np][Def];
        rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, R,
                              NewReg);
      }

      // Check if we need to rename any uses that occurs after the loop. The
      // register to replace depends on whether the Phi is scheduled in the
      // epilog.
      if (IsLast && np == NumPhis - 1)
        replaceRegUsesAfterLoop(Def, NewReg, BB, MRI);

      // In the kernel, a dependent Phi uses the value from this Phi.
      if (InKernel)
        PhiOp2 = NewReg;

      // Update the map with the new Phi name.
      VRMap[CurStageNum - np][Def] = NewReg;
    }

    // Any remaining stages still referencing the old Phi name are rewritten
    // to the last Phi created above.
    while (NumPhis++ < NumStages) {
      rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, NumPhis, &*BBI, Def,
                            NewReg, 0);
    }

    // Check if we need to rename a Phi that has been eliminated due to
    // scheduling.
    if (NumStages == 0 && IsLast) {
      auto &CurStageMap = VRMap[CurStageNum];
      auto It = CurStageMap.find(LoopVal);
      if (It != CurStageMap.end())
        replaceRegUsesAfterLoop(Def, It->second, BB, MRI);
    }
  }
}
615 
/// Generate Phis for the specified block in the generated pipelined code.
/// These are new Phis needed because the definition is scheduled after the
/// use in the pipelined sequence.
///
/// NewBB receives the Phis; BB1/BB2 supply the first/second incoming blocks.
/// VRMap holds per-stage register renamings and VRMapPhi records the most
/// recent Phi-renamed destination per stage. IsLast marks the final epilog
/// block, where uses after the loop are also rewritten.
void ModuloScheduleExpander::generatePhis(
    MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
    MachineBasicBlock *KernelBB, ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
    InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
    bool IsLast) {
  // Compute the stage number that contains the initial Phi value, and
  // the Phi from the previous stage.
  unsigned PrologStage = 0;
  unsigned PrevStage = 0;
  unsigned StageDiff = CurStageNum - LastStageNum;
  bool InKernel = (StageDiff == 0);
  if (InKernel) {
    PrologStage = LastStageNum - 1;
    PrevStage = CurStageNum;
  } else {
    PrologStage = LastStageNum - StageDiff;
    PrevStage = LastStageNum + StageDiff - 1;
  }

  // Walk only the non-Phi instructions of the original loop block.
  for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
                                   BBE = BB->instr_end();
       BBI != BBE; ++BBI) {
    for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
      MachineOperand &MO = BBI->getOperand(i);
      if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
        continue;

      int StageScheduled = Schedule.getStage(&*BBI);
      assert(StageScheduled != -1 && "Expecting scheduled instruction.");
      Register Def = MO.getReg();
      unsigned NumPhis = getStagesForReg(Def, CurStageNum);
      // An instruction scheduled in stage 0 and is used after the loop
      // requires a phi in the epilog for the last definition from either
      // the kernel or prolog.
      if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
          hasUseAfterLoop(Def, BB, MRI))
        NumPhis = 1;
      if (!InKernel && (unsigned)StageScheduled > PrologStage)
        continue;

      Register PhiOp2;
      if (InKernel) {
        PhiOp2 = VRMap[PrevStage][Def];
        // If the previous-stage value is itself a Phi in this block, take its
        // loop-carried operand instead.
        if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
          if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
            PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
      }
      // The number of Phis can't exceed the number of prolog stages. The
      // prolog stage number is zero based.
      if (NumPhis > PrologStage + 1 - StageScheduled)
        NumPhis = PrologStage + 1 - StageScheduled;
      for (unsigned np = 0; np < NumPhis; ++np) {
        // Example for
        // Org:
        //   %Org = ... (Scheduled at Stage#0, NumPhi = 2)
        //
        // Prolog0 (Stage0):
        //   %Clone0 = ...
        // Prolog1 (Stage1):
        //   %Clone1 = ...
        // Kernel (Stage2):
        //   %Phi0 = Phi %Clone1, Prolog1, %Clone2, Kernel
        //   %Phi1 = Phi %Clone0, Prolog1, %Phi0, Kernel
        //   %Clone2 = ...
        // Epilog0 (Stage3):
        //   %Phi2 = Phi %Clone1, Prolog1, %Clone2, Kernel
        //   %Phi3 = Phi %Clone0, Prolog1, %Phi0, Kernel
        // Epilog1 (Stage4):
        //   %Phi4 = Phi %Clone0, Prolog0, %Phi2, Epilog0
        //
        // VRMap = {0: %Clone0, 1: %Clone1, 2: %Clone2}
        // VRMapPhi (after Kernel) = {0: %Phi1, 1: %Phi0}
        // VRMapPhi (after Epilog0) = {0: %Phi3, 1: %Phi2}

        Register PhiOp1 = VRMap[PrologStage][Def];
        if (np <= PrologStage)
          PhiOp1 = VRMap[PrologStage - np][Def];
        if (!InKernel) {
          if (PrevStage == LastStageNum && np == 0)
            PhiOp2 = VRMap[LastStageNum][Def];
          else
            PhiOp2 = VRMapPhi[PrevStage - np][Def];
        }

        const TargetRegisterClass *RC = MRI.getRegClass(Def);
        Register NewReg = MRI.createVirtualRegister(RC);

        MachineInstrBuilder NewPhi =
            BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
                    TII->get(TargetOpcode::PHI), NewReg);
        NewPhi.addReg(PhiOp1).addMBB(BB1);
        NewPhi.addReg(PhiOp2).addMBB(BB2);
        LIS.InsertMachineInstrInMaps(*NewPhi);
        if (np == 0)
          InstrMap[NewPhi] = &*BBI;

        // Rewrite uses and update the map. The actions depend upon whether
        // we generating code for the kernel or epilog blocks.
        if (InKernel) {
          rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp1,
                                NewReg);
          rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp2,
                                NewReg);

          PhiOp2 = NewReg;
          VRMapPhi[PrevStage - np - 1][Def] = NewReg;
        } else {
          VRMapPhi[CurStageNum - np][Def] = NewReg;
          if (np == NumPhis - 1)
            rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
                                  NewReg);
        }
        if (IsLast && np == NumPhis - 1)
          replaceRegUsesAfterLoop(Def, NewReg, BB, MRI);
      }
    }
  }
}
737 
738 /// Remove instructions that generate values with no uses.
739 /// Typically, these are induction variable operations that generate values
740 /// used in the loop itself.  A dead instruction has a definition with
741 /// no uses, or uses that occur in the original loop only.
742 void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
743                                                     MBBVectorTy &EpilogBBs) {
744   // For each epilog block, check that the value defined by each instruction
745   // is used.  If not, delete it.
746   for (MachineBasicBlock *MBB : llvm::reverse(EpilogBBs))
747     for (MachineBasicBlock::reverse_instr_iterator MI = MBB->instr_rbegin(),
748                                                    ME = MBB->instr_rend();
749          MI != ME;) {
750       // From DeadMachineInstructionElem. Don't delete inline assembly.
751       if (MI->isInlineAsm()) {
752         ++MI;
753         continue;
754       }
755       bool SawStore = false;
756       // Check if it's safe to remove the instruction due to side effects.
757       // We can, and want to, remove Phis here.
758       if (!MI->isSafeToMove(SawStore) && !MI->isPHI()) {
759         ++MI;
760         continue;
761       }
762       bool used = true;
763       for (const MachineOperand &MO : MI->all_defs()) {
764         Register reg = MO.getReg();
765         // Assume physical registers are used, unless they are marked dead.
766         if (reg.isPhysical()) {
767           used = !MO.isDead();
768           if (used)
769             break;
770           continue;
771         }
772         unsigned realUses = 0;
773         for (const MachineOperand &U : MRI.use_operands(reg)) {
774           // Check if there are any uses that occur only in the original
775           // loop.  If so, that's not a real use.
776           if (U.getParent()->getParent() != BB) {
777             realUses++;
778             used = true;
779             break;
780           }
781         }
782         if (realUses > 0)
783           break;
784         used = false;
785       }
786       if (!used) {
787         LIS.RemoveMachineInstrFromMaps(*MI);
788         MI++->eraseFromParent();
789         continue;
790       }
791       ++MI;
792     }
793   // In the kernel block, check if we can remove a Phi that generates a value
794   // used in an instruction removed in the epilog block.
795   for (MachineInstr &MI : llvm::make_early_inc_range(KernelBB->phis())) {
796     Register reg = MI.getOperand(0).getReg();
797     if (MRI.use_begin(reg) == MRI.use_end()) {
798       LIS.RemoveMachineInstrFromMaps(MI);
799       MI.eraseFromParent();
800     }
801   }
802 }
803 
/// For loop carried definitions, we split the lifetime of a virtual register
/// that has uses past the definition in the next iteration. A copy with a new
/// virtual register is inserted before the definition, which helps with
/// generating a better register assignment.
///
///   v1 = phi(a, v2)     v1 = phi(a, v2)
///   v2 = phi(b, v3)     v2 = phi(b, v3)
///   v3 = ..             v4 = copy v1
///   .. = v1             v3 = ..
///                       .. = v4
void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB,
                                            MBBVectorTy &EpilogBBs) {
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  for (auto &PHI : KernelBB->phis()) {
    Register Def = PHI.getOperand(0).getReg();
    // Check for any Phi definition that is used as an operand of another Phi
    // in the same block.
    for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
                                                 E = MRI.use_instr_end();
         I != E; ++I) {
      if (I->isPHI() && I->getParent() == KernelBB) {
        // Get the loop carried definition.
        Register LCDef = getLoopPhiReg(PHI, KernelBB);
        if (!LCDef)
          continue;
        // Only split when the loop-carried value is computed by a non-Phi
        // instruction inside the kernel block.
        MachineInstr *MI = MRI.getVRegDef(LCDef);
        if (!MI || MI->getParent() != KernelBB || MI->isPHI())
          continue;
        // Search through the rest of the block looking for uses of the Phi
        // definition. If one occurs, then split the lifetime.
        Register SplitReg;
        for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
                                    KernelBB->instr_end()))
          if (BBJ.readsRegister(Def, /*TRI=*/nullptr)) {
            // We split the lifetime when we find the first use.
            if (!SplitReg) {
              // The copy is inserted immediately before the loop-carried
              // definition, so it captures this iteration's Phi value.
              SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
              MachineInstr *newCopy =
                  BuildMI(*KernelBB, MI, MI->getDebugLoc(),
                          TII->get(TargetOpcode::COPY), SplitReg)
                      .addReg(Def);
              LIS.InsertMachineInstrInMaps(*newCopy);
            }
            BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
          }
        if (!SplitReg)
          continue;
        // Search through each of the epilog blocks for any uses to be renamed.
        for (auto &Epilog : EpilogBBs)
          for (auto &I : *Epilog)
            if (I.readsRegister(Def, /*TRI=*/nullptr))
              I.substituteRegister(Def, SplitReg, 0, *TRI);
        // At most one lifetime split is performed per Phi definition.
        break;
      }
    }
  }
}
861 
862 /// Remove the incoming block from the Phis in a basic block.
863 static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
864   for (MachineInstr &MI : *BB) {
865     if (!MI.isPHI())
866       break;
867     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
868       if (MI.getOperand(i + 1).getMBB() == Incoming) {
869         MI.removeOperand(i + 1);
870         MI.removeOperand(i);
871         break;
872       }
873   }
874 }
875 
/// Create branches from each prolog basic block to the appropriate epilog
/// block.  These edges are needed if the loop ends before reaching the
/// kernel.
void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB,
                                         MBBVectorTy &PrologBBs,
                                         MachineBasicBlock *KernelBB,
                                         MBBVectorTy &EpilogBBs,
                                         ValueMapTy *VRMap) {
  assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
  MachineBasicBlock *LastPro = KernelBB;
  MachineBasicBlock *LastEpi = KernelBB;

  // Start from the blocks connected to the kernel and work "out"
  // to the first prolog and the last epilog blocks.
  unsigned MaxIter = PrologBBs.size() - 1;
  for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
    // Add branches to the prolog that go to the corresponding
    // epilog, and the fall-thru prolog/kernel block.
    MachineBasicBlock *Prolog = PrologBBs[j];
    MachineBasicBlock *Epilog = EpilogBBs[i];

    SmallVector<MachineOperand, 4> Cond;
    // Ask whether the trip count is statically known to exceed j + 1
    // iterations; std::nullopt means it must be checked at runtime.
    std::optional<bool> StaticallyGreater =
        LoopInfo->createTripCountGreaterCondition(j + 1, *Prolog, Cond);
    unsigned numAdded = 0;
    if (!StaticallyGreater) {
      // Unknown at compile time: conditionally branch to the epilog, with a
      // fall-through into the next prolog stage (or the kernel).
      Prolog->addSuccessor(Epilog);
      numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
    } else if (*StaticallyGreater == false) {
      // The loop is known to end before this stage: branch straight to the
      // epilog and delete the now-unreachable inner prolog/epilog pair.
      Prolog->addSuccessor(Epilog);
      Prolog->removeSuccessor(LastPro);
      LastEpi->removeSuccessor(Epilog);
      numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
      removePhis(Epilog, LastEpi);
      // Remove the blocks that are no longer referenced.
      if (LastPro != LastEpi) {
        for (auto &MI : *LastEpi)
          LIS.RemoveMachineInstrFromMaps(MI);
        LastEpi->clear();
        LastEpi->eraseFromParent();
      }
      if (LastPro == KernelBB) {
        // The kernel itself is being deleted; notify the loop info and
        // record that no kernel block remains.
        LoopInfo->disposed(&LIS);
        NewKernel = nullptr;
      }
      for (auto &MI : *LastPro)
        LIS.RemoveMachineInstrFromMaps(MI);
      LastPro->clear();
      LastPro->eraseFromParent();
    } else {
      // The trip count is known to reach this stage: fall through to the
      // next prolog/kernel block and drop this prolog's Phi entries.
      numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
      removePhis(Epilog, Prolog);
    }
    LastPro = Prolog;
    LastEpi = Epilog;
    // Rewrite the virtual registers used by the numAdded branch
    // instructions just inserted at the end of this prolog.
    for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
                                                   E = Prolog->instr_rend();
         I != E && numAdded > 0; ++I, --numAdded)
      updateInstruction(&*I, false, j, 0, VRMap);
  }

  if (NewKernel) {
    LoopInfo->setPreheader(PrologBBs[MaxIter]);
    LoopInfo->adjustTripCount(-(MaxIter + 1));
  }
}
942 
/// Return true if we can compute the amount the instruction changes
/// during each iteration. Set Delta to the amount of the change.
bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) {
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  const MachineOperand *BaseOp;
  int64_t Offset;
  bool OffsetIsScalable;
  // The instruction must be a memory access the target can decompose into a
  // base operand plus a fixed offset.
  if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
    return false;

  // FIXME: This algorithm assumes instructions have fixed-size offsets.
  if (OffsetIsScalable)
    return false;

  if (!BaseOp->isReg())
    return false;

  Register BaseReg = BaseOp->getReg();

  MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if there is a Phi. If so, get the definition in the loop.
  MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
  if (BaseDef && BaseDef->isPHI()) {
    BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
    BaseDef = MRI.getVRegDef(BaseReg);
  }
  if (!BaseDef)
    return false;

  int D = 0;
  // NOTE(review): this condition looks suspicious. When getIncrementValue()
  // fails, D stays 0 (>= 0) and we correctly bail out, but a successful call
  // that returns a negative D falls through and is assigned to the unsigned
  // Delta below. Presumably `|| D < 0` was intended — confirm before
  // changing, since this is long-standing behavior.
  if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
    return false;

  Delta = D;
  return true;
}
979 
980 /// Update the memory operand with a new offset when the pipeliner
981 /// generates a new copy of the instruction that refers to a
982 /// different memory location.
983 void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI,
984                                                MachineInstr &OldMI,
985                                                unsigned Num) {
986   if (Num == 0)
987     return;
988   // If the instruction has memory operands, then adjust the offset
989   // when the instruction appears in different stages.
990   if (NewMI.memoperands_empty())
991     return;
992   SmallVector<MachineMemOperand *, 2> NewMMOs;
993   for (MachineMemOperand *MMO : NewMI.memoperands()) {
994     // TODO: Figure out whether isAtomic is really necessary (see D57601).
995     if (MMO->isVolatile() || MMO->isAtomic() ||
996         (MMO->isInvariant() && MMO->isDereferenceable()) ||
997         (!MMO->getValue())) {
998       NewMMOs.push_back(MMO);
999       continue;
1000     }
1001     unsigned Delta;
1002     if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
1003       int64_t AdjOffset = Delta * Num;
1004       NewMMOs.push_back(
1005           MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
1006     } else {
1007       NewMMOs.push_back(MF.getMachineMemOperand(
1008           MMO, 0, LocationSize::beforeOrAfterPointer()));
1009     }
1010   }
1011   NewMI.setMemRefs(MF, NewMMOs);
1012 }
1013 
1014 /// Clone the instruction for the new pipelined loop and update the
1015 /// memory operands, if needed.
1016 MachineInstr *ModuloScheduleExpander::cloneInstr(MachineInstr *OldMI,
1017                                                  unsigned CurStageNum,
1018                                                  unsigned InstStageNum) {
1019   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
1020   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
1021   return NewMI;
1022 }
1023 
1024 /// Clone the instruction for the new pipelined loop. If needed, this
1025 /// function updates the instruction using the values saved in the
1026 /// InstrChanges structure.
1027 MachineInstr *ModuloScheduleExpander::cloneAndChangeInstr(
1028     MachineInstr *OldMI, unsigned CurStageNum, unsigned InstStageNum) {
1029   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
1030   auto It = InstrChanges.find(OldMI);
1031   if (It != InstrChanges.end()) {
1032     std::pair<Register, int64_t> RegAndOffset = It->second;
1033     unsigned BasePos, OffsetPos;
1034     if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
1035       return nullptr;
1036     int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
1037     MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
1038     if (Schedule.getStage(LoopDef) > (signed)InstStageNum)
1039       NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
1040     NewMI->getOperand(OffsetPos).setImm(NewOffset);
1041   }
1042   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
1043   return NewMI;
1044 }
1045 
1046 /// Update the machine instruction with new virtual registers.  This
1047 /// function may change the definitions and/or uses.
1048 void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI,
1049                                                bool LastDef,
1050                                                unsigned CurStageNum,
1051                                                unsigned InstrStageNum,
1052                                                ValueMapTy *VRMap) {
1053   for (MachineOperand &MO : NewMI->operands()) {
1054     if (!MO.isReg() || !MO.getReg().isVirtual())
1055       continue;
1056     Register reg = MO.getReg();
1057     if (MO.isDef()) {
1058       // Create a new virtual register for the definition.
1059       const TargetRegisterClass *RC = MRI.getRegClass(reg);
1060       Register NewReg = MRI.createVirtualRegister(RC);
1061       MO.setReg(NewReg);
1062       VRMap[CurStageNum][reg] = NewReg;
1063       if (LastDef)
1064         replaceRegUsesAfterLoop(reg, NewReg, BB, MRI);
1065     } else if (MO.isUse()) {
1066       MachineInstr *Def = MRI.getVRegDef(reg);
1067       // Compute the stage that contains the last definition for instruction.
1068       int DefStageNum = Schedule.getStage(Def);
1069       unsigned StageNum = CurStageNum;
1070       if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
1071         // Compute the difference in stages between the defintion and the use.
1072         unsigned StageDiff = (InstrStageNum - DefStageNum);
1073         // Make an adjustment to get the last definition.
1074         StageNum -= StageDiff;
1075       }
1076       if (auto It = VRMap[StageNum].find(reg); It != VRMap[StageNum].end())
1077         MO.setReg(It->second);
1078     }
1079   }
1080 }
1081 
1082 /// Return the instruction in the loop that defines the register.
1083 /// If the definition is a Phi, then follow the Phi operand to
1084 /// the instruction in the loop.
1085 MachineInstr *ModuloScheduleExpander::findDefInLoop(Register Reg) {
1086   SmallPtrSet<MachineInstr *, 8> Visited;
1087   MachineInstr *Def = MRI.getVRegDef(Reg);
1088   while (Def->isPHI()) {
1089     if (!Visited.insert(Def).second)
1090       break;
1091     for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
1092       if (Def->getOperand(i + 1).getMBB() == BB) {
1093         Def = MRI.getVRegDef(Def->getOperand(i).getReg());
1094         break;
1095       }
1096   }
1097   return Def;
1098 }
1099 
1100 /// Return the new name for the value from the previous stage.
1101 Register ModuloScheduleExpander::getPrevMapVal(
1102     unsigned StageNum, unsigned PhiStage, Register LoopVal, unsigned LoopStage,
1103     ValueMapTy *VRMap, MachineBasicBlock *BB) {
1104   Register PrevVal;
1105   if (StageNum > PhiStage) {
1106     MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
1107     if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
1108       // The name is defined in the previous stage.
1109       PrevVal = VRMap[StageNum - 1][LoopVal];
1110     else if (VRMap[StageNum].count(LoopVal))
1111       // The previous name is defined in the current stage when the instruction
1112       // order is swapped.
1113       PrevVal = VRMap[StageNum][LoopVal];
1114     else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
1115       // The loop value hasn't yet been scheduled.
1116       PrevVal = LoopVal;
1117     else if (StageNum == PhiStage + 1)
1118       // The loop value is another phi, which has not been scheduled.
1119       PrevVal = getInitPhiReg(*LoopInst, BB);
1120     else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
1121       // The loop value is another phi, which has been scheduled.
1122       PrevVal =
1123           getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
1124                         LoopStage, VRMap, BB);
1125   }
1126   return PrevVal;
1127 }
1128 
1129 /// Rewrite the Phi values in the specified block to use the mappings
1130 /// from the initial operand. Once the Phi is scheduled, we switch
1131 /// to using the loop value instead of the Phi value, so those names
1132 /// do not need to be rewritten.
1133 void ModuloScheduleExpander::rewritePhiValues(MachineBasicBlock *NewBB,
1134                                               unsigned StageNum,
1135                                               ValueMapTy *VRMap,
1136                                               InstrMapTy &InstrMap) {
1137   for (auto &PHI : BB->phis()) {
1138     Register InitVal;
1139     Register LoopVal;
1140     getPhiRegs(PHI, BB, InitVal, LoopVal);
1141     Register PhiDef = PHI.getOperand(0).getReg();
1142 
1143     unsigned PhiStage = (unsigned)Schedule.getStage(MRI.getVRegDef(PhiDef));
1144     unsigned LoopStage = (unsigned)Schedule.getStage(MRI.getVRegDef(LoopVal));
1145     unsigned NumPhis = getStagesForPhi(PhiDef);
1146     if (NumPhis > StageNum)
1147       NumPhis = StageNum;
1148     for (unsigned np = 0; np <= NumPhis; ++np) {
1149       Register NewVal =
1150           getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
1151       if (!NewVal)
1152         NewVal = InitVal;
1153       rewriteScheduledInstr(NewBB, InstrMap, StageNum - np, np, &PHI, PhiDef,
1154                             NewVal);
1155     }
1156   }
1157 }
1158 
/// Rewrite a previously scheduled instruction to use the register value
/// from the new instruction. Make sure the instruction occurs in the
/// basic block, and we don't change the uses in the new instruction.
void ModuloScheduleExpander::rewriteScheduledInstr(
    MachineBasicBlock *BB, InstrMapTy &InstrMap, unsigned CurStageNum,
    unsigned PhiNum, MachineInstr *Phi, Register OldReg, Register NewReg,
    Register PrevReg) {
  // We are emitting a prolog block when fewer than NumStages - 1 stages
  // have been generated so far.
  bool InProlog = (CurStageNum < (unsigned)Schedule.getNumStages() - 1);
  int StagePhi = Schedule.getStage(Phi) + PhiNum;
  // Rewrite uses that have been scheduled already to use the new
  // Phi register.
  for (MachineOperand &UseOp :
       llvm::make_early_inc_range(MRI.use_operands(OldReg))) {
    MachineInstr *UseMI = UseOp.getParent();
    // Only rewrite uses inside the block being generated.
    if (UseMI->getParent() != BB)
      continue;
    if (UseMI->isPHI()) {
      // Skip the Phi that defines NewReg itself, and only rewrite Phis
      // whose loop-carried operand is OldReg.
      if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
        continue;
      if (getLoopPhiReg(*UseMI, BB) != OldReg)
        continue;
    }
    InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
    assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
    MachineInstr *OrigMI = OrigInstr->second;
    int StageSched = Schedule.getStage(OrigMI);
    int CycleSched = Schedule.getCycle(OrigMI);
    // ReplaceReg stays invalid (no rewrite) unless one of the cases below
    // matches; later cases may override earlier ones.
    Register ReplaceReg;
    // This is the stage for the scheduled instruction.
    if (StagePhi == StageSched && Phi->isPHI()) {
      int CyclePhi = Schedule.getCycle(Phi);
      if (PrevReg && InProlog)
        ReplaceReg = PrevReg;
      else if (PrevReg && !isLoopCarried(*Phi) &&
               (CyclePhi <= CycleSched || OrigMI->isPHI()))
        ReplaceReg = PrevReg;
      else
        ReplaceReg = NewReg;
    }
    // The scheduled instruction occurs before the scheduled Phi, and the
    // Phi is not loop carried.
    if (!InProlog && StagePhi + 1 == StageSched && !isLoopCarried(*Phi))
      ReplaceReg = NewReg;
    if (StagePhi > StageSched && Phi->isPHI())
      ReplaceReg = NewReg;
    if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
      ReplaceReg = NewReg;
    if (ReplaceReg) {
      // Constrain the register class when possible; otherwise insert a COPY
      // into a register of the required class and use that instead.
      const TargetRegisterClass *NRC =
          MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
      if (NRC)
        UseOp.setReg(ReplaceReg);
      else {
        Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
        MachineInstr *newCopy = BuildMI(*BB, UseMI, UseMI->getDebugLoc(),
                                        TII->get(TargetOpcode::COPY), SplitReg)
                                    .addReg(ReplaceReg);
        UseOp.setReg(SplitReg);
        LIS.InsertMachineInstrInMaps(*newCopy);
      }
    }
  }
}
1222 
1223 bool ModuloScheduleExpander::isLoopCarried(MachineInstr &Phi) {
1224   if (!Phi.isPHI())
1225     return false;
1226   int DefCycle = Schedule.getCycle(&Phi);
1227   int DefStage = Schedule.getStage(&Phi);
1228 
1229   Register InitVal;
1230   Register LoopVal;
1231   getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
1232   MachineInstr *Use = MRI.getVRegDef(LoopVal);
1233   if (!Use || Use->isPHI())
1234     return true;
1235   int LoopCycle = Schedule.getCycle(Use);
1236   int LoopStage = Schedule.getStage(Use);
1237   return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
1238 }
1239 
1240 //===----------------------------------------------------------------------===//
1241 // PeelingModuloScheduleExpander implementation
1242 //===----------------------------------------------------------------------===//
1243 // This is a reimplementation of ModuloScheduleExpander that works by creating
1244 // a fully correct steady-state kernel and peeling off the prolog and epilogs.
1245 //===----------------------------------------------------------------------===//
1246 
1247 namespace {
1248 // Remove any dead phis in MBB. Dead phis either have only one block as input
1249 // (in which case they are the identity) or have no uses.
1250 void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI,
1251                        LiveIntervals *LIS, bool KeepSingleSrcPhi = false) {
1252   bool Changed = true;
1253   while (Changed) {
1254     Changed = false;
1255     for (MachineInstr &MI : llvm::make_early_inc_range(MBB->phis())) {
1256       assert(MI.isPHI());
1257       if (MRI.use_empty(MI.getOperand(0).getReg())) {
1258         if (LIS)
1259           LIS->RemoveMachineInstrFromMaps(MI);
1260         MI.eraseFromParent();
1261         Changed = true;
1262       } else if (!KeepSingleSrcPhi && MI.getNumExplicitOperands() == 3) {
1263         const TargetRegisterClass *ConstrainRegClass =
1264             MRI.constrainRegClass(MI.getOperand(1).getReg(),
1265                                   MRI.getRegClass(MI.getOperand(0).getReg()));
1266         assert(ConstrainRegClass &&
1267                "Expected a valid constrained register class!");
1268         (void)ConstrainRegClass;
1269         MRI.replaceRegWith(MI.getOperand(0).getReg(),
1270                            MI.getOperand(1).getReg());
1271         if (LIS)
1272           LIS->RemoveMachineInstrFromMaps(MI);
1273         MI.eraseFromParent();
1274         Changed = true;
1275       }
1276     }
1277   }
1278 }
1279 
/// Rewrites the kernel block in-place to adhere to the given schedule.
/// KernelRewriter holds all of the state required to perform the rewriting.
class KernelRewriter {
  // The schedule being applied.
  ModuloSchedule &S;
  // The loop block being rewritten.
  MachineBasicBlock *BB;
  // Preheader of the loop (recomputed in the constructor) and the loop's
  // exit block.
  MachineBasicBlock *PreheaderBB, *ExitBB;
  MachineRegisterInfo &MRI;
  const TargetInstrInfo *TII;
  // May be null; live intervals are only updated when present.
  LiveIntervals *LIS;

  // Map from register class to canonical undef register for that class.
  DenseMap<const TargetRegisterClass *, Register> Undefs;
  // Map from <LoopReg, InitReg> to phi register for all created phis. Note that
  // this map is only used when InitReg is non-undef.
  DenseMap<std::pair<Register, Register>, Register> Phis;
  // Map from LoopReg to phi register where the InitReg is undef.
  DenseMap<Register, Register> UndefPhis;

  // Reg is used by MI. Return the new register MI should use to adhere to the
  // schedule. Insert phis as necessary.
  Register remapUse(Register Reg, MachineInstr &MI);
  // Insert a phi that carries LoopReg from the loop body and InitReg otherwise.
  // If InitReg is not given it is chosen arbitrarily. It will either be undef
  // or will be chosen so as to share another phi.
  Register phi(Register LoopReg, std::optional<Register> InitReg = {},
               const TargetRegisterClass *RC = nullptr);
  // Create an undef register of the given register class.
  Register undef(const TargetRegisterClass *RC);

public:
  KernelRewriter(MachineLoop &L, ModuloSchedule &S, MachineBasicBlock *LoopBB,
                 LiveIntervals *LIS = nullptr);
  void rewrite();
};
1314 } // namespace
1315 
1316 KernelRewriter::KernelRewriter(MachineLoop &L, ModuloSchedule &S,
1317                                MachineBasicBlock *LoopBB, LiveIntervals *LIS)
1318     : S(S), BB(LoopBB), PreheaderBB(L.getLoopPreheader()),
1319       ExitBB(L.getExitBlock()), MRI(BB->getParent()->getRegInfo()),
1320       TII(BB->getParent()->getSubtarget().getInstrInfo()), LIS(LIS) {
1321   PreheaderBB = *BB->pred_begin();
1322   if (PreheaderBB == BB)
1323     PreheaderBB = *std::next(BB->pred_begin());
1324 }
1325 
1326 void KernelRewriter::rewrite() {
1327   // Rearrange the loop to be in schedule order. Note that the schedule may
1328   // contain instructions that are not owned by the loop block (InstrChanges and
1329   // friends), so we gracefully handle unowned instructions and delete any
1330   // instructions that weren't in the schedule.
1331   auto InsertPt = BB->getFirstTerminator();
1332   MachineInstr *FirstMI = nullptr;
1333   for (MachineInstr *MI : S.getInstructions()) {
1334     if (MI->isPHI())
1335       continue;
1336     if (MI->getParent())
1337       MI->removeFromParent();
1338     BB->insert(InsertPt, MI);
1339     if (!FirstMI)
1340       FirstMI = MI;
1341   }
1342   assert(FirstMI && "Failed to find first MI in schedule");
1343 
1344   // At this point all of the scheduled instructions are between FirstMI
1345   // and the end of the block. Kill from the first non-phi to FirstMI.
1346   for (auto I = BB->getFirstNonPHI(); I != FirstMI->getIterator();) {
1347     if (LIS)
1348       LIS->RemoveMachineInstrFromMaps(*I);
1349     (I++)->eraseFromParent();
1350   }
1351 
1352   // Now remap every instruction in the loop.
1353   for (MachineInstr &MI : *BB) {
1354     if (MI.isPHI() || MI.isTerminator())
1355       continue;
1356     for (MachineOperand &MO : MI.uses()) {
1357       if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit())
1358         continue;
1359       Register Reg = remapUse(MO.getReg(), MI);
1360       MO.setReg(Reg);
1361     }
1362   }
1363   EliminateDeadPhis(BB, MRI, LIS);
1364 
1365   // Ensure a phi exists for all instructions that are either referenced by
1366   // an illegal phi or by an instruction outside the loop. This allows us to
1367   // treat remaps of these values the same as "normal" values that come from
1368   // loop-carried phis.
1369   for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) {
1370     if (MI->isPHI()) {
1371       Register R = MI->getOperand(0).getReg();
1372       phi(R);
1373       continue;
1374     }
1375 
1376     for (MachineOperand &Def : MI->defs()) {
1377       for (MachineInstr &MI : MRI.use_instructions(Def.getReg())) {
1378         if (MI.getParent() != BB) {
1379           phi(Def.getReg());
1380           break;
1381         }
1382       }
1383     }
1384   }
1385 }
1386 
/// Remap the use of \p Reg by \p MI inside the rewritten kernel so the
/// consumer reads the value produced for its own pipeline stage. When the
/// producer is scheduled in an earlier stage, a chain of phis is inserted to
/// carry the value across the intervening stages. Returns the register the
/// use should be rewritten to (possibly \p Reg itself).
Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) {
  MachineInstr *Producer = MRI.getUniqueVRegDef(Reg);
  if (!Producer)
    return Reg;

  int ConsumerStage = S.getStage(&MI);
  if (!Producer->isPHI()) {
    // Non-phi producers are simple to remap. Insert as many phis as the
    // difference between the consumer and producer stages.
    if (Producer->getParent() != BB)
      // Producer was not inside the loop. Use the register as-is.
      return Reg;
    int ProducerStage = S.getStage(Producer);
    assert(ConsumerStage != -1 &&
           "In-loop consumer should always be scheduled!");
    assert(ConsumerStage >= ProducerStage);
    unsigned StageDiff = ConsumerStage - ProducerStage;

    // One phi per stage crossed; phi() memoizes, so repeated requests for the
    // same chain reuse the same registers.
    for (unsigned I = 0; I < StageDiff; ++I)
      Reg = phi(Reg);
    return Reg;
  }

  // First, dive through the phi chain to find the defaults for the generated
  // phis.
  SmallVector<std::optional<Register>, 4> Defaults;
  Register LoopReg = Reg;
  auto LoopProducer = Producer;
  while (LoopProducer->isPHI() && LoopProducer->getParent() == BB) {
    LoopReg = getLoopPhiReg(*LoopProducer, BB);
    Defaults.emplace_back(getInitPhiReg(*LoopProducer, BB));
    LoopProducer = MRI.getUniqueVRegDef(LoopReg);
    assert(LoopProducer);
  }
  int LoopProducerStage = S.getStage(LoopProducer);

  std::optional<Register> IllegalPhiDefault;

  if (LoopProducerStage == -1) {
    // Do nothing.
  } else if (LoopProducerStage > ConsumerStage) {
    // This schedule is only representable if ProducerStage == ConsumerStage+1.
    // In addition, Consumer's cycle must be scheduled after Producer in the
    // rescheduled loop. This is enforced by the pipeliner's ASAP and ALAP
    // functions.
#ifndef NDEBUG // Silence unused variables in non-asserts mode.
    int LoopProducerCycle = S.getCycle(LoopProducer);
    int ConsumerCycle = S.getCycle(&MI);
#endif
    assert(LoopProducerCycle <= ConsumerCycle);
    assert(LoopProducerStage == ConsumerStage + 1);
    // Peel off the first phi from Defaults and insert a phi between producer
    // and consumer. This phi will not be at the front of the block so we
    // consider it illegal. It will only exist during the rewrite process; it
    // needs to exist while we peel off prologs because these could take the
    // default value. After that we can replace all uses with the loop producer
    // value.
    IllegalPhiDefault = Defaults.front();
    Defaults.erase(Defaults.begin());
  } else {
    assert(ConsumerStage >= LoopProducerStage);
    int StageDiff = ConsumerStage - LoopProducerStage;
    if (StageDiff > 0) {
      LLVM_DEBUG(dbgs() << " -- padding defaults array from " << Defaults.size()
                        << " to " << (Defaults.size() + StageDiff) << "\n");
      // If we need more phis than we have defaults for, pad out with undefs for
      // the earliest phis, which are at the end of the defaults chain (the
      // chain is in reverse order).
      Defaults.resize(Defaults.size() + StageDiff,
                      Defaults.empty() ? std::optional<Register>()
                                       : Defaults.back());
    }
  }

  // Now we know the number of stages to jump back, insert the phi chain.
  auto DefaultI = Defaults.rbegin();
  while (DefaultI != Defaults.rend())
    LoopReg = phi(LoopReg, *DefaultI++, MRI.getRegClass(Reg));

  if (IllegalPhiDefault) {
    // The consumer optionally consumes LoopProducer in the same iteration
    // (because the producer is scheduled at an earlier cycle than the consumer)
    // or the initial value. To facilitate this we create an illegal block here
    // by embedding a phi in the middle of the block. We will fix this up
    // immediately prior to pruning.
    auto RC = MRI.getRegClass(Reg);
    Register R = MRI.createVirtualRegister(RC);
    MachineInstr *IllegalPhi =
        BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R)
            .addReg(*IllegalPhiDefault)
            .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect.
            .addReg(LoopReg)
            .addMBB(BB); // Block choice is arbitrary and has no effect.
    // Illegal phi should belong to the producer stage so that it can be
    // filtered correctly during peeling.
    S.setStage(IllegalPhi, LoopProducerStage);
    return R;
  }

  return LoopReg;
}
1488 
/// Create (or reuse) a phi at the head of BB whose loop-carried input is
/// \p LoopReg and whose preheader input is \p InitReg, or undef when
/// \p InitReg is std::nullopt. Phis are memoized in Phis/UndefPhis so
/// equivalent requests share a single phi; a cached undef-phi may be
/// rewritten in place to take \p InitReg. Returns the phi's result register.
Register KernelRewriter::phi(Register LoopReg, std::optional<Register> InitReg,
                             const TargetRegisterClass *RC) {
  // If the init register is not undef, try and find an existing phi.
  if (InitReg) {
    auto I = Phis.find({LoopReg, *InitReg});
    if (I != Phis.end())
      return I->second;
  } else {
    // Init is undef: any phi on LoopReg will do, whatever its init input.
    for (auto &KV : Phis) {
      if (KV.first.first == LoopReg)
        return KV.second;
    }
  }

  // InitReg is either undef or no existing phi takes InitReg as input. Try and
  // find a phi that takes undef as input.
  auto I = UndefPhis.find(LoopReg);
  if (I != UndefPhis.end()) {
    Register R = I->second;
    if (!InitReg)
      // Found a phi taking undef as input, and this input is undef so return
      // without any more changes.
      return R;
    // Found a phi taking undef as input, so rewrite it to take InitReg.
    MachineInstr *MI = MRI.getVRegDef(R);
    MI->getOperand(1).setReg(*InitReg);
    Phis.insert({{LoopReg, *InitReg}, R});
    // The result must satisfy both its own class and InitReg's class.
    const TargetRegisterClass *ConstrainRegClass =
        MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
    assert(ConstrainRegClass && "Expected a valid constrained register class!");
    (void)ConstrainRegClass;
    UndefPhis.erase(I);
    return R;
  }

  // Failed to find any existing phi to reuse, so create a new one.
  if (!RC)
    RC = MRI.getRegClass(LoopReg);
  Register R = MRI.createVirtualRegister(RC);
  if (InitReg) {
    const TargetRegisterClass *ConstrainRegClass =
        MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
    assert(ConstrainRegClass && "Expected a valid constrained register class!");
    (void)ConstrainRegClass;
  }
  BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), R)
      .addReg(InitReg ? *InitReg : undef(RC))
      .addMBB(PreheaderBB)
      .addReg(LoopReg)
      .addMBB(BB);
  // Remember the new phi under the appropriate memoization map.
  if (!InitReg)
    UndefPhis[LoopReg] = R;
  else
    Phis[{LoopReg, *InitReg}] = R;
  return R;
}
1545 
1546 Register KernelRewriter::undef(const TargetRegisterClass *RC) {
1547   Register &R = Undefs[RC];
1548   if (R == 0) {
1549     // Create an IMPLICIT_DEF that defines this register if we need it.
1550     // All uses of this should be removed by the time we have finished unrolling
1551     // prologs and epilogs.
1552     R = MRI.createVirtualRegister(RC);
1553     auto *InsertBB = &PreheaderBB->getParent()->front();
1554     BuildMI(*InsertBB, InsertBB->getFirstTerminator(), DebugLoc(),
1555             TII->get(TargetOpcode::IMPLICIT_DEF), R);
1556   }
1557   return R;
1558 }
1559 
namespace {
/// Describes an operand in the kernel of a pipelined loop. Characteristics of
/// the operand are discovered, such as how many in-loop PHIs it has to jump
/// through and defaults for these phis.
class KernelOperandInfo {
  MachineBasicBlock *BB;
  MachineRegisterInfo &MRI;
  // Init values of each legal phi hopped through, in hop order.
  SmallVector<Register, 4> PhiDefaults;
  // The operand this info was built from.
  MachineOperand *Source;
  // The operand reached after looking through copies and phis.
  MachineOperand *Target;

public:
  KernelOperandInfo(MachineOperand *MO, MachineRegisterInfo &MRI,
                    const SmallPtrSetImpl<MachineInstr *> &IllegalPhis)
      : MRI(MRI) {
    Source = MO;
    BB = MO->getParent()->getParent();
    // Walk the chain of in-loop definitions, looking through full copies and
    // counting each legal phi hopped through.
    while (isRegInLoop(MO)) {
      MachineInstr *MI = MRI.getVRegDef(MO->getReg());
      if (MI->isFullCopy()) {
        MO = &MI->getOperand(1);
        continue;
      }
      if (!MI->isPHI())
        break;
      // If this is an illegal phi, don't count it in distance.
      if (IllegalPhis.count(MI)) {
        MO = &MI->getOperand(3);
        continue;
      }

      Register Default = getInitPhiReg(*MI, BB);
      // Follow the loop-carried input; phi operand layout is
      // (def, val1, bb1, val2, bb2).
      MO = MI->getOperand(2).getMBB() == BB ? &MI->getOperand(1)
                                            : &MI->getOperand(3);
      PhiDefaults.push_back(Default);
    }
    Target = MO;
  }

  // Operands are considered equivalent when their loop distance (number of
  // phis hopped through) matches; the default values themselves are not
  // compared.
  bool operator==(const KernelOperandInfo &Other) const {
    return PhiDefaults.size() == Other.PhiDefaults.size();
  }

  void print(raw_ostream &OS) const {
    OS << "use of " << *Source << ": distance(" << PhiDefaults.size() << ") in "
       << *Source->getParent();
  }

private:
  // True if MO is a virtual register defined inside the kernel block BB.
  bool isRegInLoop(MachineOperand *MO) {
    return MO->isReg() && MO->getReg().isVirtual() &&
           MRI.getVRegDef(MO->getReg())->getParent() == BB;
  }
};
} // namespace
1615 
1616 MachineBasicBlock *
1617 PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
1618   MachineBasicBlock *NewBB = PeelSingleBlockLoop(LPD, BB, MRI, TII);
1619   if (LPD == LPD_Front)
1620     PeeledFront.push_back(NewBB);
1621   else
1622     PeeledBack.push_front(NewBB);
1623   for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator();
1624        ++I, ++NI) {
1625     CanonicalMIs[&*I] = &*I;
1626     CanonicalMIs[&*NI] = &*I;
1627     BlockMIs[{NewBB, &*I}] = &*NI;
1628     BlockMIs[{BB, &*I}] = &*I;
1629   }
1630   return NewBB;
1631 }
1632 
/// Delete from \p MB every non-phi instruction whose scheduled stage is
/// known (not -1) and below \p MinStage, after redirecting all PHI users of
/// its defs to the equivalent register in their own block.
void PeelingModuloScheduleExpander::filterInstructions(MachineBasicBlock *MB,
                                                       int MinStage) {
  // Iterate backwards over the non-phi, non-terminator body of the block; the
  // iterator is advanced before any erasure so it stays valid.
  for (auto I = MB->getFirstInstrTerminator()->getReverseIterator();
       I != std::next(MB->getFirstNonPHI()->getReverseIterator());) {
    MachineInstr *MI = &*I++;
    int Stage = getStage(MI);
    if (Stage == -1 || Stage >= MinStage)
      continue;

    for (MachineOperand &DefMO : MI->defs()) {
      SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
      for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
        // Only PHIs can use values from this block by construction.
        // Match with the equivalent PHI in B.
        assert(UseMI.isPHI());
        Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
                                               MI->getParent());
        Subs.emplace_back(&UseMI, Reg);
      }
      // Substitute after the use_instructions() walk to avoid invalidating
      // the use list while iterating it.
      for (auto &Sub : Subs)
        Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
                                      *MRI.getTargetRegisterInfo());
    }
    if (LIS)
      LIS->RemoveMachineInstrFromMaps(*MI);
    MI->eraseFromParent();
  }
}
1661 
/// Move all instructions belonging to \p Stage from \p SourceBB into
/// \p DestBB (an adjacent epilog block), creating or deleting phis as needed
/// so that cross-block register uses remain correct.
void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
    MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
  auto InsertPt = DestBB->getFirstNonPHI();
  // Maps a register defined in SourceBB to the register DestBB must use.
  DenseMap<Register, Register> Remaps;
  for (MachineInstr &MI : llvm::make_early_inc_range(
           llvm::make_range(SourceBB->getFirstNonPHI(), SourceBB->end()))) {
    if (MI.isPHI()) {
      // This is an illegal PHI. If we move any instructions using an illegal
      // PHI, we need to create a legal Phi.
      if (getStage(&MI) != Stage) {
        // The legal Phi is not necessary if the illegal phi's stage
        // is being moved.
        Register PhiR = MI.getOperand(0).getReg();
        auto RC = MRI.getRegClass(PhiR);
        Register NR = MRI.createVirtualRegister(RC);
        MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(),
                                   DebugLoc(), TII->get(TargetOpcode::PHI), NR)
                               .addReg(PhiR)
                               .addMBB(SourceBB);
        BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NI;
        CanonicalMIs[NI] = CanonicalMIs[&MI];
        Remaps[PhiR] = NR;
      }
    }
    if (getStage(&MI) != Stage)
      continue;
    // Physically move the instruction and update the per-block maps so the
    // canonical kernel instruction now resolves to its new home.
    MI.removeFromParent();
    DestBB->insert(InsertPt, &MI);
    auto *KernelMI = CanonicalMIs[&MI];
    BlockMIs[{DestBB, KernelMI}] = &MI;
    BlockMIs.erase({SourceBB, KernelMI});
  }
  SmallVector<MachineInstr *, 4> PhiToDelete;
  for (MachineInstr &MI : DestBB->phis()) {
    assert(MI.getNumOperands() == 3);
    MachineInstr *Def = MRI.getVRegDef(MI.getOperand(1).getReg());
    // If the instruction referenced by the phi is moved inside the block
    // we don't need the phi anymore.
    if (getStage(Def) == Stage) {
      Register PhiReg = MI.getOperand(0).getReg();
      assert(Def->findRegisterDefOperandIdx(MI.getOperand(1).getReg(),
                                            /*TRI=*/nullptr) != -1);
      // Forward the phi's uses to the moved def; keep the phi's own def reg
      // intact until it is erased below.
      MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
      MI.getOperand(0).setReg(PhiReg);
      PhiToDelete.push_back(&MI);
    }
  }
  for (auto *P : PhiToDelete)
    P->eraseFromParent();
  InsertPt = DestBB->getFirstNonPHI();
  // Helper to clone Phi instructions into the destination block. We clone Phi
  // greedily to avoid combinatorial explosion of Phi instructions.
  auto clonePhi = [&](MachineInstr *Phi) {
    MachineInstr *NewMI = MF.CloneMachineInstr(Phi);
    DestBB->insert(InsertPt, NewMI);
    Register OrigR = Phi->getOperand(0).getReg();
    Register R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
    NewMI->getOperand(0).setReg(R);
    NewMI->getOperand(1).setReg(OrigR);
    NewMI->getOperand(2).setMBB(*DestBB->pred_begin());
    Remaps[OrigR] = R;
    CanonicalMIs[NewMI] = CanonicalMIs[Phi];
    BlockMIs[{DestBB, CanonicalMIs[Phi]}] = NewMI;
    PhiNodeLoopIteration[NewMI] = PhiNodeLoopIteration[Phi];
    return R;
  };
  // Rewrite DestBB's uses of remapped registers; uses of phis still living in
  // SourceBB get a freshly cloned phi on demand.
  for (auto I = DestBB->getFirstNonPHI(); I != DestBB->end(); ++I) {
    for (MachineOperand &MO : I->uses()) {
      if (!MO.isReg())
        continue;
      if (auto It = Remaps.find(MO.getReg()); It != Remaps.end())
        MO.setReg(It->second);
      else {
        // If we are using a phi from the source block we need to add a new phi
        // pointing to the old one.
        MachineInstr *Use = MRI.getUniqueVRegDef(MO.getReg());
        if (Use && Use->isPHI() && Use->getParent() == SourceBB) {
          Register R = clonePhi(Use);
          MO.setReg(R);
        }
      }
    }
  }
}
1746 
1747 Register
1748 PeelingModuloScheduleExpander::getPhiCanonicalReg(MachineInstr *CanonicalPhi,
1749                                                   MachineInstr *Phi) {
1750   unsigned distance = PhiNodeLoopIteration[Phi];
1751   MachineInstr *CanonicalUse = CanonicalPhi;
1752   Register CanonicalUseReg = CanonicalUse->getOperand(0).getReg();
1753   for (unsigned I = 0; I < distance; ++I) {
1754     assert(CanonicalUse->isPHI());
1755     assert(CanonicalUse->getNumOperands() == 5);
1756     unsigned LoopRegIdx = 3, InitRegIdx = 1;
1757     if (CanonicalUse->getOperand(2).getMBB() == CanonicalUse->getParent())
1758       std::swap(LoopRegIdx, InitRegIdx);
1759     CanonicalUseReg = CanonicalUse->getOperand(LoopRegIdx).getReg();
1760     CanonicalUse = MRI.getVRegDef(CanonicalUseReg);
1761   }
1762   return CanonicalUseReg;
1763 }
1764 
/// Peel all prolog and epilog copies of the kernel, set up the per-block
/// stage-liveness bookkeeping (LiveStages/AvailableStages), stitch the
/// prolog-to-epilog shortcut edges, and rewrite every register use across
/// the peeled blocks.
void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
  // The kernel itself has every stage both live and available.
  BitVector LS(Schedule.getNumStages(), true);
  BitVector AS(Schedule.getNumStages(), true);
  LiveStages[BB] = LS;
  AvailableStages[BB] = AS;

  // Peel out the prologs.
  LS.reset();
  for (int I = 0; I < Schedule.getNumStages() - 1; ++I) {
    LS[I] = true;
    Prologs.push_back(peelKernel(LPD_Front));
    LiveStages[Prologs.back()] = LS;
    AvailableStages[Prologs.back()] = LS;
  }

  // Create a block that will end up as the new loop exiting block (dominated by
  // all prologs and epilogs). It will only contain PHIs, in the same order as
  // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property
  // that the exiting block is a (sub) clone of BB. This in turn gives us the
  // property that any value defined in BB but used outside of BB is used by a
  // PHI in the exiting block.
  MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock();
  EliminateDeadPhis(ExitingBB, MRI, LIS, /*KeepSingleSrcPhi=*/true);
  // Push out the epilogs, again in reverse order.
  // We can't assume anything about the minimum loop trip count at this point,
  // so emit a fairly complex epilog.

  // We first peel number of stages minus one epilogue. Then we remove dead
  // stages and reorder instructions based on their stage. If we have 3 stages
  // we generate first:
  // E0[3, 2, 1]
  // E1[3', 2']
  // E2[3'']
  // And then we move instructions based on their stages to have:
  // E0[3]
  // E1[2, 3']
  // E2[1, 2', 3'']
  // The transformation is legal because we only move instructions past
  // instructions of a previous loop iteration.
  for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
    Epilogs.push_back(peelKernel(LPD_Back));
    MachineBasicBlock *B = Epilogs.back();
    filterInstructions(B, Schedule.getNumStages() - I);
    // Keep track at which iteration each phi belongs to. We need it to know
    // what version of the variable to use during prologue/epilogue stitching.
    EliminateDeadPhis(B, MRI, LIS, /*KeepSingleSrcPhi=*/true);
    for (MachineInstr &Phi : B->phis())
      PhiNodeLoopIteration[&Phi] = Schedule.getNumStages() - I;
  }
  for (size_t I = 0; I < Epilogs.size(); I++) {
    LS.reset();
    for (size_t J = I; J < Epilogs.size(); J++) {
      int Iteration = J;
      unsigned Stage = Schedule.getNumStages() - 1 + I - J;
      // Move stage one block at a time so that Phi nodes are updated correctly.
      for (size_t K = Iteration; K > I; K--)
        moveStageBetweenBlocks(Epilogs[K - 1], Epilogs[K], Stage);
      LS[Stage] = true;
    }
    LiveStages[Epilogs[I]] = LS;
    AvailableStages[Epilogs[I]] = AS;
  }

  // Now we've defined all the prolog and epilog blocks as a fallthrough
  // sequence, add the edges that will be followed if the loop trip count is
  // lower than the number of stages (connecting prologs directly with epilogs).
  auto PI = Prologs.begin();
  auto EI = Epilogs.begin();
  assert(Prologs.size() == Epilogs.size());
  for (; PI != Prologs.end(); ++PI, ++EI) {
    MachineBasicBlock *Pred = *(*EI)->pred_begin();
    (*PI)->addSuccessor(*EI);
    for (MachineInstr &MI : (*EI)->phis()) {
      Register Reg = MI.getOperand(1).getReg();
      MachineInstr *Use = MRI.getUniqueVRegDef(Reg);
      if (Use && Use->getParent() == Pred) {
        MachineInstr *CanonicalUse = CanonicalMIs[Use];
        if (CanonicalUse->isPHI()) {
          // If the use comes from a phi we need to skip as many phi as the
          // distance between the epilogue and the kernel. Trace through the phi
          // chain to find the right value.
          Reg = getPhiCanonicalReg(CanonicalUse, Use);
        }
        Reg = getEquivalentRegisterIn(Reg, *PI);
      }
      MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false));
      MI.addOperand(MachineOperand::CreateMBB(*PI));
    }
  }

  // Create a list of all blocks in order.
  SmallVector<MachineBasicBlock *, 8> Blocks;
  llvm::append_range(Blocks, PeeledFront);
  Blocks.push_back(BB);
  llvm::append_range(Blocks, PeeledBack);

  // Iterate in reverse order over all instructions, remapping as we go.
  for (MachineBasicBlock *B : reverse(Blocks)) {
    for (auto I = B->instr_rbegin();
         I != std::next(B->getFirstNonPHI()->getReverseIterator());) {
      MachineBasicBlock::reverse_instr_iterator MI = I++;
      rewriteUsesOf(&*MI);
    }
  }
  // Illegal phis were kept alive during rewriting (rewriteUsesOf postpones
  // their deletion); erase them now that remapping is complete.
  for (auto *MI : IllegalPhisToDelete) {
    if (LIS)
      LIS->RemoveMachineInstrFromMaps(*MI);
    MI->eraseFromParent();
  }
  IllegalPhisToDelete.clear();

  // Now all remapping has been done, we're free to optimize the generated code.
  for (MachineBasicBlock *B : reverse(Blocks))
    EliminateDeadPhis(B, MRI, LIS);
  EliminateDeadPhis(ExitingBB, MRI, LIS);
}
1881 
/// Create a poor-man's LCSSA exiting block between BB and its exit: a new
/// block containing one PHI per PHI in BB (in the same order), so every
/// value defined in BB but used outside it is routed through a PHI here.
/// Returns the new block.
MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() {
  MachineFunction &MF = *BB->getParent();
  // BB has two successors; the exit is the one that is not BB itself.
  MachineBasicBlock *Exit = *BB->succ_begin();
  if (Exit == BB)
    Exit = *std::next(BB->succ_begin());

  MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
  MF.insert(std::next(BB->getIterator()), NewBB);

  // Clone all phis in BB into NewBB and rewrite.
  for (MachineInstr &MI : BB->phis()) {
    auto RC = MRI.getRegClass(MI.getOperand(0).getReg());
    // Operand 3 is the loop-carried value of the phi; out-of-loop users must
    // now read it through a new phi in NewBB.
    Register OldR = MI.getOperand(3).getReg();
    Register R = MRI.createVirtualRegister(RC);
    SmallVector<MachineInstr *, 4> Uses;
    for (MachineInstr &Use : MRI.use_instructions(OldR))
      if (Use.getParent() != BB)
        Uses.push_back(&Use);
    for (MachineInstr *Use : Uses)
      Use->substituteRegister(OldR, R, /*SubIdx=*/0,
                              *MRI.getTargetRegisterInfo());
    MachineInstr *NI = BuildMI(NewBB, DebugLoc(), TII->get(TargetOpcode::PHI), R)
        .addReg(OldR)
        .addMBB(BB);
    BlockMIs[{NewBB, &MI}] = NI;
    CanonicalMIs[NI] = &MI;
  }
  // Splice NewBB into the CFG between BB and Exit.
  BB->replaceSuccessor(Exit, NewBB);
  Exit->replacePhiUsesWith(BB, NewBB);
  NewBB->addSuccessor(Exit);

  // Redirect BB's branch targets from Exit to NewBB and have NewBB jump on
  // to the original exit.
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  bool CanAnalyzeBr = !TII->analyzeBranch(*BB, TBB, FBB, Cond);
  (void)CanAnalyzeBr;
  assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
  TII->removeBranch(*BB);
  TII->insertBranch(*BB, TBB == Exit ? NewBB : TBB, FBB == Exit ? NewBB : FBB,
                    Cond, DebugLoc());
  TII->insertUnconditionalBranch(*NewBB, Exit, DebugLoc());
  return NewBB;
}
1924 
1925 Register
1926 PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg,
1927                                                        MachineBasicBlock *BB) {
1928   MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
1929   unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg, /*TRI=*/nullptr);
1930   return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(OpIdx).getReg();
1931 }
1932 
/// Rewrite or erase \p MI after peeling. Illegal PHIs are collapsed onto
/// their loop-carried value (or their initial value, if the producing stage
/// is not available in this block); instructions whose stage is not live in
/// their block are deleted after redirecting their PHI users.
void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
  if (MI->isPHI()) {
    // This is an illegal PHI. The loop-carried (desired) value is operand 3,
    // and it is produced by this block.
    Register PhiR = MI->getOperand(0).getReg();
    Register R = MI->getOperand(3).getReg();
    int RMIStage = getStage(MRI.getUniqueVRegDef(R));
    if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(RMIStage))
      R = MI->getOperand(1).getReg();
    MRI.setRegClass(R, MRI.getRegClass(PhiR));
    MRI.replaceRegWith(PhiR, R);
    // Postpone deleting the Phi as it may be referenced by BlockMIs and used
    // later to figure out how to remap registers.
    MI->getOperand(0).setReg(PhiR);
    IllegalPhisToDelete.push_back(MI);
    return;
  }

  int Stage = getStage(MI);
  if (Stage == -1 || LiveStages.count(MI->getParent()) == 0 ||
      LiveStages[MI->getParent()].test(Stage))
    // Instruction is live, no rewriting to do.
    return;

  for (MachineOperand &DefMO : MI->defs()) {
    SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
    for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
      // Only PHIs can use values from this block by construction.
      // Match with the equivalent PHI in B.
      assert(UseMI.isPHI());
      Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
                                             MI->getParent());
      Subs.emplace_back(&UseMI, Reg);
    }
    // Substitute after the use_instructions() walk to avoid invalidating the
    // use list while iterating it.
    for (auto &Sub : Subs)
      Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
                                    *MRI.getTargetRegisterInfo());
  }
  if (LIS)
    LIS->RemoveMachineInstrFromMaps(*MI);
  MI->eraseFromParent();
}
1975 
/// Rewire branches between the peeled prologs and their matching epilogs.
/// Depending on what is statically known about the trip count, each prolog
/// either branches dynamically to its epilog, always jumps to it (orphaning
/// the interior blocks including the kernel), or always falls through.
void PeelingModuloScheduleExpander::fixupBranches() {
  // Work outwards from the kernel.
  bool KernelDisposed = false;
  int TC = Schedule.getNumStages() - 1;
  for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend();
       ++PI, ++EI, --TC) {
    MachineBasicBlock *Prolog = *PI;
    MachineBasicBlock *Fallthrough = *Prolog->succ_begin();
    MachineBasicBlock *Epilog = *EI;
    SmallVector<MachineOperand, 4> Cond;
    TII->removeBranch(*Prolog);
    std::optional<bool> StaticallyGreater =
        LoopInfo->createTripCountGreaterCondition(TC, *Prolog, Cond);
    if (!StaticallyGreater) {
      LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
      // Dynamically branch based on Cond.
      TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc());
    } else if (*StaticallyGreater == false) {
      LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n");
      // Prolog never falls through; branch to epilog and orphan interior
      // blocks. Leave it to unreachable-block-elim to clean up.
      Prolog->removeSuccessor(Fallthrough);
      // Drop the (value, block) incoming pair for the removed edge.
      for (MachineInstr &P : Fallthrough->phis()) {
        P.removeOperand(2);
        P.removeOperand(1);
      }
      TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc());
      KernelDisposed = true;
    } else {
      LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n");
      // Prolog always falls through; remove incoming values in epilog.
      Prolog->removeSuccessor(Epilog);
      for (MachineInstr &P : Epilog->phis()) {
        P.removeOperand(4);
        P.removeOperand(3);
      }
    }
  }

  if (!KernelDisposed) {
    // The kernel still executes: account for the peeled iterations and make
    // the innermost prolog the new preheader.
    LoopInfo->adjustTripCount(-(Schedule.getNumStages() - 1));
    LoopInfo->setPreheader(Prologs.back());
  } else {
    LoopInfo->disposed();
  }
}
2022 
2023 void PeelingModuloScheduleExpander::rewriteKernel() {
2024   KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
2025   KR.rewrite();
2026 }
2027 
2028 void PeelingModuloScheduleExpander::expand() {
2029   BB = Schedule.getLoop()->getTopBlock();
2030   Preheader = Schedule.getLoop()->getLoopPreheader();
2031   LLVM_DEBUG(Schedule.dump());
2032   LoopInfo = TII->analyzeLoopForPipelining(BB);
2033   assert(LoopInfo);
2034 
2035   rewriteKernel();
2036   peelPrologAndEpilogs();
2037   fixupBranches();
2038 }
2039 
/// Cross-check the peeling expansion against the reference
/// ModuloScheduleExpander: run both on the same schedule and compare the
/// resulting kernels operand-by-operand, reporting a fatal error on any
/// mismatch.
void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
  BB = Schedule.getLoop()->getTopBlock();
  Preheader = Schedule.getLoop()->getLoopPreheader();

  // Dump the schedule before we invalidate and remap all its instructions.
  // Stash it in a string so we can print it if we found an error.
  std::string ScheduleDump;
  raw_string_ostream OS(ScheduleDump);
  Schedule.print(OS);
  OS.flush();

  // First, run the normal ModuleScheduleExpander. We don't support any
  // InstrChanges.
  assert(LIS && "Requires LiveIntervals!");
  ModuloScheduleExpander MSE(MF, Schedule, *LIS,
                             ModuloScheduleExpander::InstrChangesTy());
  MSE.expand();
  MachineBasicBlock *ExpandedKernel = MSE.getRewrittenKernel();
  if (!ExpandedKernel) {
    // The expander optimized away the kernel. We can't do any useful checking.
    MSE.cleanup();
    return;
  }
  // Before running the KernelRewriter, re-add BB into the CFG.
  Preheader->addSuccessor(BB);

  // Now run the new expansion algorithm.
  KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
  KR.rewrite();
  peelPrologAndEpilogs();

  // Collect all illegal phis that the new algorithm created. We'll give these
  // to KernelOperandInfo.
  SmallPtrSet<MachineInstr *, 4> IllegalPhis;
  for (auto NI = BB->getFirstNonPHI(); NI != BB->end(); ++NI) {
    if (NI->isPHI())
      IllegalPhis.insert(&*NI);
  }

  // Co-iterate across both kernels. We expect them to be identical apart from
  // phis and full COPYs (we look through both).
  SmallVector<std::pair<KernelOperandInfo, KernelOperandInfo>, 8> KOIs;
  auto OI = ExpandedKernel->begin();
  auto NI = BB->begin();
  for (; !OI->isTerminator() && !NI->isTerminator(); ++OI, ++NI) {
    while (OI->isPHI() || OI->isFullCopy())
      ++OI;
    while (NI->isPHI() || NI->isFullCopy())
      ++NI;
    assert(OI->getOpcode() == NI->getOpcode() && "Opcodes don't match?!");
    // Analyze every operand separately.
    for (auto OOpI = OI->operands_begin(), NOpI = NI->operands_begin();
         OOpI != OI->operands_end(); ++OOpI, ++NOpI)
      KOIs.emplace_back(KernelOperandInfo(&*OOpI, MRI, IllegalPhis),
                        KernelOperandInfo(&*NOpI, MRI, IllegalPhis));
  }

  // Report every operand pair whose loop distances disagree.
  bool Failed = false;
  for (auto &OldAndNew : KOIs) {
    if (OldAndNew.first == OldAndNew.second)
      continue;
    Failed = true;
    errs() << "Modulo kernel validation error: [\n";
    errs() << " [golden] ";
    OldAndNew.first.print(errs());
    errs() << "          ";
    OldAndNew.second.print(errs());
    errs() << "]\n";
  }

  if (Failed) {
    errs() << "Golden reference kernel:\n";
    ExpandedKernel->print(errs());
    errs() << "New kernel:\n";
    BB->print(errs());
    errs() << ScheduleDump;
    report_fatal_error(
        "Modulo kernel validation (-pipeliner-experimental-cg) failed");
  }

  // Cleanup by removing BB from the CFG again as the original
  // ModuloScheduleExpander intended.
  Preheader->removeSuccessor(BB);
  MSE.cleanup();
}
2125 
2126 MachineInstr *ModuloScheduleExpanderMVE::cloneInstr(MachineInstr *OldMI) {
2127   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
2128 
2129   // TODO: Offset information needs to be corrected.
2130   NewMI->dropMemRefs(MF);
2131 
2132   return NewMI;
2133 }
2134 
2135 /// Create a dedicated exit for Loop. Exit is the original exit for Loop.
2136 /// If it is already dedicated exit, return it. Otherwise, insert a new
2137 /// block between them and return the new block.
2138 static MachineBasicBlock *createDedicatedExit(MachineBasicBlock *Loop,
2139                                               MachineBasicBlock *Exit,
2140                                               LiveIntervals &LIS) {
2141   if (Exit->pred_size() == 1)
2142     return Exit;
2143 
2144   MachineFunction *MF = Loop->getParent();
2145   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
2146 
2147   MachineBasicBlock *NewExit =
2148       MF->CreateMachineBasicBlock(Loop->getBasicBlock());
2149   MF->insert(Loop->getIterator(), NewExit);
2150   LIS.insertMBBInMaps(NewExit);
2151 
2152   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2153   SmallVector<MachineOperand, 4> Cond;
2154   TII->analyzeBranch(*Loop, TBB, FBB, Cond);
2155   if (TBB == Loop)
2156     FBB = NewExit;
2157   else if (FBB == Loop)
2158     TBB = NewExit;
2159   else
2160     llvm_unreachable("unexpected loop structure");
2161   TII->removeBranch(*Loop);
2162   TII->insertBranch(*Loop, TBB, FBB, Cond, DebugLoc());
2163   Loop->replaceSuccessor(Exit, NewExit);
2164   TII->insertUnconditionalBranch(*NewExit, Exit, DebugLoc());
2165   NewExit->addSuccessor(Exit);
2166 
2167   Exit->replacePhiUsesWith(Loop, NewExit);
2168 
2169   return NewExit;
2170 }
2171 
2172 /// Insert branch code into the end of MBB. It branches to GreaterThan if the
2173 /// remaining trip count for instructions in LastStage0Insts is greater than
2174 /// RequiredTC, and to Otherwise otherwise.
2175 void ModuloScheduleExpanderMVE::insertCondBranch(MachineBasicBlock &MBB,
2176                                                  int RequiredTC,
2177                                                  InstrMapTy &LastStage0Insts,
2178                                                  MachineBasicBlock &GreaterThan,
2179                                                  MachineBasicBlock &Otherwise) {
2180   SmallVector<MachineOperand, 4> Cond;
2181   LoopInfo->createRemainingIterationsGreaterCondition(RequiredTC, MBB, Cond,
2182                                                       LastStage0Insts);
2183 
2184   if (SwapBranchTargetsMVE) {
2185     // Set SwapBranchTargetsMVE to true if a target prefers to replace TBB and
2186     // FBB for optimal performance.
2187     if (TII->reverseBranchCondition(Cond))
2188       llvm_unreachable("can not reverse branch condition");
2189     TII->insertBranch(MBB, &Otherwise, &GreaterThan, Cond, DebugLoc());
2190   } else {
2191     TII->insertBranch(MBB, &GreaterThan, &Otherwise, Cond, DebugLoc());
2192   }
2193 }
2194 
/// Generate a pipelined loop that is unrolled by using MVE algorithm and any
/// other necessary blocks. The control flow is modified to execute the
/// pipelined loop if the trip count satisfies the condition, otherwise the
/// original loop. The original loop is also used to execute the remainder
/// iterations which occur due to unrolling.
void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
  // The control flow for pipelining with MVE:
  //
  // OrigPreheader:
  //   // The block that is originally the loop preheader
  //   goto Check
  //
  // Check:
  //   // Check whether the trip count satisfies the requirements to pipeline.
  //   if (LoopCounter > NumStages + NumUnroll - 2)
  //     // The minimum number of iterations to pipeline =
  //     //   iterations executed in prolog/epilog (NumStages-1) +
  //     //   iterations executed in one kernel run (NumUnroll)
  //     goto Prolog
  //   // fallback to the original loop
  //   goto NewPreheader
  //
  // Prolog:
  //   // All prolog stages. There are no direct branches to the epilogue.
  //   goto NewKernel
  //
  // NewKernel:
  //   // NumUnroll copies of the kernel
  //   if (LoopCounter > MVE-1)
  //     goto NewKernel
  //   goto Epilog
  //
  // Epilog:
  //   // All epilog stages.
  //   if (LoopCounter > 0)
  //     // The remainder is executed in the original loop
  //     goto NewPreheader
  //   goto NewExit
  //
  // NewPreheader:
  //   // Newly created preheader for the original loop.
  //   // The initial values of the phis in the loop are merged from two paths.
  //   NewInitVal = Phi OrigInitVal, Check, PipelineLastVal, Epilog
  //   goto OrigKernel
  //
  // OrigKernel:
  //   // The original loop block.
  //   if (LoopCounter != 0)
  //     goto OrigKernel
  //   goto NewExit
  //
  // NewExit:
  //   // Newly created dedicated exit for the original loop.
  //   // Merge values which are referenced after the loop
  //   Merged = Phi OrigVal, OrigKernel, PipelineVal, Epilog
  //   goto OrigExit
  //
  // OrigExit:
  //   // The block that is originally the loop exit.
  //   // If it is already a dedicated exit, NewExit is not created.

  // An example of where each stage is executed:
  // Assume #Stages 3, #MVE 4, #Iterations 12
  // Iter   0 1 2 3 4 5 6 7 8 9 10-11
  // -------------------------------------------------
  // Stage  0                          Prolog#0
  // Stage  1 0                        Prolog#1
  // Stage  2 1 0                      Kernel Unroll#0 Iter#0
  // Stage    2 1 0                    Kernel Unroll#1 Iter#0
  // Stage      2 1 0                  Kernel Unroll#2 Iter#0
  // Stage        2 1 0                Kernel Unroll#3 Iter#0
  // Stage          2 1 0              Kernel Unroll#0 Iter#1
  // Stage            2 1 0            Kernel Unroll#1 Iter#1
  // Stage              2 1 0          Kernel Unroll#2 Iter#1
  // Stage                2 1 0        Kernel Unroll#3 Iter#1
  // Stage                  2 1        Epilog#0
  // Stage                    2        Epilog#1
  // Stage                      0-2    OrigKernel

  LoopInfo = TII->analyzeLoopForPipelining(OrigKernel);
  assert(LoopInfo && "Must be able to analyze loop!");

  // Determine how many kernel copies are needed; sets NumUnroll.
  calcNumUnroll();

  // Create the new blocks of the diagram above and place them (in order)
  // immediately before OrigKernel, registering each with LiveIntervals.
  Check = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
  Prolog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
  NewKernel = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
  Epilog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
  NewPreheader = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());

  MF.insert(OrigKernel->getIterator(), Check);
  LIS.insertMBBInMaps(Check);
  MF.insert(OrigKernel->getIterator(), Prolog);
  LIS.insertMBBInMaps(Prolog);
  MF.insert(OrigKernel->getIterator(), NewKernel);
  LIS.insertMBBInMaps(NewKernel);
  MF.insert(OrigKernel->getIterator(), Epilog);
  LIS.insertMBBInMaps(Epilog);
  MF.insert(OrigKernel->getIterator(), NewPreheader);
  LIS.insertMBBInMaps(NewPreheader);

  // Ensure the original exit has OrigKernel as its single predecessor so
  // merge phis can be placed there.
  NewExit = createDedicatedExit(OrigKernel, OrigExit, LIS);

  // NewPreheader takes over OrigPreheader's role as the block entering
  // OrigKernel; OrigPreheader now leads to Check instead.
  NewPreheader->transferSuccessorsAndUpdatePHIs(OrigPreheader);
  TII->insertUnconditionalBranch(*NewPreheader, OrigKernel, DebugLoc());

  OrigPreheader->addSuccessor(Check);
  TII->removeBranch(*OrigPreheader);
  TII->insertUnconditionalBranch(*OrigPreheader, Check, DebugLoc());

  // Wire up the CFG edges sketched in the diagram above.
  Check->addSuccessor(Prolog);
  Check->addSuccessor(NewPreheader);

  Prolog->addSuccessor(NewKernel);

  NewKernel->addSuccessor(NewKernel);
  NewKernel->addSuccessor(Epilog);

  Epilog->addSuccessor(NewPreheader);
  Epilog->addSuccessor(NewExit);

  // Guard: enter the pipelined version only when the remaining trip count is
  // greater than NumStages + NumUnroll - 2 (see the Check comment above).
  InstrMapTy LastStage0Insts;
  insertCondBranch(*Check, Schedule.getNumStages() + NumUnroll - 2,
                   LastStage0Insts, *Prolog, *NewPreheader);

  // VRMaps map (prolog/kernel/epilog phase#, original register#) to new
  // register#
  SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap;
  generateProlog(PrologVRMap);
  generateKernel(PrologVRMap, KernelVRMap, LastStage0Insts);
  generateEpilog(KernelVRMap, EpilogVRMap, LastStage0Insts);
}
2327 
/// Replace MI's use operands according to the maps.
void ModuloScheduleExpanderMVE::updateInstrUse(
    MachineInstr *MI, int StageNum, int PhaseNum,
    SmallVectorImpl<ValueMapTy> &CurVRMap,
    SmallVectorImpl<ValueMapTy> *PrevVRMap) {
  // If MI is in the prolog/kernel/epilog block, CurVRMap is
  // PrologVRMap/KernelVRMap/EpilogVRMap respectively.
  // PrevVRMap is nullptr/PhiVRMap/KernelVRMap respectively.
  // Refer to the appropriate map according to the stage difference between
  // MI and the definition of an operand.

  for (MachineOperand &UseMO : MI->uses()) {
    if (!UseMO.isReg() || !UseMO.getReg().isVirtual())
      continue;
    // DiffStage counts how many phases earlier the used value was defined.
    int DiffStage = 0;
    Register OrigReg = UseMO.getReg();
    MachineInstr *DefInst = MRI.getVRegDef(OrigReg);
    // Only values defined inside the original kernel have been renamed.
    if (!DefInst || DefInst->getParent() != OrigKernel)
      continue;
    Register InitReg;
    Register DefReg = OrigReg;
    if (DefInst->isPHI()) {
      // Using a phi result means using the loop-carried value produced one
      // iteration earlier, hence one extra phase of distance.
      ++DiffStage;
      Register LoopReg;
      getPhiRegs(*DefInst, OrigKernel, InitReg, LoopReg);
      // LoopReg is guaranteed to be defined within the loop by canApply()
      DefReg = LoopReg;
      DefInst = MRI.getVRegDef(LoopReg);
    }
    unsigned DefStageNum = Schedule.getStage(DefInst);
    DiffStage += StageNum - DefStageNum;
    Register NewReg;
    if (PhaseNum >= DiffStage && CurVRMap[PhaseNum - DiffStage].count(DefReg))
      // NewReg is defined in a previous phase of the same block
      NewReg = CurVRMap[PhaseNum - DiffStage][DefReg];
    else if (!PrevVRMap)
      // Since this is the first iteration, refer the initial register of the
      // loop
      NewReg = InitReg;
    else
      // Cases where DiffStage is larger than PhaseNum.
      // If MI is in the kernel block, the value is defined by the previous
      // iteration and PhiVRMap is referenced. If MI is in the epilog block, the
      // value is defined in the kernel block and KernelVRMap is referenced.
      NewReg = (*PrevVRMap)[PrevVRMap->size() - (DiffStage - PhaseNum)][DefReg];

    // Try to constrain the new register to the class the use requires; if
    // that is impossible, bridge the classes with a COPY.
    const TargetRegisterClass *NRC =
        MRI.constrainRegClass(NewReg, MRI.getRegClass(OrigReg));
    if (NRC)
      UseMO.setReg(NewReg);
    else {
      // NOTE(review): MI may live in Prolog/NewKernel/Epilog, yet the COPY is
      // built via *OrigKernel — presumably the insertion point MI determines
      // the actual block; verify.
      Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
      MachineInstr *NewCopy = BuildMI(*OrigKernel, MI, MI->getDebugLoc(),
                                      TII->get(TargetOpcode::COPY), SplitReg)
                                  .addReg(NewReg);
      LIS.InsertMachineInstrInMaps(*NewCopy);
      UseMO.setReg(SplitReg);
    }
  }
}
2388 
2389 /// Return a phi if Reg is referenced by the phi.
2390 /// canApply() guarantees that at most only one such phi exists.
2391 static MachineInstr *getLoopPhiUser(Register Reg, MachineBasicBlock *Loop) {
2392   for (MachineInstr &Phi : Loop->phis()) {
2393     Register InitVal, LoopVal;
2394     getPhiRegs(Phi, Loop, InitVal, LoopVal);
2395     if (LoopVal == Reg)
2396       return &Phi;
2397   }
2398   return nullptr;
2399 }
2400 
/// Generate phis for registers defined by OrigMI.
void ModuloScheduleExpanderMVE::generatePhi(
    MachineInstr *OrigMI, int UnrollNum,
    SmallVectorImpl<ValueMapTy> &PrologVRMap,
    SmallVectorImpl<ValueMapTy> &KernelVRMap,
    SmallVectorImpl<ValueMapTy> &PhiVRMap) {
  int StageNum = Schedule.getStage(OrigMI);
  // Decide what the kernel copy's value is merged with on first entry from
  // the prolog: a prolog copy of the same stage (UsePrologReg), the loop's
  // initial value, or nothing (no phi needed) — see the tables below.
  bool UsePrologReg;
  if (Schedule.getNumStages() - NumUnroll + UnrollNum - 1 >= StageNum)
    UsePrologReg = true;
  else if (Schedule.getNumStages() - NumUnroll + UnrollNum == StageNum)
    UsePrologReg = false;
  else
    return;

  // Examples that show which stages are merged by phi.
  // Meaning of the symbol following the stage number:
  //   a/b: Stages with the same letter are merged (UsePrologReg == true)
  //   +: Merged with the initial value (UsePrologReg == false)
  //   *: No phis required
  //
  // #Stages 3, #MVE 4
  // Iter   0 1 2 3 4 5 6 7 8
  // -----------------------------------------
  // Stage  0a                 Prolog#0
  // Stage  1a 0b              Prolog#1
  // Stage  2* 1* 0*           Kernel Unroll#0
  // Stage     2* 1* 0+        Kernel Unroll#1
  // Stage        2* 1+ 0a     Kernel Unroll#2
  // Stage           2+ 1a 0b  Kernel Unroll#3
  //
  // #Stages 3, #MVE 2
  // Iter   0 1 2 3 4 5 6 7 8
  // -----------------------------------------
  // Stage  0a                 Prolog#0
  // Stage  1a 0b              Prolog#1
  // Stage  2* 1+ 0a           Kernel Unroll#0
  // Stage     2+ 1a 0b        Kernel Unroll#1
  //
  // #Stages 3, #MVE 1
  // Iter   0 1 2 3 4 5 6 7 8
  // -----------------------------------------
  // Stage  0*                 Prolog#0
  // Stage  1a 0b              Prolog#1
  // Stage  2+ 1a 0b           Kernel Unroll#0

  for (MachineOperand &DefMO : OrigMI->defs()) {
    if (!DefMO.isReg() || DefMO.isDead())
      continue;
    Register OrigReg = DefMO.getReg();
    // Only defs that were renamed for this kernel copy get a phi.
    auto NewReg = KernelVRMap[UnrollNum].find(OrigReg);
    if (NewReg == KernelVRMap[UnrollNum].end())
      continue;
    Register CorrespondReg;
    if (UsePrologReg) {
      // Merge with the renamed def from the matching prolog copy.
      int PrologNum = Schedule.getNumStages() - NumUnroll + UnrollNum - 1;
      CorrespondReg = PrologVRMap[PrologNum][OrigReg];
    } else {
      // Merge with the initial value of the phi that consumes OrigReg.
      MachineInstr *Phi = getLoopPhiUser(OrigReg, OrigKernel);
      if (!Phi)
        continue;
      CorrespondReg = getInitPhiReg(*Phi, OrigKernel);
    }

    assert(CorrespondReg.isValid());
    // Build: PhiReg = PHI NewReg, NewKernel(back-edge), CorrespondReg, Prolog.
    Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
    MachineInstr *NewPhi =
        BuildMI(*NewKernel, NewKernel->getFirstNonPHI(), DebugLoc(),
                TII->get(TargetOpcode::PHI), PhiReg)
            .addReg(NewReg->second)
            .addMBB(NewKernel)
            .addReg(CorrespondReg)
            .addMBB(Prolog);
    LIS.InsertMachineInstrInMaps(*NewPhi);
    PhiVRMap[UnrollNum][OrigReg] = PhiReg;
  }
}
2478 
2479 static void replacePhiSrc(MachineInstr &Phi, Register OrigReg, Register NewReg,
2480                           MachineBasicBlock *NewMBB) {
2481   for (unsigned Idx = 1; Idx < Phi.getNumOperands(); Idx += 2) {
2482     if (Phi.getOperand(Idx).getReg() == OrigReg) {
2483       Phi.getOperand(Idx).setReg(NewReg);
2484       Phi.getOperand(Idx + 1).setMBB(NewMBB);
2485       return;
2486     }
2487   }
2488 }
2489 
2490 /// Generate phis that merge values from multiple routes
2491 void ModuloScheduleExpanderMVE::mergeRegUsesAfterPipeline(Register OrigReg,
2492                                                           Register NewReg) {
2493   SmallVector<MachineOperand *> UsesAfterLoop;
2494   SmallVector<MachineInstr *> LoopPhis;
2495   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(OrigReg),
2496                                          E = MRI.use_end();
2497        I != E; ++I) {
2498     MachineOperand &O = *I;
2499     if (O.getParent()->getParent() != OrigKernel &&
2500         O.getParent()->getParent() != Prolog &&
2501         O.getParent()->getParent() != NewKernel &&
2502         O.getParent()->getParent() != Epilog)
2503       UsesAfterLoop.push_back(&O);
2504     if (O.getParent()->getParent() == OrigKernel && O.getParent()->isPHI())
2505       LoopPhis.push_back(O.getParent());
2506   }
2507 
2508   // Merge the route that only execute the pipelined loop (when there are no
2509   // remaining iterations) with the route that execute the original loop.
2510   if (!UsesAfterLoop.empty()) {
2511     Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
2512     MachineInstr *NewPhi =
2513         BuildMI(*NewExit, NewExit->getFirstNonPHI(), DebugLoc(),
2514                 TII->get(TargetOpcode::PHI), PhiReg)
2515             .addReg(OrigReg)
2516             .addMBB(OrigKernel)
2517             .addReg(NewReg)
2518             .addMBB(Epilog);
2519     LIS.InsertMachineInstrInMaps(*NewPhi);
2520 
2521     for (MachineOperand *MO : UsesAfterLoop)
2522       MO->setReg(PhiReg);
2523 
2524     // The interval of OrigReg is invalid and should be recalculated when
2525     // LiveInterval::getInterval() is called.
2526     if (LIS.hasInterval(OrigReg))
2527       LIS.removeInterval(OrigReg);
2528   }
2529 
2530   // Merge routes from the pipelined loop and the bypassed route before the
2531   // original loop
2532   if (!LoopPhis.empty()) {
2533     for (MachineInstr *Phi : LoopPhis) {
2534       Register InitReg, LoopReg;
2535       getPhiRegs(*Phi, OrigKernel, InitReg, LoopReg);
2536       Register NewInit = MRI.createVirtualRegister(MRI.getRegClass(InitReg));
2537       MachineInstr *NewPhi =
2538           BuildMI(*NewPreheader, NewPreheader->getFirstNonPHI(),
2539                   Phi->getDebugLoc(), TII->get(TargetOpcode::PHI), NewInit)
2540               .addReg(InitReg)
2541               .addMBB(Check)
2542               .addReg(NewReg)
2543               .addMBB(Epilog);
2544       LIS.InsertMachineInstrInMaps(*NewPhi);
2545       replacePhiSrc(*Phi, InitReg, NewInit, NewPreheader);
2546     }
2547   }
2548 }
2549 
2550 void ModuloScheduleExpanderMVE::generateProlog(
2551     SmallVectorImpl<ValueMapTy> &PrologVRMap) {
2552   PrologVRMap.clear();
2553   PrologVRMap.resize(Schedule.getNumStages() - 1);
2554   DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2555   for (int PrologNum = 0; PrologNum < Schedule.getNumStages() - 1;
2556        ++PrologNum) {
2557     for (MachineInstr *MI : Schedule.getInstructions()) {
2558       if (MI->isPHI())
2559         continue;
2560       int StageNum = Schedule.getStage(MI);
2561       if (StageNum > PrologNum)
2562         continue;
2563       MachineInstr *NewMI = cloneInstr(MI);
2564       updateInstrDef(NewMI, PrologVRMap[PrologNum], false);
2565       NewMIMap[NewMI] = {PrologNum, StageNum};
2566       Prolog->push_back(NewMI);
2567       LIS.InsertMachineInstrInMaps(*NewMI);
2568     }
2569   }
2570 
2571   for (auto I : NewMIMap) {
2572     MachineInstr *MI = I.first;
2573     int PrologNum = I.second.first;
2574     int StageNum = I.second.second;
2575     updateInstrUse(MI, StageNum, PrologNum, PrologVRMap, nullptr);
2576   }
2577 
2578   LLVM_DEBUG({
2579     dbgs() << "prolog:\n";
2580     Prolog->dump();
2581   });
2582 }
2583 
2584 void ModuloScheduleExpanderMVE::generateKernel(
2585     SmallVectorImpl<ValueMapTy> &PrologVRMap,
2586     SmallVectorImpl<ValueMapTy> &KernelVRMap, InstrMapTy &LastStage0Insts) {
2587   KernelVRMap.clear();
2588   KernelVRMap.resize(NumUnroll);
2589   SmallVector<ValueMapTy> PhiVRMap;
2590   PhiVRMap.resize(NumUnroll);
2591   DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2592   for (int UnrollNum = 0; UnrollNum < NumUnroll; ++UnrollNum) {
2593     for (MachineInstr *MI : Schedule.getInstructions()) {
2594       if (MI->isPHI())
2595         continue;
2596       int StageNum = Schedule.getStage(MI);
2597       MachineInstr *NewMI = cloneInstr(MI);
2598       if (UnrollNum == NumUnroll - 1)
2599         LastStage0Insts[MI] = NewMI;
2600       updateInstrDef(NewMI, KernelVRMap[UnrollNum],
2601                      (UnrollNum == NumUnroll - 1 && StageNum == 0));
2602       generatePhi(MI, UnrollNum, PrologVRMap, KernelVRMap, PhiVRMap);
2603       NewMIMap[NewMI] = {UnrollNum, StageNum};
2604       NewKernel->push_back(NewMI);
2605       LIS.InsertMachineInstrInMaps(*NewMI);
2606     }
2607   }
2608 
2609   for (auto I : NewMIMap) {
2610     MachineInstr *MI = I.first;
2611     int UnrollNum = I.second.first;
2612     int StageNum = I.second.second;
2613     updateInstrUse(MI, StageNum, UnrollNum, KernelVRMap, &PhiVRMap);
2614   }
2615 
2616   // If remaining trip count is greater than NumUnroll-1, loop continues
2617   insertCondBranch(*NewKernel, NumUnroll - 1, LastStage0Insts, *NewKernel,
2618                    *Epilog);
2619 
2620   LLVM_DEBUG({
2621     dbgs() << "kernel:\n";
2622     NewKernel->dump();
2623   });
2624 }
2625 
2626 void ModuloScheduleExpanderMVE::generateEpilog(
2627     SmallVectorImpl<ValueMapTy> &KernelVRMap,
2628     SmallVectorImpl<ValueMapTy> &EpilogVRMap, InstrMapTy &LastStage0Insts) {
2629   EpilogVRMap.clear();
2630   EpilogVRMap.resize(Schedule.getNumStages() - 1);
2631   DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2632   for (int EpilogNum = 0; EpilogNum < Schedule.getNumStages() - 1;
2633        ++EpilogNum) {
2634     for (MachineInstr *MI : Schedule.getInstructions()) {
2635       if (MI->isPHI())
2636         continue;
2637       int StageNum = Schedule.getStage(MI);
2638       if (StageNum <= EpilogNum)
2639         continue;
2640       MachineInstr *NewMI = cloneInstr(MI);
2641       updateInstrDef(NewMI, EpilogVRMap[EpilogNum], StageNum - 1 == EpilogNum);
2642       NewMIMap[NewMI] = {EpilogNum, StageNum};
2643       Epilog->push_back(NewMI);
2644       LIS.InsertMachineInstrInMaps(*NewMI);
2645     }
2646   }
2647 
2648   for (auto I : NewMIMap) {
2649     MachineInstr *MI = I.first;
2650     int EpilogNum = I.second.first;
2651     int StageNum = I.second.second;
2652     updateInstrUse(MI, StageNum, EpilogNum, EpilogVRMap, &KernelVRMap);
2653   }
2654 
2655   // If there are remaining iterations, they are executed in the original loop.
2656   // Instructions related to loop control, such as loop counter comparison,
2657   // are indicated by shouldIgnoreForPipelining() and are assumed to be placed
2658   // in stage 0. Thus, the map is for the last one in the kernel.
2659   insertCondBranch(*Epilog, 0, LastStage0Insts, *NewPreheader, *NewExit);
2660 
2661   LLVM_DEBUG({
2662     dbgs() << "epilog:\n";
2663     Epilog->dump();
2664   });
2665 }
2666 
2667 /// Calculate the number of unroll required and set it to NumUnroll
2668 void ModuloScheduleExpanderMVE::calcNumUnroll() {
2669   DenseMap<MachineInstr *, unsigned> Inst2Idx;
2670   NumUnroll = 1;
2671   for (unsigned I = 0; I < Schedule.getInstructions().size(); ++I)
2672     Inst2Idx[Schedule.getInstructions()[I]] = I;
2673 
2674   for (MachineInstr *MI : Schedule.getInstructions()) {
2675     if (MI->isPHI())
2676       continue;
2677     int StageNum = Schedule.getStage(MI);
2678     for (const MachineOperand &MO : MI->uses()) {
2679       if (!MO.isReg() || !MO.getReg().isVirtual())
2680         continue;
2681       MachineInstr *DefMI = MRI.getVRegDef(MO.getReg());
2682       if (DefMI->getParent() != OrigKernel)
2683         continue;
2684 
2685       int NumUnrollLocal = 1;
2686       if (DefMI->isPHI()) {
2687         ++NumUnrollLocal;
2688         // canApply() guarantees that DefMI is not phi and is an instruction in
2689         // the loop
2690         DefMI = MRI.getVRegDef(getLoopPhiReg(*DefMI, OrigKernel));
2691       }
2692       NumUnrollLocal += StageNum - Schedule.getStage(DefMI);
2693       if (Inst2Idx[MI] <= Inst2Idx[DefMI])
2694         --NumUnrollLocal;
2695       NumUnroll = std::max(NumUnroll, NumUnrollLocal);
2696     }
2697   }
2698   LLVM_DEBUG(dbgs() << "NumUnroll: " << NumUnroll << "\n");
2699 }
2700 
2701 /// Create new virtual registers for definitions of NewMI and update NewMI.
2702 /// If the definitions are referenced after the pipelined loop, phis are
2703 /// created to merge with other routes.
2704 void ModuloScheduleExpanderMVE::updateInstrDef(MachineInstr *NewMI,
2705                                                ValueMapTy &VRMap,
2706                                                bool LastDef) {
2707   for (MachineOperand &MO : NewMI->all_defs()) {
2708     if (!MO.getReg().isVirtual())
2709       continue;
2710     Register Reg = MO.getReg();
2711     const TargetRegisterClass *RC = MRI.getRegClass(Reg);
2712     Register NewReg = MRI.createVirtualRegister(RC);
2713     MO.setReg(NewReg);
2714     VRMap[Reg] = NewReg;
2715     if (LastDef)
2716       mergeRegUsesAfterPipeline(Reg, NewReg);
2717   }
2718 }
2719 
2720 void ModuloScheduleExpanderMVE::expand() {
2721   OrigKernel = Schedule.getLoop()->getTopBlock();
2722   OrigPreheader = Schedule.getLoop()->getLoopPreheader();
2723   OrigExit = Schedule.getLoop()->getExitBlock();
2724 
2725   LLVM_DEBUG(Schedule.dump());
2726 
2727   generatePipelinedLoop();
2728 }
2729 
2730 /// Check if ModuloScheduleExpanderMVE can be applied to L
2731 bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
2732   if (!L.getExitBlock()) {
2733     LLVM_DEBUG(dbgs() << "Can not apply MVE expander: No single exit block.\n");
2734     return false;
2735   }
2736 
2737   MachineBasicBlock *BB = L.getTopBlock();
2738   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
2739 
2740   // Put some constraints on the operands of the phis to simplify the
2741   // transformation
2742   DenseSet<Register> UsedByPhi;
2743   for (MachineInstr &MI : BB->phis()) {
2744     // Registers defined by phis must be used only inside the loop and be never
2745     // used by phis.
2746     for (MachineOperand &MO : MI.defs())
2747       if (MO.isReg())
2748         for (MachineInstr &Ref : MRI.use_instructions(MO.getReg()))
2749           if (Ref.getParent() != BB || Ref.isPHI()) {
2750             LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A phi result is "
2751                                  "referenced outside of the loop or by phi.\n");
2752             return false;
2753           }
2754 
2755     // A source register from the loop block must be defined inside the loop.
2756     // A register defined inside the loop must be referenced by only one phi at
2757     // most.
2758     Register InitVal, LoopVal;
2759     getPhiRegs(MI, MI.getParent(), InitVal, LoopVal);
2760     if (!Register(LoopVal).isVirtual() ||
2761         MRI.getVRegDef(LoopVal)->getParent() != BB) {
2762       LLVM_DEBUG(
2763           dbgs() << "Can not apply MVE expander: A phi source value coming "
2764                     "from the loop is not defined in the loop.\n");
2765       return false;
2766     }
2767     if (UsedByPhi.count(LoopVal)) {
2768       LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A value defined in the "
2769                            "loop is referenced by two or more phis.\n");
2770       return false;
2771     }
2772     UsedByPhi.insert(LoopVal);
2773   }
2774 
2775   return true;
2776 }
2777 
2778 //===----------------------------------------------------------------------===//
2779 // ModuloScheduleTestPass implementation
2780 //===----------------------------------------------------------------------===//
2781 // This pass constructs a ModuloSchedule from its module and runs
2782 // ModuloScheduleExpander.
2783 //
2784 // The module is expected to contain a single-block analyzable loop.
2785 // The total order of instructions is taken from the loop as-is.
2786 // Instructions are expected to be annotated with a PostInstrSymbol.
2787 // This PostInstrSymbol must have the following format:
2788 //  "Stage=%d Cycle=%d".
2789 //===----------------------------------------------------------------------===//
2790 
namespace {
/// Test pass: reconstructs a ModuloSchedule from the "Stage=%d Cycle=%d"
/// post-instruction-symbol annotations on a single-block loop and runs
/// ModuloScheduleExpander on it (see the comment block above).
class ModuloScheduleTest : public MachineFunctionPass {
public:
  static char ID; // Pass identification.

  ModuloScheduleTest() : MachineFunctionPass(ID) {
    initializeModuloScheduleTestPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  // Expand the annotated schedule of the single-block loop L.
  void runOnLoop(MachineFunction &MF, MachineLoop &L);

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<LiveIntervalsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
} // namespace
2810 
char ModuloScheduleTest::ID = 0;

// Register the test pass and its analysis dependencies with the pass manager.
INITIALIZE_PASS_BEGIN(ModuloScheduleTest, "modulo-schedule-test",
                      "Modulo Schedule test pass", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(ModuloScheduleTest, "modulo-schedule-test",
                    "Modulo Schedule test pass", false, false)
2819 
2820 bool ModuloScheduleTest::runOnMachineFunction(MachineFunction &MF) {
2821   MachineLoopInfo &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2822   for (auto *L : MLI) {
2823     if (L->getTopBlock() != L->getBottomBlock())
2824       continue;
2825     runOnLoop(MF, *L);
2826     return false;
2827   }
2828   return false;
2829 }
2830 
2831 static void parseSymbolString(StringRef S, int &Cycle, int &Stage) {
2832   std::pair<StringRef, StringRef> StageAndCycle = getToken(S, "_");
2833   std::pair<StringRef, StringRef> StageTokenAndValue =
2834       getToken(StageAndCycle.first, "-");
2835   std::pair<StringRef, StringRef> CycleTokenAndValue =
2836       getToken(StageAndCycle.second, "-");
2837   if (StageTokenAndValue.first != "Stage" ||
2838       CycleTokenAndValue.first != "_Cycle") {
2839     llvm_unreachable(
2840         "Bad post-instr symbol syntax: see comment in ModuloScheduleTest");
2841     return;
2842   }
2843 
2844   StageTokenAndValue.second.drop_front().getAsInteger(10, Stage);
2845   CycleTokenAndValue.second.drop_front().getAsInteger(10, Cycle);
2846 
2847   dbgs() << "  Stage=" << Stage << ", Cycle=" << Cycle << "\n";
2848 }
2849 
2850 void ModuloScheduleTest::runOnLoop(MachineFunction &MF, MachineLoop &L) {
2851   LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
2852   MachineBasicBlock *BB = L.getTopBlock();
2853   dbgs() << "--- ModuloScheduleTest running on BB#" << BB->getNumber() << "\n";
2854 
2855   DenseMap<MachineInstr *, int> Cycle, Stage;
2856   std::vector<MachineInstr *> Instrs;
2857   for (MachineInstr &MI : *BB) {
2858     if (MI.isTerminator())
2859       continue;
2860     Instrs.push_back(&MI);
2861     if (MCSymbol *Sym = MI.getPostInstrSymbol()) {
2862       dbgs() << "Parsing post-instr symbol for " << MI;
2863       parseSymbolString(Sym->getName(), Cycle[&MI], Stage[&MI]);
2864     }
2865   }
2866 
2867   ModuloSchedule MS(MF, &L, std::move(Instrs), std::move(Cycle),
2868                     std::move(Stage));
2869   ModuloScheduleExpander MSE(
2870       MF, MS, LIS, /*InstrChanges=*/ModuloScheduleExpander::InstrChangesTy());
2871   MSE.expand();
2872   MSE.cleanup();
2873 }
2874 
2875 //===----------------------------------------------------------------------===//
2876 // ModuloScheduleTestAnnotater implementation
2877 //===----------------------------------------------------------------------===//
2878 
2879 void ModuloScheduleTestAnnotater::annotate() {
2880   for (MachineInstr *MI : S.getInstructions()) {
2881     SmallVector<char, 16> SV;
2882     raw_svector_ostream OS(SV);
2883     OS << "Stage-" << S.getStage(MI) << "_Cycle-" << S.getCycle(MI);
2884     MCSymbol *Sym = MF.getContext().getOrCreateSymbol(OS.str());
2885     MI->setPostInstrSymbol(MF, Sym);
2886   }
2887 }
2888