1  //===- ModuloSchedule.cpp - Software pipeline schedule expansion ----------===//
2  //
3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  // See https://llvm.org/LICENSE.txt for license information.
5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  //
7  //===----------------------------------------------------------------------===//
8  
9  #include "llvm/CodeGen/ModuloSchedule.h"
10  #include "llvm/ADT/StringExtras.h"
11  #include "llvm/Analysis/MemoryLocation.h"
12  #include "llvm/CodeGen/LiveIntervals.h"
13  #include "llvm/CodeGen/MachineInstrBuilder.h"
14  #include "llvm/CodeGen/MachineLoopInfo.h"
15  #include "llvm/CodeGen/MachineRegisterInfo.h"
16  #include "llvm/InitializePasses.h"
17  #include "llvm/MC/MCContext.h"
18  #include "llvm/Support/Debug.h"
19  #include "llvm/Support/ErrorHandling.h"
20  #include "llvm/Support/raw_ostream.h"
21  
22  #define DEBUG_TYPE "pipeliner"
23  using namespace llvm;
24  
25  static cl::opt<bool> SwapBranchTargetsMVE(
26      "pipeliner-swap-branch-targets-mve", cl::Hidden, cl::init(false),
27      cl::desc("Swap target blocks of a conditional branch for MVE expander"));
28  
29  void ModuloSchedule::print(raw_ostream &OS) {
30    for (MachineInstr *MI : ScheduledInstrs)
31      OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI;
32  }
33  
34  //===----------------------------------------------------------------------===//
35  // ModuloScheduleExpander implementation
36  //===----------------------------------------------------------------------===//
37  
38  /// Return the register values for the operands of a Phi instruction.
39  /// This function assumes the instruction is a Phi.
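/// For illustration only (hypothetical virtual register names), given a Phi
/// in the loop block %loop whose other predecessor is %entry:
///
///   %v = PHI %init, %entry, %next, %loop
///
/// this returns InitVal = %init and LoopVal = %next.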
40  static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
41                         unsigned &InitVal, unsigned &LoopVal) {
42    assert(Phi.isPHI() && "Expecting a Phi.");
43  
44    InitVal = 0;
45    LoopVal = 0;
46    for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
47      if (Phi.getOperand(i + 1).getMBB() != Loop)
48        InitVal = Phi.getOperand(i).getReg();
49      else
50        LoopVal = Phi.getOperand(i).getReg();
51  
52    assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
53  }
54  
55  /// Return the Phi register value that comes from the incoming block.
56  static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
57    for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
58      if (Phi.getOperand(i + 1).getMBB() != LoopBB)
59        return Phi.getOperand(i).getReg();
60    return 0;
61  }
62  
63  /// Return the Phi register value that comes from the loop block.
64  static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
65    for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
66      if (Phi.getOperand(i + 1).getMBB() == LoopBB)
67        return Phi.getOperand(i).getReg();
68    return 0;
69  }
70  
71  void ModuloScheduleExpander::expand() {
72    BB = Schedule.getLoop()->getTopBlock();
73    Preheader = *BB->pred_begin();
74    if (Preheader == BB)
75      Preheader = *std::next(BB->pred_begin());
76  
77    // Iterate over the definitions in each instruction, and compute the
78    // stage difference for each use.  Keep the maximum value.
79    for (MachineInstr *MI : Schedule.getInstructions()) {
80      int DefStage = Schedule.getStage(MI);
81      for (const MachineOperand &Op : MI->all_defs()) {
82        Register Reg = Op.getReg();
83        unsigned MaxDiff = 0;
84        bool PhiIsSwapped = false;
85        for (MachineOperand &UseOp : MRI.use_operands(Reg)) {
86          MachineInstr *UseMI = UseOp.getParent();
87          int UseStage = Schedule.getStage(UseMI);
88          unsigned Diff = 0;
89          if (UseStage != -1 && UseStage >= DefStage)
90            Diff = UseStage - DefStage;
91          if (MI->isPHI()) {
92            if (isLoopCarried(*MI))
93              ++Diff;
94            else
95              PhiIsSwapped = true;
96          }
97          MaxDiff = std::max(Diff, MaxDiff);
98        }
99        RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
100      }
101    }
102  
103    generatePipelinedLoop();
104  }
105  
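/// Generate the prolog blocks, the kernel block, and the epilog blocks for
/// the pipelined loop. As an illustrative sketch only, for a 3-stage
/// schedule (stages S0..S2) the expanded code is laid out roughly as:
///
///   Prolog0: S0 of the first iteration
///   Prolog1: S1 of the first iteration, S0 of the second
///   Kernel:  S2, S1 and S0 of three consecutive iterations, repeated
///   Epilogs: the remaining stages of the iterations still in flight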
106  void ModuloScheduleExpander::generatePipelinedLoop() {
107    LoopInfo = TII->analyzeLoopForPipelining(BB);
108    assert(LoopInfo && "Must be able to analyze loop!");
109  
110    // Create a new basic block for the kernel and add it to the CFG.
111    MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
112  
113    unsigned MaxStageCount = Schedule.getNumStages() - 1;
114  
115    // Remember the registers that are used in different stages. The index is
116    // the iteration, or stage, that the instruction is scheduled in.  This is
117    // a map between register names in the original block and the names created
118    // in each stage of the pipelined loop.
119    ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
120  
121  // The renaming destinations created by Phis for registers that cross
122  // stages. This map is updated during Phi generation to point to the most
123  // recent renaming destination.
124    ValueMapTy *VRMapPhi = new ValueMapTy[(MaxStageCount + 1) * 2];
125  
126    InstrMapTy InstrMap;
127  
128    SmallVector<MachineBasicBlock *, 4> PrologBBs;
129  
130    // Generate the prolog instructions that set up the pipeline.
131    generateProlog(MaxStageCount, KernelBB, VRMap, PrologBBs);
132    MF.insert(BB->getIterator(), KernelBB);
133    LIS.insertMBBInMaps(KernelBB);
134  
135    // Rearrange the instructions to generate the new, pipelined loop,
136    // and update register names as needed.
137    for (MachineInstr *CI : Schedule.getInstructions()) {
138      if (CI->isPHI())
139        continue;
140      unsigned StageNum = Schedule.getStage(CI);
141      MachineInstr *NewMI = cloneInstr(CI, MaxStageCount, StageNum);
142      updateInstruction(NewMI, false, MaxStageCount, StageNum, VRMap);
143      KernelBB->push_back(NewMI);
144      InstrMap[NewMI] = CI;
145    }
146  
147    // Copy any terminator instructions to the new kernel, and update
148    // names as needed.
149    for (MachineInstr &MI : BB->terminators()) {
150      MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
151      updateInstruction(NewMI, false, MaxStageCount, 0, VRMap);
152      KernelBB->push_back(NewMI);
153      InstrMap[NewMI] = &MI;
154    }
155  
156    NewKernel = KernelBB;
157    KernelBB->transferSuccessors(BB);
158    KernelBB->replaceSuccessor(BB, KernelBB);
159  
160    generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap,
161                         InstrMap, MaxStageCount, MaxStageCount, false);
162    generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap, VRMapPhi,
163                 InstrMap, MaxStageCount, MaxStageCount, false);
164  
165    LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
166  
167    SmallVector<MachineBasicBlock *, 4> EpilogBBs;
168    // Generate the epilog instructions to complete the pipeline.
169    generateEpilog(MaxStageCount, KernelBB, BB, VRMap, VRMapPhi, EpilogBBs,
170                   PrologBBs);
171  
172  // We need this step because the register allocator doesn't handle some
173  // situations well, so we insert copies to help out.
174    splitLifetimes(KernelBB, EpilogBBs);
175  
176    // Remove dead instructions due to loop induction variables.
177    removeDeadInstructions(KernelBB, EpilogBBs);
178  
179    // Add branches between prolog and epilog blocks.
180    addBranches(*Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap);
181  
182    delete[] VRMap;
183    delete[] VRMapPhi;
184  }
185  
186  void ModuloScheduleExpander::cleanup() {
187    // Remove the original loop since it's no longer referenced.
188    for (auto &I : *BB)
189      LIS.RemoveMachineInstrFromMaps(I);
190    BB->clear();
191    BB->eraseFromParent();
192  }
193  
194  /// Generate the pipeline prolog code.
195  void ModuloScheduleExpander::generateProlog(unsigned LastStage,
196                                              MachineBasicBlock *KernelBB,
197                                              ValueMapTy *VRMap,
198                                              MBBVectorTy &PrologBBs) {
199    MachineBasicBlock *PredBB = Preheader;
200    InstrMapTy InstrMap;
201  
202    // Generate a basic block for each stage, not including the last stage,
203    // which will be generated in the kernel. Each basic block may contain
204    // instructions from multiple stages/iterations.
205    for (unsigned i = 0; i < LastStage; ++i) {
206      // Create and insert the prolog basic block prior to the original loop
207      // basic block.  The original loop is removed later.
208      MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
209      PrologBBs.push_back(NewBB);
210      MF.insert(BB->getIterator(), NewBB);
211      NewBB->transferSuccessors(PredBB);
212      PredBB->addSuccessor(NewBB);
213      PredBB = NewBB;
214      LIS.insertMBBInMaps(NewBB);
215  
216      // Generate instructions for each appropriate stage. Process instructions
217      // in original program order.
218      for (int StageNum = i; StageNum >= 0; --StageNum) {
219        for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
220                                         BBE = BB->getFirstTerminator();
221             BBI != BBE; ++BBI) {
222          if (Schedule.getStage(&*BBI) == StageNum) {
223            if (BBI->isPHI())
224              continue;
225            MachineInstr *NewMI =
226                cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum);
227            updateInstruction(NewMI, false, i, (unsigned)StageNum, VRMap);
228            NewBB->push_back(NewMI);
229            InstrMap[NewMI] = &*BBI;
230          }
231        }
232      }
233      rewritePhiValues(NewBB, i, VRMap, InstrMap);
234      LLVM_DEBUG({
235        dbgs() << "prolog:\n";
236        NewBB->dump();
237      });
238    }
239  
240    PredBB->replaceSuccessor(BB, KernelBB);
241  
242    // Check if we need to remove the branch from the preheader to the original
243    // loop, and replace it with a branch to the new loop.
244    unsigned numBranches = TII->removeBranch(*Preheader);
245    if (numBranches) {
246      SmallVector<MachineOperand, 0> Cond;
247      TII->insertBranch(*Preheader, PrologBBs[0], nullptr, Cond, DebugLoc());
248    }
249  }
250  
251  /// Generate the pipeline epilog code. The epilog code finishes the iterations
252  /// that were started in either the prolog or the kernel.  We create a basic
253  /// block for each stage that needs to complete.
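/// For a 3-stage schedule, for example, the first epilog block (the one
/// following the kernel) contains only stage-2 instructions, while the
/// second contains stage-1 and stage-2 instructions; together they complete
/// the iterations that were still in flight when the kernel exited.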
254  void ModuloScheduleExpander::generateEpilog(
255      unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB,
256      ValueMapTy *VRMap, ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
257      MBBVectorTy &PrologBBs) {
258    // We need to change the branch from the kernel to the first epilog block, so
259    // this call to analyze branch uses the kernel rather than the original BB.
260    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
261    SmallVector<MachineOperand, 4> Cond;
262    bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
263    assert(!checkBranch && "generateEpilog must be able to analyze the branch");
264    if (checkBranch)
265      return;
266  
267    MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
268    if (*LoopExitI == KernelBB)
269      ++LoopExitI;
270    assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
271    MachineBasicBlock *LoopExitBB = *LoopExitI;
272  
273    MachineBasicBlock *PredBB = KernelBB;
274    MachineBasicBlock *EpilogStart = LoopExitBB;
275    InstrMapTy InstrMap;
276  
277    // Generate a basic block for each stage, not including the last stage,
278    // which was generated for the kernel.  Each basic block may contain
279    // instructions from multiple stages/iterations.
280    int EpilogStage = LastStage + 1;
281    for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
282      MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
283      EpilogBBs.push_back(NewBB);
284      MF.insert(BB->getIterator(), NewBB);
285  
286      PredBB->replaceSuccessor(LoopExitBB, NewBB);
287      NewBB->addSuccessor(LoopExitBB);
288      LIS.insertMBBInMaps(NewBB);
289  
290      if (EpilogStart == LoopExitBB)
291        EpilogStart = NewBB;
292  
293      // Add instructions to the epilog depending on the current block.
294      // Process instructions in original program order.
295      for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
296        for (auto &BBI : *BB) {
297          if (BBI.isPHI())
298            continue;
299          MachineInstr *In = &BBI;
300          if ((unsigned)Schedule.getStage(In) == StageNum) {
301            // Instructions with memoperands in the epilog are updated with
302            // conservative values.
303            MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
304            updateInstruction(NewMI, i == 1, EpilogStage, 0, VRMap);
305            NewBB->push_back(NewMI);
306            InstrMap[NewMI] = In;
307          }
308        }
309      }
310      generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap,
311                           InstrMap, LastStage, EpilogStage, i == 1);
312      generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap, VRMapPhi,
313                   InstrMap, LastStage, EpilogStage, i == 1);
314      PredBB = NewBB;
315  
316      LLVM_DEBUG({
317        dbgs() << "epilog:\n";
318        NewBB->dump();
319      });
320    }
321  
322    // Fix any Phi nodes in the loop exit block.
323    LoopExitBB->replacePhiUsesWith(BB, PredBB);
324  
325    // Create a branch to the new epilog from the kernel.
326    // Remove the original branch and add a new branch to the epilog.
327    TII->removeBranch(*KernelBB);
328    assert((OrigBB == TBB || OrigBB == FBB) &&
329           "Unable to determine looping branch direction");
330    if (OrigBB != TBB)
331      TII->insertBranch(*KernelBB, EpilogStart, KernelBB, Cond, DebugLoc());
332    else
333      TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
334    // Add a branch to the loop exit.
335    if (EpilogBBs.size() > 0) {
336      MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
337      SmallVector<MachineOperand, 4> Cond1;
338      TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
339    }
340  }
341  
342  /// Replace all uses of FromReg that appear outside the specified
343  /// basic block with ToReg.
344  static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
345                                      MachineBasicBlock *MBB,
346                                      MachineRegisterInfo &MRI,
347                                      LiveIntervals &LIS) {
348    for (MachineOperand &O :
349         llvm::make_early_inc_range(MRI.use_operands(FromReg)))
350      if (O.getParent()->getParent() != MBB)
351        O.setReg(ToReg);
352    if (!LIS.hasInterval(ToReg))
353      LIS.createEmptyInterval(ToReg);
354  }
355  
356  /// Return true if the register has a use that occurs outside the
357  /// specified loop.
358  static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
359                              MachineRegisterInfo &MRI) {
360    for (const MachineOperand &MO : MRI.use_operands(Reg))
361      if (MO.getParent()->getParent() != BB)
362        return true;
363    return false;
364  }
365  
366  /// Generate Phis for the specific block in the generated pipelined code.
367  /// This function looks at the Phis from the original code to guide the
368  /// creation of new Phis.
369  void ModuloScheduleExpander::generateExistingPhis(
370      MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
371      MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap,
372      unsigned LastStageNum, unsigned CurStageNum, bool IsLast) {
373  // Compute the stage number for the initial value of the Phi, which
374  // comes from the prolog. The prolog to use depends on which kernel or
375  // epilog block we're adding the Phi to.
376    unsigned PrologStage = 0;
377    unsigned PrevStage = 0;
378    bool InKernel = (LastStageNum == CurStageNum);
379    if (InKernel) {
380      PrologStage = LastStageNum - 1;
381      PrevStage = CurStageNum;
382    } else {
383      PrologStage = LastStageNum - (CurStageNum - LastStageNum);
384      PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
385    }
386  
387    for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
388                                     BBE = BB->getFirstNonPHI();
389         BBI != BBE; ++BBI) {
390      Register Def = BBI->getOperand(0).getReg();
391  
392      unsigned InitVal = 0;
393      unsigned LoopVal = 0;
394      getPhiRegs(*BBI, BB, InitVal, LoopVal);
395  
396      unsigned PhiOp1 = 0;
397      // The Phi value from the loop body typically is defined in the loop, but
398      // not always. So, we need to check if the value is defined in the loop.
399      unsigned PhiOp2 = LoopVal;
400      if (VRMap[LastStageNum].count(LoopVal))
401        PhiOp2 = VRMap[LastStageNum][LoopVal];
402  
403      int StageScheduled = Schedule.getStage(&*BBI);
404      int LoopValStage = Schedule.getStage(MRI.getVRegDef(LoopVal));
405      unsigned NumStages = getStagesForReg(Def, CurStageNum);
406      if (NumStages == 0) {
407        // We don't need to generate a Phi anymore, but we need to rename any uses
408        // of the Phi value.
409        unsigned NewReg = VRMap[PrevStage][LoopVal];
410        rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, 0, &*BBI, Def,
411                              InitVal, NewReg);
412        if (VRMap[CurStageNum].count(LoopVal))
413          VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
414      }
415      // Adjust the number of Phis needed depending on the number of prologs left,
416      // and the distance from where the Phi is first scheduled. The number of
417      // Phis cannot exceed the number of prolog stages. Each stage can
418      // potentially define two values.
419      unsigned MaxPhis = PrologStage + 2;
420      if (!InKernel && (int)PrologStage <= LoopValStage)
421        MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
422      unsigned NumPhis = std::min(NumStages, MaxPhis);
423  
424      unsigned NewReg = 0;
425      unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
426      // In the epilog, we may need to look back one stage to get the correct
427      // Phi name, because the epilog and prolog blocks execute the same stage.
428      // The correct name is from the previous block only when the Phi has
429  // been completely scheduled prior to the epilog, and the Phi value is not
430      // needed in multiple stages.
431      int StageDiff = 0;
432      if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
433          NumPhis == 1)
434        StageDiff = 1;
435      // Adjust the computations below when the phi and the loop definition
436      // are scheduled in different stages.
437      if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
438        StageDiff = StageScheduled - LoopValStage;
439      for (unsigned np = 0; np < NumPhis; ++np) {
440        // If the Phi hasn't been scheduled, then use the initial Phi operand
441        // value. Otherwise, use the scheduled version of the instruction. This
442        // is a little complicated when a Phi references another Phi.
443        if (np > PrologStage || StageScheduled >= (int)LastStageNum)
444          PhiOp1 = InitVal;
445        // Check if the Phi has already been scheduled in a prolog stage.
446        else if (PrologStage >= AccessStage + StageDiff + np &&
447                 VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
448          PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
449        // Check if the Phi has already been scheduled, but the loop instruction
450        // is either another Phi, or doesn't occur in the loop.
451        else if (PrologStage >= AccessStage + StageDiff + np) {
452          // If the Phi references another Phi, we need to examine the other
453          // Phi to get the correct value.
454          PhiOp1 = LoopVal;
455          MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
456          int Indirects = 1;
457          while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
458            int PhiStage = Schedule.getStage(InstOp1);
459            if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
460              PhiOp1 = getInitPhiReg(*InstOp1, BB);
461            else
462              PhiOp1 = getLoopPhiReg(*InstOp1, BB);
463            InstOp1 = MRI.getVRegDef(PhiOp1);
464            int PhiOpStage = Schedule.getStage(InstOp1);
465            int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
466            if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
467                VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
468              PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
469              break;
470            }
471            ++Indirects;
472          }
473        } else
474          PhiOp1 = InitVal;
475        // If this references a generated Phi in the kernel, get the Phi operand
476        // from the incoming block.
477        if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
478          if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
479            PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
480  
481        MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
482        bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
483  // In the epilog, a map lookup is needed to get the value from the kernel
484  // or from a previous epilog block. Which one to use depends on whether
485  // the instruction is scheduled in the previous block.
486        if (!InKernel) {
487          int StageDiffAdj = 0;
488          if (LoopValStage != -1 && StageScheduled > LoopValStage)
489            StageDiffAdj = StageScheduled - LoopValStage;
490          // Use the loop value defined in the kernel, unless the kernel
491          // contains the last definition of the Phi.
492          if (np == 0 && PrevStage == LastStageNum &&
493              (StageScheduled != 0 || LoopValStage != 0) &&
494              VRMap[PrevStage - StageDiffAdj].count(LoopVal))
495            PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
496          // Use the value defined by the Phi. We add one because we switch
497          // from looking at the loop value to the Phi definition.
498          else if (np > 0 && PrevStage == LastStageNum &&
499                   VRMap[PrevStage - np + 1].count(Def))
500            PhiOp2 = VRMap[PrevStage - np + 1][Def];
501          // Use the loop value defined in the kernel.
502          else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
503                   VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
504            PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
505          // Use the value defined by the Phi, unless we're generating the first
506          // epilog and the Phi refers to a Phi in a different stage.
507          else if (VRMap[PrevStage - np].count(Def) &&
508                   (!LoopDefIsPhi || (PrevStage != LastStageNum) ||
509                    (LoopValStage == StageScheduled)))
510            PhiOp2 = VRMap[PrevStage - np][Def];
511        }
512  
513        // Check if we can reuse an existing Phi. This occurs when a Phi
514        // references another Phi, and the other Phi is scheduled in an
515        // earlier stage. We can try to reuse an existing Phi up until the last
516        // stage of the current Phi.
517        if (LoopDefIsPhi) {
518          if (static_cast<int>(PrologStage - np) >= StageScheduled) {
519            int LVNumStages = getStagesForPhi(LoopVal);
520            int StageDiff = (StageScheduled - LoopValStage);
521            LVNumStages -= StageDiff;
522            // Make sure the loop value Phi has been processed already.
523            if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
524              NewReg = PhiOp2;
525              unsigned ReuseStage = CurStageNum;
526              if (isLoopCarried(*PhiInst))
527                ReuseStage -= LVNumStages;
528              // Check if the Phi to reuse has been generated yet. If not, then
529              // there is nothing to reuse.
530              if (VRMap[ReuseStage - np].count(LoopVal)) {
531                NewReg = VRMap[ReuseStage - np][LoopVal];
532  
533                rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI,
534                                      Def, NewReg);
535                // Update the map with the new Phi name.
536                VRMap[CurStageNum - np][Def] = NewReg;
537                PhiOp2 = NewReg;
538                if (VRMap[LastStageNum - np - 1].count(LoopVal))
539                  PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
540  
541                if (IsLast && np == NumPhis - 1)
542                  replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
543                continue;
544              }
545            }
546          }
547          if (InKernel && StageDiff > 0 &&
548              VRMap[CurStageNum - StageDiff - np].count(LoopVal))
549            PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
550        }
551  
552        const TargetRegisterClass *RC = MRI.getRegClass(Def);
553        NewReg = MRI.createVirtualRegister(RC);
554  
555        MachineInstrBuilder NewPhi =
556            BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
557                    TII->get(TargetOpcode::PHI), NewReg);
558        NewPhi.addReg(PhiOp1).addMBB(BB1);
559        NewPhi.addReg(PhiOp2).addMBB(BB2);
560        if (np == 0)
561          InstrMap[NewPhi] = &*BBI;
562  
563        // We define the Phis after creating the new pipelined code, so
564        // we need to rename the Phi values in scheduled instructions.
565  
566        unsigned PrevReg = 0;
567        if (InKernel && VRMap[PrevStage - np].count(LoopVal))
568          PrevReg = VRMap[PrevStage - np][LoopVal];
569        rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
570                              NewReg, PrevReg);
571        // If the Phi has been scheduled, use the new name for rewriting.
572        if (VRMap[CurStageNum - np].count(Def)) {
573          unsigned R = VRMap[CurStageNum - np][Def];
574          rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, R,
575                                NewReg);
576        }
577  
578  // Check if we need to rename any uses that occur after the loop. The
579        // register to replace depends on whether the Phi is scheduled in the
580        // epilog.
581        if (IsLast && np == NumPhis - 1)
582          replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
583  
584        // In the kernel, a dependent Phi uses the value from this Phi.
585        if (InKernel)
586          PhiOp2 = NewReg;
587  
588        // Update the map with the new Phi name.
589        VRMap[CurStageNum - np][Def] = NewReg;
590      }
591  
592      while (NumPhis++ < NumStages) {
593        rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, NumPhis, &*BBI, Def,
594                              NewReg, 0);
595      }
596  
597      // Check if we need to rename a Phi that has been eliminated due to
598      // scheduling.
599      if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
600        replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
601    }
602  }
603  
604  /// Generate Phis for the specified block in the generated pipelined code.
605  /// These are new Phis needed because the definition is scheduled after the
606  /// use in the pipelined sequence.
607  void ModuloScheduleExpander::generatePhis(
608      MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
609      MachineBasicBlock *KernelBB, ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
610      InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
611      bool IsLast) {
612    // Compute the stage number that contains the initial Phi value, and
613    // the Phi from the previous stage.
614    unsigned PrologStage = 0;
615    unsigned PrevStage = 0;
616    unsigned StageDiff = CurStageNum - LastStageNum;
617    bool InKernel = (StageDiff == 0);
618    if (InKernel) {
619      PrologStage = LastStageNum - 1;
620      PrevStage = CurStageNum;
621    } else {
622      PrologStage = LastStageNum - StageDiff;
623      PrevStage = LastStageNum + StageDiff - 1;
624    }
625  
626    for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
627                                     BBE = BB->instr_end();
628         BBI != BBE; ++BBI) {
629      for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
630        MachineOperand &MO = BBI->getOperand(i);
631        if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
632          continue;
633  
634        int StageScheduled = Schedule.getStage(&*BBI);
635        assert(StageScheduled != -1 && "Expecting scheduled instruction.");
636        Register Def = MO.getReg();
637        unsigned NumPhis = getStagesForReg(Def, CurStageNum);
638  // An instruction that is scheduled in stage 0 and used after the loop
639  // requires a phi in the epilog for the last definition from either
640  // the kernel or the prolog.
641        if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
642            hasUseAfterLoop(Def, BB, MRI))
643          NumPhis = 1;
644        if (!InKernel && (unsigned)StageScheduled > PrologStage)
645          continue;
646  
647        unsigned PhiOp2;
648        if (InKernel) {
649          PhiOp2 = VRMap[PrevStage][Def];
650          if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
651            if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
652              PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
653        }
654        // The number of Phis can't exceed the number of prolog stages. The
655        // prolog stage number is zero based.
656        if (NumPhis > PrologStage + 1 - StageScheduled)
657          NumPhis = PrologStage + 1 - StageScheduled;
658        for (unsigned np = 0; np < NumPhis; ++np) {
659          // Example for
660          // Org:
661          //   %Org = ... (Scheduled at Stage#0, NumPhi = 2)
662          //
663          // Prolog0 (Stage0):
664          //   %Clone0 = ...
665          // Prolog1 (Stage1):
666          //   %Clone1 = ...
667          // Kernel (Stage2):
668          //   %Phi0 = Phi %Clone1, Prolog1, %Clone2, Kernel
669          //   %Phi1 = Phi %Clone0, Prolog1, %Phi0, Kernel
670          //   %Clone2 = ...
671          // Epilog0 (Stage3):
672          //   %Phi2 = Phi %Clone1, Prolog1, %Clone2, Kernel
673          //   %Phi3 = Phi %Clone0, Prolog1, %Phi0, Kernel
674          // Epilog1 (Stage4):
675          //   %Phi4 = Phi %Clone0, Prolog0, %Phi2, Epilog0
676          //
677          // VRMap = {0: %Clone0, 1: %Clone1, 2: %Clone2}
678          // VRMapPhi (after Kernel) = {0: %Phi1, 1: %Phi0}
679          // VRMapPhi (after Epilog0) = {0: %Phi3, 1: %Phi2}
680  
681          unsigned PhiOp1 = VRMap[PrologStage][Def];
682          if (np <= PrologStage)
683            PhiOp1 = VRMap[PrologStage - np][Def];
684          if (!InKernel) {
685            if (PrevStage == LastStageNum && np == 0)
686              PhiOp2 = VRMap[LastStageNum][Def];
687            else
688              PhiOp2 = VRMapPhi[PrevStage - np][Def];
689          }
690  
691          const TargetRegisterClass *RC = MRI.getRegClass(Def);
692          Register NewReg = MRI.createVirtualRegister(RC);
693  
694          MachineInstrBuilder NewPhi =
695              BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
696                      TII->get(TargetOpcode::PHI), NewReg);
697          NewPhi.addReg(PhiOp1).addMBB(BB1);
698          NewPhi.addReg(PhiOp2).addMBB(BB2);
699          if (np == 0)
700            InstrMap[NewPhi] = &*BBI;
701  
702  // Rewrite uses and update the map. The actions depend upon whether
703  // we are generating code for the kernel or the epilog blocks.
704          if (InKernel) {
705            rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp1,
706                                  NewReg);
707            rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp2,
708                                  NewReg);
709  
710            PhiOp2 = NewReg;
711            VRMapPhi[PrevStage - np - 1][Def] = NewReg;
712          } else {
713            VRMapPhi[CurStageNum - np][Def] = NewReg;
714            if (np == NumPhis - 1)
715              rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
716                                    NewReg);
717          }
718          if (IsLast && np == NumPhis - 1)
719            replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
720        }
721      }
722    }
723  }
724  
725  /// Remove instructions that generate values with no uses.
726  /// Typically, these are induction variable operations that generate values
727  /// used in the loop itself.  A dead instruction has a definition with
728  /// no uses, or uses that occur in the original loop only.
729  void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
730                                                      MBBVectorTy &EpilogBBs) {
731    // For each epilog block, check that the value defined by each instruction
732    // is used.  If not, delete it.
733    for (MachineBasicBlock *MBB : llvm::reverse(EpilogBBs))
734      for (MachineBasicBlock::reverse_instr_iterator MI = MBB->instr_rbegin(),
735                                                     ME = MBB->instr_rend();
736           MI != ME;) {
737  // From DeadMachineInstructionElim. Don't delete inline assembly.
738        if (MI->isInlineAsm()) {
739          ++MI;
740          continue;
741        }
742        bool SawStore = false;
743        // Check if it's safe to remove the instruction due to side effects.
744        // We can, and want to, remove Phis here.
745        if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
746          ++MI;
747          continue;
748        }
749        bool used = true;
750        for (const MachineOperand &MO : MI->all_defs()) {
751          Register reg = MO.getReg();
752          // Assume physical registers are used, unless they are marked dead.
753          if (reg.isPhysical()) {
754            used = !MO.isDead();
755            if (used)
756              break;
757            continue;
758          }
759          unsigned realUses = 0;
760          for (const MachineOperand &U : MRI.use_operands(reg)) {
761            // Check if there are any uses that occur only in the original
762            // loop.  If so, that's not a real use.
763            if (U.getParent()->getParent() != BB) {
764              realUses++;
765              used = true;
766              break;
767            }
768          }
769          if (realUses > 0)
770            break;
771          used = false;
772        }
773        if (!used) {
774          LIS.RemoveMachineInstrFromMaps(*MI);
775          MI++->eraseFromParent();
776          continue;
777        }
778        ++MI;
779      }
780    // In the kernel block, check if we can remove a Phi that generates a value
781    // used in an instruction removed in the epilog block.
782    for (MachineInstr &MI : llvm::make_early_inc_range(KernelBB->phis())) {
783      Register reg = MI.getOperand(0).getReg();
784      if (MRI.use_begin(reg) == MRI.use_end()) {
785        LIS.RemoveMachineInstrFromMaps(MI);
786        MI.eraseFromParent();
787      }
788    }
789  }
790  
791  /// For loop carried definitions, we split the lifetime of a virtual register
792  /// that has uses past the definition in the next iteration. A copy with a new
793  /// virtual register is inserted before the definition, which helps with
794  /// generating a better register assignment.
795  ///
796  ///   v1 = phi(a, v2)     v1 = phi(a, v2)
797  ///   v2 = phi(b, v3)     v2 = phi(b, v3)
798  ///   v3 = ..             v4 = copy v1
799  ///   .. = V1             v3 = ..
800  ///                       .. = v4
801  void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB,
802                                              MBBVectorTy &EpilogBBs) {
803    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
804    for (auto &PHI : KernelBB->phis()) {
805      Register Def = PHI.getOperand(0).getReg();
806  // Check for any Phi definition that is used as an operand of another
807  // Phi in the same block.
808      for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
809                                                   E = MRI.use_instr_end();
810           I != E; ++I) {
811        if (I->isPHI() && I->getParent() == KernelBB) {
812          // Get the loop carried definition.
813          unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
814          if (!LCDef)
815            continue;
816          MachineInstr *MI = MRI.getVRegDef(LCDef);
817          if (!MI || MI->getParent() != KernelBB || MI->isPHI())
818            continue;
819          // Search through the rest of the block looking for uses of the Phi
820          // definition. If one occurs, then split the lifetime.
821          unsigned SplitReg = 0;
822          for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
823                                      KernelBB->instr_end()))
824            if (BBJ.readsRegister(Def, /*TRI=*/nullptr)) {
825              // We split the lifetime when we find the first use.
826              if (SplitReg == 0) {
827                SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
828                BuildMI(*KernelBB, MI, MI->getDebugLoc(),
829                        TII->get(TargetOpcode::COPY), SplitReg)
830                    .addReg(Def);
831              }
832              BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
833            }
834          if (!SplitReg)
835            continue;
836          // Search through each of the epilog blocks for any uses to be renamed.
837          for (auto &Epilog : EpilogBBs)
838            for (auto &I : *Epilog)
839              if (I.readsRegister(Def, /*TRI=*/nullptr))
840                I.substituteRegister(Def, SplitReg, 0, *TRI);
841          break;
842        }
843      }
844    }
845  }
846  
847  /// Remove the incoming block from the Phis in a basic block.
848  static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
849    for (MachineInstr &MI : *BB) {
850      if (!MI.isPHI())
851        break;
852      for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
853        if (MI.getOperand(i + 1).getMBB() == Incoming) {
854          MI.removeOperand(i + 1);
855          MI.removeOperand(i);
856          break;
857        }
858    }
859  }
860  
861  /// Create branches from each prolog basic block to the appropriate epilog
862  /// block.  These edges are needed if the loop ends before reaching the
863  /// kernel.
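/// Roughly speaking, the prolog closest to the kernel branches to the epilog
/// closest to the kernel, while the outermost prolog branches to the last
/// epilog block (the one containing the most stages), so that an early exit
/// still completes every iteration it has started.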
864  void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB,
865                                           MBBVectorTy &PrologBBs,
866                                           MachineBasicBlock *KernelBB,
867                                           MBBVectorTy &EpilogBBs,
868                                           ValueMapTy *VRMap) {
869    assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
870    MachineBasicBlock *LastPro = KernelBB;
871    MachineBasicBlock *LastEpi = KernelBB;
872  
873    // Start from the blocks connected to the kernel and work "out"
874    // to the first prolog and the last epilog blocks.
875    SmallVector<MachineInstr *, 4> PrevInsts;
876    unsigned MaxIter = PrologBBs.size() - 1;
877    for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
878      // Add branches to the prolog that go to the corresponding
879      // epilog, and the fall-thru prolog/kernel block.
880      MachineBasicBlock *Prolog = PrologBBs[j];
881      MachineBasicBlock *Epilog = EpilogBBs[i];
882  
883      SmallVector<MachineOperand, 4> Cond;
884      std::optional<bool> StaticallyGreater =
885          LoopInfo->createTripCountGreaterCondition(j + 1, *Prolog, Cond);
886      unsigned numAdded = 0;
887      if (!StaticallyGreater) {
888        Prolog->addSuccessor(Epilog);
889        numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
890      } else if (*StaticallyGreater == false) {
891        Prolog->addSuccessor(Epilog);
892        Prolog->removeSuccessor(LastPro);
893        LastEpi->removeSuccessor(Epilog);
894        numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
895        removePhis(Epilog, LastEpi);
896        // Remove the blocks that are no longer referenced.
897        if (LastPro != LastEpi) {
898          LastEpi->clear();
899          LastEpi->eraseFromParent();
900        }
901        if (LastPro == KernelBB) {
902          LoopInfo->disposed();
903          NewKernel = nullptr;
904        }
905        LastPro->clear();
906        LastPro->eraseFromParent();
907      } else {
908        numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
909        removePhis(Epilog, Prolog);
910      }
911      LastPro = Prolog;
912      LastEpi = Epilog;
913      for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
914                                                     E = Prolog->instr_rend();
915           I != E && numAdded > 0; ++I, --numAdded)
916        updateInstruction(&*I, false, j, 0, VRMap);
917    }
918  
919    if (NewKernel) {
920      LoopInfo->setPreheader(PrologBBs[MaxIter]);
921      LoopInfo->adjustTripCount(-(MaxIter + 1));
922    }
923  }
924  
925  /// Return true if we can compute the amount the instruction changes
926  /// during each iteration. Set Delta to the amount of the change.
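/// As an illustrative sketch (hypothetical registers), for a memory access
/// whose base address is advanced through a loop-carried Phi:
///
///   %r1 = PHI %r0, %entry, %r2, %loop
///   %r2 = add %r1, 8
///   ...  = load [%r1]
///
/// the base register advances by 8 bytes each iteration, so Delta = 8.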
927  bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) {
928    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
929    const MachineOperand *BaseOp;
930    int64_t Offset;
931    bool OffsetIsScalable;
932    if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
933      return false;
934  
935    // FIXME: This algorithm assumes instructions have fixed-size offsets.
936    if (OffsetIsScalable)
937      return false;
938  
939    if (!BaseOp->isReg())
940      return false;
941  
942    Register BaseReg = BaseOp->getReg();
943  
944    MachineRegisterInfo &MRI = MF.getRegInfo();
945    // Check if there is a Phi. If so, get the definition in the loop.
946    MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
947    if (BaseDef && BaseDef->isPHI()) {
948      BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
949      BaseDef = MRI.getVRegDef(BaseReg);
950    }
951    if (!BaseDef)
952      return false;
953  
954    int D = 0;
955    if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
956      return false;
957  
958    Delta = D;
959    return true;
960  }
961  
962  /// Update the memory operand with a new offset when the pipeliner
963  /// generates a new copy of the instruction that refers to a
964  /// different memory location.
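/// For example, when computeDelta determines that the accessed address
/// advances by Delta bytes per iteration, a clone that is Num stages away
/// from the original has its memory operand offset adjusted by Delta * Num;
/// otherwise the memory operand is widened to a conservative size.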
965  void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI,
966                                                 MachineInstr &OldMI,
967                                                 unsigned Num) {
968    if (Num == 0)
969      return;
970    // If the instruction has memory operands, then adjust the offset
971    // when the instruction appears in different stages.
972    if (NewMI.memoperands_empty())
973      return;
974    SmallVector<MachineMemOperand *, 2> NewMMOs;
975    for (MachineMemOperand *MMO : NewMI.memoperands()) {
976      // TODO: Figure out whether isAtomic is really necessary (see D57601).
977      if (MMO->isVolatile() || MMO->isAtomic() ||
978          (MMO->isInvariant() && MMO->isDereferenceable()) ||
979          (!MMO->getValue())) {
980        NewMMOs.push_back(MMO);
981        continue;
982      }
983      unsigned Delta;
984      if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
985        int64_t AdjOffset = Delta * Num;
986        NewMMOs.push_back(
987            MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
988      } else {
989        NewMMOs.push_back(MF.getMachineMemOperand(
990            MMO, 0, LocationSize::beforeOrAfterPointer()));
991      }
992    }
993    NewMI.setMemRefs(MF, NewMMOs);
994  }
995  
996  /// Clone the instruction for the new pipelined loop and update the
997  /// memory operands, if needed.
998  MachineInstr *ModuloScheduleExpander::cloneInstr(MachineInstr *OldMI,
999                                                   unsigned CurStageNum,
1000                                                   unsigned InstStageNum) {
1001    MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
1002    updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
1003    return NewMI;
1004  }
1005  
1006  /// Clone the instruction for the new pipelined loop. If needed, this
1007  /// function updates the instruction using the values saved in the
1008  /// InstrChanges structure.
1009  MachineInstr *ModuloScheduleExpander::cloneAndChangeInstr(
1010      MachineInstr *OldMI, unsigned CurStageNum, unsigned InstStageNum) {
1011    MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
1012    auto It = InstrChanges.find(OldMI);
1013    if (It != InstrChanges.end()) {
1014      std::pair<unsigned, int64_t> RegAndOffset = It->second;
1015      unsigned BasePos, OffsetPos;
1016      if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
1017        return nullptr;
1018      int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
1019      MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
1020      if (Schedule.getStage(LoopDef) > (signed)InstStageNum)
1021        NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
1022      NewMI->getOperand(OffsetPos).setImm(NewOffset);
1023    }
1024    updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
1025    return NewMI;
1026  }
1027  
1028  /// Update the machine instruction with new virtual registers.  This
1029  /// function may change the definitions and/or uses.
1030  void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI,
1031                                                 bool LastDef,
1032                                                 unsigned CurStageNum,
1033                                                 unsigned InstrStageNum,
1034                                                 ValueMapTy *VRMap) {
1035    for (MachineOperand &MO : NewMI->operands()) {
1036      if (!MO.isReg() || !MO.getReg().isVirtual())
1037        continue;
1038      Register reg = MO.getReg();
1039      if (MO.isDef()) {
1040        // Create a new virtual register for the definition.
1041        const TargetRegisterClass *RC = MRI.getRegClass(reg);
1042        Register NewReg = MRI.createVirtualRegister(RC);
1043        MO.setReg(NewReg);
1044        VRMap[CurStageNum][reg] = NewReg;
1045        if (LastDef)
1046          replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
1047      } else if (MO.isUse()) {
1048        MachineInstr *Def = MRI.getVRegDef(reg);
1049  // Compute the stage that contains the last definition for the instruction.
1050        int DefStageNum = Schedule.getStage(Def);
1051        unsigned StageNum = CurStageNum;
1052        if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
1053  // Compute the difference in stages between the definition and the use.
1054          unsigned StageDiff = (InstrStageNum - DefStageNum);
1055          // Make an adjustment to get the last definition.
1056          StageNum -= StageDiff;
1057        }
1058        if (VRMap[StageNum].count(reg))
1059          MO.setReg(VRMap[StageNum][reg]);
1060      }
1061    }
1062  }
1063  
1064  /// Return the instruction in the loop that defines the register.
1065  /// If the definition is a Phi, then follow the Phi operand to
1066  /// the instruction in the loop.
1067  MachineInstr *ModuloScheduleExpander::findDefInLoop(unsigned Reg) {
1068    SmallPtrSet<MachineInstr *, 8> Visited;
1069    MachineInstr *Def = MRI.getVRegDef(Reg);
1070    while (Def->isPHI()) {
1071      if (!Visited.insert(Def).second)
1072        break;
1073      for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
1074        if (Def->getOperand(i + 1).getMBB() == BB) {
1075          Def = MRI.getVRegDef(Def->getOperand(i).getReg());
1076          break;
1077        }
1078    }
1079    return Def;
1080  }
1081  
1082  /// Return the new name for the value from the previous stage.
1083  unsigned ModuloScheduleExpander::getPrevMapVal(
1084      unsigned StageNum, unsigned PhiStage, unsigned LoopVal, unsigned LoopStage,
1085      ValueMapTy *VRMap, MachineBasicBlock *BB) {
1086    unsigned PrevVal = 0;
1087    if (StageNum > PhiStage) {
1088      MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
1089      if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
1090        // The name is defined in the previous stage.
1091        PrevVal = VRMap[StageNum - 1][LoopVal];
1092      else if (VRMap[StageNum].count(LoopVal))
1093        // The previous name is defined in the current stage when the instruction
1094        // order is swapped.
1095        PrevVal = VRMap[StageNum][LoopVal];
1096      else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
1097        // The loop value hasn't yet been scheduled.
1098        PrevVal = LoopVal;
1099      else if (StageNum == PhiStage + 1)
1100        // The loop value is another phi, which has not been scheduled.
1101        PrevVal = getInitPhiReg(*LoopInst, BB);
1102      else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
1103        // The loop value is another phi, which has been scheduled.
1104        PrevVal =
1105            getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
1106                          LoopStage, VRMap, BB);
1107    }
1108    return PrevVal;
1109  }
1110  
1111  /// Rewrite the Phi values in the specified block to use the mappings
1112  /// from the initial operand. Once the Phi is scheduled, we switch
1113  /// to using the loop value instead of the Phi value, so those names
1114  /// do not need to be rewritten.
1115  void ModuloScheduleExpander::rewritePhiValues(MachineBasicBlock *NewBB,
1116                                                unsigned StageNum,
1117                                                ValueMapTy *VRMap,
1118                                                InstrMapTy &InstrMap) {
1119    for (auto &PHI : BB->phis()) {
1120      unsigned InitVal = 0;
1121      unsigned LoopVal = 0;
1122      getPhiRegs(PHI, BB, InitVal, LoopVal);
1123      Register PhiDef = PHI.getOperand(0).getReg();
1124  
1125      unsigned PhiStage = (unsigned)Schedule.getStage(MRI.getVRegDef(PhiDef));
1126      unsigned LoopStage = (unsigned)Schedule.getStage(MRI.getVRegDef(LoopVal));
1127      unsigned NumPhis = getStagesForPhi(PhiDef);
1128      if (NumPhis > StageNum)
1129        NumPhis = StageNum;
1130      for (unsigned np = 0; np <= NumPhis; ++np) {
1131        unsigned NewVal =
1132            getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
1133        if (!NewVal)
1134          NewVal = InitVal;
1135        rewriteScheduledInstr(NewBB, InstrMap, StageNum - np, np, &PHI, PhiDef,
1136                              NewVal);
1137      }
1138    }
1139  }
1140  
1141  /// Rewrite a previously scheduled instruction to use the register value
1142  /// from the new instruction. Make sure the instruction occurs in the
1143  /// basic block, and we don't change the uses in the new instruction.
1144  void ModuloScheduleExpander::rewriteScheduledInstr(
1145      MachineBasicBlock *BB, InstrMapTy &InstrMap, unsigned CurStageNum,
1146      unsigned PhiNum, MachineInstr *Phi, unsigned OldReg, unsigned NewReg,
1147      unsigned PrevReg) {
1148    bool InProlog = (CurStageNum < (unsigned)Schedule.getNumStages() - 1);
1149    int StagePhi = Schedule.getStage(Phi) + PhiNum;
1150    // Rewrite uses that have been scheduled already to use the new
1151    // Phi register.
1152    for (MachineOperand &UseOp :
1153         llvm::make_early_inc_range(MRI.use_operands(OldReg))) {
1154      MachineInstr *UseMI = UseOp.getParent();
1155      if (UseMI->getParent() != BB)
1156        continue;
1157      if (UseMI->isPHI()) {
1158        if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
1159          continue;
1160        if (getLoopPhiReg(*UseMI, BB) != OldReg)
1161          continue;
1162      }
1163      InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
1164      assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
1165      MachineInstr *OrigMI = OrigInstr->second;
1166      int StageSched = Schedule.getStage(OrigMI);
1167      int CycleSched = Schedule.getCycle(OrigMI);
1168      unsigned ReplaceReg = 0;
1169      // This is the stage for the scheduled instruction.
1170      if (StagePhi == StageSched && Phi->isPHI()) {
1171        int CyclePhi = Schedule.getCycle(Phi);
1172        if (PrevReg && InProlog)
1173          ReplaceReg = PrevReg;
1174        else if (PrevReg && !isLoopCarried(*Phi) &&
1175                 (CyclePhi <= CycleSched || OrigMI->isPHI()))
1176          ReplaceReg = PrevReg;
1177        else
1178          ReplaceReg = NewReg;
1179      }
1180      // The scheduled instruction occurs before the scheduled Phi, and the
1181      // Phi is not loop carried.
1182      if (!InProlog && StagePhi + 1 == StageSched && !isLoopCarried(*Phi))
1183        ReplaceReg = NewReg;
1184      if (StagePhi > StageSched && Phi->isPHI())
1185        ReplaceReg = NewReg;
1186      if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
1187        ReplaceReg = NewReg;
1188      if (ReplaceReg) {
1189        const TargetRegisterClass *NRC =
1190            MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
1191        if (NRC)
1192          UseOp.setReg(ReplaceReg);
1193        else {
1194          Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
1195          BuildMI(*BB, UseMI, UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY),
1196                  SplitReg)
1197              .addReg(ReplaceReg);
1198          UseOp.setReg(SplitReg);
1199        }
1200      }
1201    }
1202  }
1203  
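/// Return true if the Phi is loop carried: the value flowing in from the
/// back edge is defined at a later cycle than the Phi, at a stage no later
/// than the Phi's stage, or by another Phi.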
1204  bool ModuloScheduleExpander::isLoopCarried(MachineInstr &Phi) {
1205    if (!Phi.isPHI())
1206      return false;
1207    int DefCycle = Schedule.getCycle(&Phi);
1208    int DefStage = Schedule.getStage(&Phi);
1209  
1210    unsigned InitVal = 0;
1211    unsigned LoopVal = 0;
1212    getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
1213    MachineInstr *Use = MRI.getVRegDef(LoopVal);
1214    if (!Use || Use->isPHI())
1215      return true;
1216    int LoopCycle = Schedule.getCycle(Use);
1217    int LoopStage = Schedule.getStage(Use);
1218    return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
1219  }
1220  
1221  //===----------------------------------------------------------------------===//
1222  // PeelingModuloScheduleExpander implementation
1223  //===----------------------------------------------------------------------===//
1224  // This is a reimplementation of ModuloScheduleExpander that works by creating
1225  // a fully correct steady-state kernel and peeling off the prolog and epilogs.
1226  //===----------------------------------------------------------------------===//
1227  
1228  namespace {
1229  // Remove any dead phis in MBB. Dead phis either have only one block as input
1230  // (in which case they are the identity) or have no uses.
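// For example (hypothetical registers), "%a = PHI %b, %bb" is an identity
// and is folded away by replacing %a with %b, while a phi whose result has
// no uses is simply erased.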
1231  void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI,
1232                         LiveIntervals *LIS, bool KeepSingleSrcPhi = false) {
1233    bool Changed = true;
1234    while (Changed) {
1235      Changed = false;
1236      for (MachineInstr &MI : llvm::make_early_inc_range(MBB->phis())) {
1237        assert(MI.isPHI());
1238        if (MRI.use_empty(MI.getOperand(0).getReg())) {
1239          if (LIS)
1240            LIS->RemoveMachineInstrFromMaps(MI);
1241          MI.eraseFromParent();
1242          Changed = true;
1243        } else if (!KeepSingleSrcPhi && MI.getNumExplicitOperands() == 3) {
1244          const TargetRegisterClass *ConstrainRegClass =
1245              MRI.constrainRegClass(MI.getOperand(1).getReg(),
1246                                    MRI.getRegClass(MI.getOperand(0).getReg()));
1247          assert(ConstrainRegClass &&
1248                 "Expected a valid constrained register class!");
1249          (void)ConstrainRegClass;
1250          MRI.replaceRegWith(MI.getOperand(0).getReg(),
1251                             MI.getOperand(1).getReg());
1252          if (LIS)
1253            LIS->RemoveMachineInstrFromMaps(MI);
1254          MI.eraseFromParent();
1255          Changed = true;
1256        }
1257      }
1258    }
1259  }
1260  
1261  /// Rewrites the kernel block in-place to adhere to the given schedule.
1262  /// KernelRewriter holds all of the state required to perform the rewriting.
1263  class KernelRewriter {
1264    ModuloSchedule &S;
1265    MachineBasicBlock *BB;
1266    MachineBasicBlock *PreheaderBB, *ExitBB;
1267    MachineRegisterInfo &MRI;
1268    const TargetInstrInfo *TII;
1269    LiveIntervals *LIS;
1270  
1271    // Map from register class to canonical undef register for that class.
1272    DenseMap<const TargetRegisterClass *, Register> Undefs;
1273    // Map from <LoopReg, InitReg> to phi register for all created phis. Note that
1274    // this map is only used when InitReg is non-undef.
1275    DenseMap<std::pair<unsigned, unsigned>, Register> Phis;
1276    // Map from LoopReg to phi register where the InitReg is undef.
1277    DenseMap<Register, Register> UndefPhis;
1278  
1279    // Reg is used by MI. Return the new register MI should use to adhere to the
1280    // schedule. Insert phis as necessary.
1281    Register remapUse(Register Reg, MachineInstr &MI);
1282    // Insert a phi that carries LoopReg from the loop body and InitReg otherwise.
1283    // If InitReg is not given it is chosen arbitrarily. It will either be undef
1284    // or will be chosen so as to share another phi.
1285    Register phi(Register LoopReg, std::optional<Register> InitReg = {},
1286                 const TargetRegisterClass *RC = nullptr);
1287    // Create an undef register of the given register class.
1288    Register undef(const TargetRegisterClass *RC);
1289  
1290  public:
1291    KernelRewriter(MachineLoop &L, ModuloSchedule &S, MachineBasicBlock *LoopBB,
1292                   LiveIntervals *LIS = nullptr);
1293    void rewrite();
1294  };
1295  } // namespace
1296  
1297  KernelRewriter::KernelRewriter(MachineLoop &L, ModuloSchedule &S,
1298                                 MachineBasicBlock *LoopBB, LiveIntervals *LIS)
1299      : S(S), BB(LoopBB), PreheaderBB(L.getLoopPreheader()),
1300        ExitBB(L.getExitBlock()), MRI(BB->getParent()->getRegInfo()),
1301        TII(BB->getParent()->getSubtarget().getInstrInfo()), LIS(LIS) {
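  // Re-derive the preheader as the CFG predecessor of the header that is not
  // BB itself (BB acts as its own latch in this single-block loop).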
1302    PreheaderBB = *BB->pred_begin();
1303    if (PreheaderBB == BB)
1304      PreheaderBB = *std::next(BB->pred_begin());
1305  }
1306  
1307  void KernelRewriter::rewrite() {
1308    // Rearrange the loop to be in schedule order. Note that the schedule may
1309    // contain instructions that are not owned by the loop block (InstrChanges and
1310    // friends), so we gracefully handle unowned instructions and delete any
1311    // instructions that weren't in the schedule.
1312    auto InsertPt = BB->getFirstTerminator();
1313    MachineInstr *FirstMI = nullptr;
1314    for (MachineInstr *MI : S.getInstructions()) {
1315      if (MI->isPHI())
1316        continue;
1317      if (MI->getParent())
1318        MI->removeFromParent();
1319      BB->insert(InsertPt, MI);
1320      if (!FirstMI)
1321        FirstMI = MI;
1322    }
1323    assert(FirstMI && "Failed to find first MI in schedule");
1324  
1325    // At this point all of the scheduled instructions are between FirstMI
1326    // and the end of the block. Kill from the first non-phi to FirstMI.
1327    for (auto I = BB->getFirstNonPHI(); I != FirstMI->getIterator();) {
1328      if (LIS)
1329        LIS->RemoveMachineInstrFromMaps(*I);
1330      (I++)->eraseFromParent();
1331    }
1332  
1333    // Now remap every instruction in the loop.
1334    for (MachineInstr &MI : *BB) {
1335      if (MI.isPHI() || MI.isTerminator())
1336        continue;
1337      for (MachineOperand &MO : MI.uses()) {
1338        if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit())
1339          continue;
1340        Register Reg = remapUse(MO.getReg(), MI);
1341        MO.setReg(Reg);
1342      }
1343    }
1344    EliminateDeadPhis(BB, MRI, LIS);
1345  
1346    // Ensure a phi exists for all instructions that are either referenced by
1347    // an illegal phi or by an instruction outside the loop. This allows us to
1348    // treat remaps of these values the same as "normal" values that come from
1349    // loop-carried phis.
1350    for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) {
1351      if (MI->isPHI()) {
1352        Register R = MI->getOperand(0).getReg();
1353        phi(R);
1354        continue;
1355      }
1356  
1357      for (MachineOperand &Def : MI->defs()) {
1358        for (MachineInstr &MI : MRI.use_instructions(Def.getReg())) {
1359          if (MI.getParent() != BB) {
1360            phi(Def.getReg());
1361            break;
1362          }
1363        }
1364      }
1365    }
1366  }
1367  
1368  Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) {
1369    MachineInstr *Producer = MRI.getUniqueVRegDef(Reg);
1370    if (!Producer)
1371      return Reg;
1372  
1373    int ConsumerStage = S.getStage(&MI);
1374    if (!Producer->isPHI()) {
1375      // Non-phi producers are simple to remap. Insert as many phis as the
1376      // difference between the consumer and producer stages.
1377      if (Producer->getParent() != BB)
1378        // Producer was not inside the loop. Use the register as-is.
1379        return Reg;
1380      int ProducerStage = S.getStage(Producer);
1381      assert(ConsumerStage != -1 &&
1382             "In-loop consumer should always be scheduled!");
1383      assert(ConsumerStage >= ProducerStage);
1384      unsigned StageDiff = ConsumerStage - ProducerStage;
1385  
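    // Illustrative example: a stage-0 def consumed at stage 2 needs two phis,
    // so the consumer reads the value produced two kernel iterations earlier.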
1386      for (unsigned I = 0; I < StageDiff; ++I)
1387        Reg = phi(Reg);
1388      return Reg;
1389    }
1390  
1391    // First, dive through the phi chain to find the defaults for the generated
1392    // phis.
1393    SmallVector<std::optional<Register>, 4> Defaults;
1394    Register LoopReg = Reg;
1395    auto LoopProducer = Producer;
1396    while (LoopProducer->isPHI() && LoopProducer->getParent() == BB) {
1397      LoopReg = getLoopPhiReg(*LoopProducer, BB);
1398      Defaults.emplace_back(getInitPhiReg(*LoopProducer, BB));
1399      LoopProducer = MRI.getUniqueVRegDef(LoopReg);
1400      assert(LoopProducer);
1401    }
1402    int LoopProducerStage = S.getStage(LoopProducer);
1403  
1404    std::optional<Register> IllegalPhiDefault;
1405  
1406    if (LoopProducerStage == -1) {
1407      // Do nothing.
1408    } else if (LoopProducerStage > ConsumerStage) {
1409      // This schedule is only representable if ProducerStage == ConsumerStage+1.
1410      // In addition, Consumer's cycle must be scheduled after Producer in the
1411      // rescheduled loop. This is enforced by the pipeliner's ASAP and ALAP
1412      // functions.
1413  #ifndef NDEBUG // Silence unused variables in non-asserts mode.
1414      int LoopProducerCycle = S.getCycle(LoopProducer);
1415      int ConsumerCycle = S.getCycle(&MI);
1416  #endif
1417      assert(LoopProducerCycle <= ConsumerCycle);
1418      assert(LoopProducerStage == ConsumerStage + 1);
1419      // Peel off the first phi from Defaults and insert a phi between producer
1420      // and consumer. This phi will not be at the front of the block so we
1421      // consider it illegal. It will only exist during the rewrite process; it
1422      // needs to exist while we peel off prologs because these could take the
1423      // default value. After that we can replace all uses with the loop producer
1424      // value.
1425      IllegalPhiDefault = Defaults.front();
1426      Defaults.erase(Defaults.begin());
1427    } else {
1428      assert(ConsumerStage >= LoopProducerStage);
1429      int StageDiff = ConsumerStage - LoopProducerStage;
1430      if (StageDiff > 0) {
1431        LLVM_DEBUG(dbgs() << " -- padding defaults array from " << Defaults.size()
1432                          << " to " << (Defaults.size() + StageDiff) << "\n");
1433        // If we need more phis than we have defaults for, pad out with undefs for
1434        // the earliest phis, which are at the end of the defaults chain (the
1435        // chain is in reverse order).
1436        Defaults.resize(Defaults.size() + StageDiff,
1437                        Defaults.empty() ? std::optional<Register>()
1438                                         : Defaults.back());
1439      }
1440    }
1441  
1442    // Now we know the number of stages to jump back, insert the phi chain.
1443    auto DefaultI = Defaults.rbegin();
1444    while (DefaultI != Defaults.rend())
1445      LoopReg = phi(LoopReg, *DefaultI++, MRI.getRegClass(Reg));
1446  
1447    if (IllegalPhiDefault) {
1448      // The consumer either consumes LoopProducer in the same iteration
1449      // (because the producer is scheduled at an earlier cycle than the consumer)
1450      // or consumes the initial value. To facilitate this we create an illegal
1451      // phi here by embedding it in the middle of the block. We will fix this up
1452      // immediately prior to pruning.
1453      auto RC = MRI.getRegClass(Reg);
1454      Register R = MRI.createVirtualRegister(RC);
1455      MachineInstr *IllegalPhi =
1456          BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R)
1457              .addReg(*IllegalPhiDefault)
1458              .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect.
1459              .addReg(LoopReg)
1460              .addMBB(BB); // Block choice is arbitrary and has no effect.
1461      // Illegal phi should belong to the producer stage so that it can be
1462      // filtered correctly during peeling.
1463      S.setStage(IllegalPhi, LoopProducerStage);
1464      return R;
1465    }
1466  
1467    return LoopReg;
1468  }
1469  
1470  Register KernelRewriter::phi(Register LoopReg, std::optional<Register> InitReg,
1471                               const TargetRegisterClass *RC) {
1472    // If the init register is not undef, try and find an existing phi.
1473    if (InitReg) {
1474      auto I = Phis.find({LoopReg, *InitReg});
1475      if (I != Phis.end())
1476        return I->second;
1477    } else {
1478      for (auto &KV : Phis) {
1479        if (KV.first.first == LoopReg)
1480          return KV.second;
1481      }
1482    }
1483  
1484    // InitReg is either undef or no existing phi takes InitReg as input. Try and
1485    // find a phi that takes undef as input.
1486    auto I = UndefPhis.find(LoopReg);
1487    if (I != UndefPhis.end()) {
1488      Register R = I->second;
1489      if (!InitReg)
1490        // Found a phi taking undef as input, and the requested init value is
1491        // also undef, so return without any further changes.
1492        return R;
1493      // Found a phi taking undef as input, so rewrite it to take InitReg.
1494      MachineInstr *MI = MRI.getVRegDef(R);
1495      MI->getOperand(1).setReg(*InitReg);
1496      Phis.insert({{LoopReg, *InitReg}, R});
1497      const TargetRegisterClass *ConstrainRegClass =
1498          MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
1499      assert(ConstrainRegClass && "Expected a valid constrained register class!");
1500      (void)ConstrainRegClass;
1501      UndefPhis.erase(I);
1502      return R;
1503    }
1504  
1505    // Failed to find any existing phi to reuse, so create a new one.
1506    if (!RC)
1507      RC = MRI.getRegClass(LoopReg);
1508    Register R = MRI.createVirtualRegister(RC);
1509    if (InitReg) {
1510      const TargetRegisterClass *ConstrainRegClass =
1511          MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
1512      assert(ConstrainRegClass && "Expected a valid constrained register class!");
1513      (void)ConstrainRegClass;
1514    }
1515    BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), R)
1516        .addReg(InitReg ? *InitReg : undef(RC))
1517        .addMBB(PreheaderBB)
1518        .addReg(LoopReg)
1519        .addMBB(BB);
1520    if (!InitReg)
1521      UndefPhis[LoopReg] = R;
1522    else
1523      Phis[{LoopReg, *InitReg}] = R;
1524    return R;
1525  }
1526  
1527  Register KernelRewriter::undef(const TargetRegisterClass *RC) {
1528    Register &R = Undefs[RC];
1529    if (R == 0) {
1530      // Create an IMPLICIT_DEF that defines this register if we need it.
1531      // All uses of this should be removed by the time we have finished unrolling
1532      // prologs and epilogs.
1533      R = MRI.createVirtualRegister(RC);
1534      auto *InsertBB = &PreheaderBB->getParent()->front();
1535      BuildMI(*InsertBB, InsertBB->getFirstTerminator(), DebugLoc(),
1536              TII->get(TargetOpcode::IMPLICIT_DEF), R);
1537    }
1538    return R;
1539  }
1540  
1541  namespace {
1542  /// Describes an operand in the kernel of a pipelined loop. Characteristics of
1543  /// the operand are discovered, such as how many in-loop PHIs it has to jump
1544  /// through and defaults for these phis.
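/// For example (illustrative), a use whose value reaches its in-kernel def
/// only through two legal phis has distance two, with one recorded default
/// per phi that was looked through.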
1545  class KernelOperandInfo {
1546    MachineBasicBlock *BB;
1547    MachineRegisterInfo &MRI;
1548    SmallVector<Register, 4> PhiDefaults;
1549    MachineOperand *Source;
1550    MachineOperand *Target;
1551  
1552  public:
1553    KernelOperandInfo(MachineOperand *MO, MachineRegisterInfo &MRI,
1554                      const SmallPtrSetImpl<MachineInstr *> &IllegalPhis)
1555        : MRI(MRI) {
1556      Source = MO;
1557      BB = MO->getParent()->getParent();
1558      while (isRegInLoop(MO)) {
1559        MachineInstr *MI = MRI.getVRegDef(MO->getReg());
1560        if (MI->isFullCopy()) {
1561          MO = &MI->getOperand(1);
1562          continue;
1563        }
1564        if (!MI->isPHI())
1565          break;
1566        // If this is an illegal phi, don't count it in distance.
1567        if (IllegalPhis.count(MI)) {
1568          MO = &MI->getOperand(3);
1569          continue;
1570        }
1571  
1572        Register Default = getInitPhiReg(*MI, BB);
1573        MO = MI->getOperand(2).getMBB() == BB ? &MI->getOperand(1)
1574                                              : &MI->getOperand(3);
1575        PhiDefaults.push_back(Default);
1576      }
1577      Target = MO;
1578    }
1579  
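  // Two uses are considered equivalent when they look through the same number
  // of in-loop phis; the individual default registers are not compared.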
1580    bool operator==(const KernelOperandInfo &Other) const {
1581      return PhiDefaults.size() == Other.PhiDefaults.size();
1582    }
1583  
1584    void print(raw_ostream &OS) const {
1585      OS << "use of " << *Source << ": distance(" << PhiDefaults.size() << ") in "
1586         << *Source->getParent();
1587    }
1588  
1589  private:
1590    bool isRegInLoop(MachineOperand *MO) {
1591      return MO->isReg() && MO->getReg().isVirtual() &&
1592             MRI.getVRegDef(MO->getReg())->getParent() == BB;
1593    }
1594  };
1595  } // namespace
1596  
1597  MachineBasicBlock *
1598  PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
1599    MachineBasicBlock *NewBB = PeelSingleBlockLoop(LPD, BB, MRI, TII);
1600    if (LPD == LPD_Front)
1601      PeeledFront.push_back(NewBB);
1602    else
1603      PeeledBack.push_front(NewBB);
1604    for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator();
1605         ++I, ++NI) {
1606      CanonicalMIs[&*I] = &*I;
1607      CanonicalMIs[&*NI] = &*I;
1608      BlockMIs[{NewBB, &*I}] = &*NI;
1609      BlockMIs[{BB, &*I}] = &*I;
1610    }
1611    return NewBB;
1612  }
1613  
1614  void PeelingModuloScheduleExpander::filterInstructions(MachineBasicBlock *MB,
1615                                                         int MinStage) {
1616    for (auto I = MB->getFirstInstrTerminator()->getReverseIterator();
1617         I != std::next(MB->getFirstNonPHI()->getReverseIterator());) {
1618      MachineInstr *MI = &*I++;
1619      int Stage = getStage(MI);
1620      if (Stage == -1 || Stage >= MinStage)
1621        continue;
1622  
1623      for (MachineOperand &DefMO : MI->defs()) {
1624        SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
1625        for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
1626          // Only PHIs can use values from this block by construction.
1627          // Match with the equivalent PHI in B.
1628          assert(UseMI.isPHI());
1629          Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
1630                                                 MI->getParent());
1631          Subs.emplace_back(&UseMI, Reg);
1632        }
1633        for (auto &Sub : Subs)
1634          Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
1635                                        *MRI.getTargetRegisterInfo());
1636      }
1637      if (LIS)
1638        LIS->RemoveMachineInstrFromMaps(*MI);
1639      MI->eraseFromParent();
1640    }
1641  }
1642  
1643  void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
1644      MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
1645    auto InsertPt = DestBB->getFirstNonPHI();
1646    DenseMap<Register, Register> Remaps;
1647    for (MachineInstr &MI : llvm::make_early_inc_range(
1648             llvm::make_range(SourceBB->getFirstNonPHI(), SourceBB->end()))) {
1649      if (MI.isPHI()) {
1650        // This is an illegal PHI. If we move any instructions using an illegal
1651        // PHI, we need to create a legal Phi.
1652        if (getStage(&MI) != Stage) {
1653          // The legal Phi is not necessary if the illegal phi's stage
1654          // is being moved.
1655          Register PhiR = MI.getOperand(0).getReg();
1656          auto RC = MRI.getRegClass(PhiR);
1657          Register NR = MRI.createVirtualRegister(RC);
1658          MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(),
1659                                     DebugLoc(), TII->get(TargetOpcode::PHI), NR)
1660                                 .addReg(PhiR)
1661                                 .addMBB(SourceBB);
1662          BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NI;
1663          CanonicalMIs[NI] = CanonicalMIs[&MI];
1664          Remaps[PhiR] = NR;
1665        }
1666      }
1667      if (getStage(&MI) != Stage)
1668        continue;
1669      MI.removeFromParent();
1670      DestBB->insert(InsertPt, &MI);
1671      auto *KernelMI = CanonicalMIs[&MI];
1672      BlockMIs[{DestBB, KernelMI}] = &MI;
1673      BlockMIs.erase({SourceBB, KernelMI});
1674    }
1675    SmallVector<MachineInstr *, 4> PhiToDelete;
1676    for (MachineInstr &MI : DestBB->phis()) {
1677      assert(MI.getNumOperands() == 3);
1678      MachineInstr *Def = MRI.getVRegDef(MI.getOperand(1).getReg());
1679      // If the instruction referenced by the phi is moved inside the block
1680      // we don't need the phi anymore.
1681      if (getStage(Def) == Stage) {
1682        Register PhiReg = MI.getOperand(0).getReg();
1683        assert(Def->findRegisterDefOperandIdx(MI.getOperand(1).getReg(),
1684                                              /*TRI=*/nullptr) != -1);
1685        MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
1686        MI.getOperand(0).setReg(PhiReg);
1687        PhiToDelete.push_back(&MI);
1688      }
1689    }
1690    for (auto *P : PhiToDelete)
1691      P->eraseFromParent();
1692    InsertPt = DestBB->getFirstNonPHI();
1693    // Helper to clone Phi instructions into the destination block. We clone Phi
1694    // Helper to clone Phi instructions into the destination block. We clone Phis
1695    // greedily to avoid a combinatorial explosion of Phi instructions.
1696      MachineInstr *NewMI = MF.CloneMachineInstr(Phi);
1697      DestBB->insert(InsertPt, NewMI);
1698      Register OrigR = Phi->getOperand(0).getReg();
1699      Register R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
1700      NewMI->getOperand(0).setReg(R);
1701      NewMI->getOperand(1).setReg(OrigR);
1702      NewMI->getOperand(2).setMBB(*DestBB->pred_begin());
1703      Remaps[OrigR] = R;
1704      CanonicalMIs[NewMI] = CanonicalMIs[Phi];
1705      BlockMIs[{DestBB, CanonicalMIs[Phi]}] = NewMI;
1706      PhiNodeLoopIteration[NewMI] = PhiNodeLoopIteration[Phi];
1707      return R;
1708    };
1709    for (auto I = DestBB->getFirstNonPHI(); I != DestBB->end(); ++I) {
1710      for (MachineOperand &MO : I->uses()) {
1711        if (!MO.isReg())
1712          continue;
1713        if (Remaps.count(MO.getReg()))
1714          MO.setReg(Remaps[MO.getReg()]);
1715        else {
1716          // If we are using a phi from the source block we need to add a new phi
1717          // pointing to the old one.
1718          MachineInstr *Use = MRI.getUniqueVRegDef(MO.getReg());
1719          if (Use && Use->isPHI() && Use->getParent() == SourceBB) {
1720            Register R = clonePhi(Use);
1721            MO.setReg(R);
1722          }
1723        }
1724      }
1725    }
1726  }
1727  
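/// Trace through the canonical phi chain, following the loop-carried operand
/// once per recorded loop iteration of Phi, and return the register reached.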
1728  Register
1729  PeelingModuloScheduleExpander::getPhiCanonicalReg(MachineInstr *CanonicalPhi,
1730                                                    MachineInstr *Phi) {
1731    unsigned distance = PhiNodeLoopIteration[Phi];
1732    MachineInstr *CanonicalUse = CanonicalPhi;
1733    Register CanonicalUseReg = CanonicalUse->getOperand(0).getReg();
1734    for (unsigned I = 0; I < distance; ++I) {
1735      assert(CanonicalUse->isPHI());
1736      assert(CanonicalUse->getNumOperands() == 5);
1737      unsigned LoopRegIdx = 3, InitRegIdx = 1;
1738      if (CanonicalUse->getOperand(2).getMBB() == CanonicalUse->getParent())
1739        std::swap(LoopRegIdx, InitRegIdx);
1740      CanonicalUseReg = CanonicalUse->getOperand(LoopRegIdx).getReg();
1741      CanonicalUse = MRI.getVRegDef(CanonicalUseReg);
1742    }
1743    return CanonicalUseReg;
1744  }
1745  
1746  void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
1747    BitVector LS(Schedule.getNumStages(), true);
1748    BitVector AS(Schedule.getNumStages(), true);
1749    LiveStages[BB] = LS;
1750    AvailableStages[BB] = AS;
1751  
1752    // Peel out the prologs.
1753    LS.reset();
1754    for (int I = 0; I < Schedule.getNumStages() - 1; ++I) {
1755      LS[I] = true;
1756      Prologs.push_back(peelKernel(LPD_Front));
1757      LiveStages[Prologs.back()] = LS;
1758      AvailableStages[Prologs.back()] = LS;
1759    }
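  // Illustrative example: with 3 stages, Prologs[0] is live for stage 0 only
  // and Prologs[1] for stages 0-1, while the kernel block stays live for all
  // stages.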
1760  
1761    // Create a block that will end up as the new loop exiting block (dominated by
1762    // all prologs and epilogs). It will only contain PHIs, in the same order as
1763    // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property
1764    // that the exiting block is a (sub) clone of BB. This in turn gives us the
1765    // property that any value deffed in BB but used outside of BB is used by a
1766    // property that any value defined in BB but used outside of BB is used by a
1767    MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock();
1768    EliminateDeadPhis(ExitingBB, MRI, LIS, /*KeepSingleSrcPhi=*/true);
1769    // Push out the epilogs, again in reverse order.
1770    // We can't assume anything about the minimum loop trip count at this point,
1771    // so emit a fairly complex epilog.
1772  
1773    // We first peel number of stages minus one epilogue. Then we remove dead
1774    // stages and reorder instructions based on their stage. If we have 3 stages
1775    // we generate first:
1776    // E0[3, 2, 1]
1777    // E1[3', 2']
1778    // E2[3'']
1779    // And then we move instructions based on their stages to have:
1780    // E0[3]
1781    // E1[2, 3']
1782    // E2[1, 2', 3'']
1783    // The transformation is legal because we only move instructions past
1784    // instructions of a previous loop iteration.
1785    for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
1786      Epilogs.push_back(peelKernel(LPD_Back));
1787      MachineBasicBlock *B = Epilogs.back();
1788      filterInstructions(B, Schedule.getNumStages() - I);
1789      // Keep track of which iteration each phi belongs to. We need it to know
1790      // which version of the variable to use during prologue/epilogue stitching.
1791      EliminateDeadPhis(B, MRI, LIS, /*KeepSingleSrcPhi=*/true);
1792      for (MachineInstr &Phi : B->phis())
1793        PhiNodeLoopIteration[&Phi] = Schedule.getNumStages() - I;
1794    }
1795    for (size_t I = 0; I < Epilogs.size(); I++) {
1796      LS.reset();
1797      for (size_t J = I; J < Epilogs.size(); J++) {
1798        int Iteration = J;
1799        unsigned Stage = Schedule.getNumStages() - 1 + I - J;
1800        // Move stage one block at a time so that Phi nodes are updated correctly.
1801        for (size_t K = Iteration; K > I; K--)
1802          moveStageBetweenBlocks(Epilogs[K - 1], Epilogs[K], Stage);
1803        LS[Stage] = true;
1804      }
1805      LiveStages[Epilogs[I]] = LS;
1806      AvailableStages[Epilogs[I]] = AS;
1807    }
1808  
1809    // Now we've defined all the prolog and epilog blocks as a fallthrough
1810    // sequence, add the edges that will be followed if the loop trip count is
1811    // lower than the number of stages (connecting prologs directly with epilogs).
1812    auto PI = Prologs.begin();
1813    auto EI = Epilogs.begin();
1814    assert(Prologs.size() == Epilogs.size());
1815    for (; PI != Prologs.end(); ++PI, ++EI) {
1816      MachineBasicBlock *Pred = *(*EI)->pred_begin();
1817      (*PI)->addSuccessor(*EI);
1818      for (MachineInstr &MI : (*EI)->phis()) {
1819        Register Reg = MI.getOperand(1).getReg();
1820        MachineInstr *Use = MRI.getUniqueVRegDef(Reg);
1821        if (Use && Use->getParent() == Pred) {
1822          MachineInstr *CanonicalUse = CanonicalMIs[Use];
1823          if (CanonicalUse->isPHI()) {
1824          // If the use comes from a phi we need to skip as many phis as the
1825            // distance between the epilogue and the kernel. Trace through the phi
1826            // chain to find the right value.
1827            Reg = getPhiCanonicalReg(CanonicalUse, Use);
1828          }
1829          Reg = getEquivalentRegisterIn(Reg, *PI);
1830        }
1831        MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false));
1832        MI.addOperand(MachineOperand::CreateMBB(*PI));
1833      }
1834    }
1835  
1836    // Create a list of all blocks in order.
1837    SmallVector<MachineBasicBlock *, 8> Blocks;
1838    llvm::copy(PeeledFront, std::back_inserter(Blocks));
1839    Blocks.push_back(BB);
1840    llvm::copy(PeeledBack, std::back_inserter(Blocks));
1841  
1842    // Iterate in reverse order over all instructions, remapping as we go.
1843    for (MachineBasicBlock *B : reverse(Blocks)) {
1844      for (auto I = B->instr_rbegin();
1845           I != std::next(B->getFirstNonPHI()->getReverseIterator());) {
1846        MachineBasicBlock::reverse_instr_iterator MI = I++;
1847        rewriteUsesOf(&*MI);
1848      }
1849    }
1850    for (auto *MI : IllegalPhisToDelete) {
1851      if (LIS)
1852        LIS->RemoveMachineInstrFromMaps(*MI);
1853      MI->eraseFromParent();
1854    }
1855    IllegalPhisToDelete.clear();
1856  
1857    // Now all remapping has been done, we're free to optimize the generated code.
1858    for (MachineBasicBlock *B : reverse(Blocks))
1859      EliminateDeadPhis(B, MRI, LIS);
1860    EliminateDeadPhis(ExitingBB, MRI, LIS);
1861  }
1862  
1863  MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() {
1864    MachineFunction &MF = *BB->getParent();
1865    MachineBasicBlock *Exit = *BB->succ_begin();
1866    if (Exit == BB)
1867      Exit = *std::next(BB->succ_begin());
1868  
1869    MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
1870    MF.insert(std::next(BB->getIterator()), NewBB);
1871  
1872    // Clone all phis in BB into NewBB and rewrite.
1873    for (MachineInstr &MI : BB->phis()) {
1874      auto RC = MRI.getRegClass(MI.getOperand(0).getReg());
1875      Register OldR = MI.getOperand(3).getReg();
1876      Register R = MRI.createVirtualRegister(RC);
1877      SmallVector<MachineInstr *, 4> Uses;
1878      for (MachineInstr &Use : MRI.use_instructions(OldR))
1879        if (Use.getParent() != BB)
1880          Uses.push_back(&Use);
1881      for (MachineInstr *Use : Uses)
1882        Use->substituteRegister(OldR, R, /*SubIdx=*/0,
1883                                *MRI.getTargetRegisterInfo());
1884      MachineInstr *NI = BuildMI(NewBB, DebugLoc(), TII->get(TargetOpcode::PHI), R)
1885          .addReg(OldR)
1886          .addMBB(BB);
1887      BlockMIs[{NewBB, &MI}] = NI;
1888      CanonicalMIs[NI] = &MI;
1889    }
1890    BB->replaceSuccessor(Exit, NewBB);
1891    Exit->replacePhiUsesWith(BB, NewBB);
1892    NewBB->addSuccessor(Exit);
1893  
1894    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
1895    SmallVector<MachineOperand, 4> Cond;
1896    bool CanAnalyzeBr = !TII->analyzeBranch(*BB, TBB, FBB, Cond);
1897    (void)CanAnalyzeBr;
1898    assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
1899    TII->removeBranch(*BB);
1900    TII->insertBranch(*BB, TBB == Exit ? NewBB : TBB, FBB == Exit ? NewBB : FBB,
1901                      Cond, DebugLoc());
1902    TII->insertUnconditionalBranch(*NewBB, Exit, DebugLoc());
1903    return NewBB;
1904  }
1905  
1906  Register
1907  PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg,
1908                                                         MachineBasicBlock *BB) {
1909    MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
1910    unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg, /*TRI=*/nullptr);
1911    return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(OpIdx).getReg();
1912  }
1913  
1914  void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
1915    if (MI->isPHI()) {
1916      // This is an illegal PHI. The loop-carried (desired) value is operand 3,
1917      // and it is produced by this block.
1918      Register PhiR = MI->getOperand(0).getReg();
1919      Register R = MI->getOperand(3).getReg();
1920      int RMIStage = getStage(MRI.getUniqueVRegDef(R));
1921      if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(RMIStage))
1922        R = MI->getOperand(1).getReg();
1923      MRI.setRegClass(R, MRI.getRegClass(PhiR));
1924      MRI.replaceRegWith(PhiR, R);
1925      // Postpone deleting the Phi as it may be referenced by BlockMIs and used
1926      // later to figure out how to remap registers.
1927      MI->getOperand(0).setReg(PhiR);
1928      IllegalPhisToDelete.push_back(MI);
1929      return;
1930    }
1931  
1932    int Stage = getStage(MI);
1933    if (Stage == -1 || LiveStages.count(MI->getParent()) == 0 ||
1934        LiveStages[MI->getParent()].test(Stage))
1935      // Instruction is live, no rewriting to do.
1936      return;
1937  
1938    for (MachineOperand &DefMO : MI->defs()) {
1939      SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
1940      for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
1941        // Only PHIs can use values from this block by construction.
1942        // Match with the equivalent PHI in B.
1943        assert(UseMI.isPHI());
1944        Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
1945                                               MI->getParent());
1946        Subs.emplace_back(&UseMI, Reg);
1947      }
1948      for (auto &Sub : Subs)
1949        Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
1950                                      *MRI.getTargetRegisterInfo());
1951    }
1952    if (LIS)
1953      LIS->RemoveMachineInstrFromMaps(*MI);
1954    MI->eraseFromParent();
1955  }
1956  
1957  void PeelingModuloScheduleExpander::fixupBranches() {
1958    // Work outwards from the kernel.
1959    bool KernelDisposed = false;
1960    int TC = Schedule.getNumStages() - 1;
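  // Illustrative example: with 3 stages the innermost prolog (visited first)
  // guards trip count > 2, enough iterations to reach the kernel, while the
  // outermost prolog guards trip count > 1.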
1961    for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend();
1962         ++PI, ++EI, --TC) {
1963      MachineBasicBlock *Prolog = *PI;
1964      MachineBasicBlock *Fallthrough = *Prolog->succ_begin();
1965      MachineBasicBlock *Epilog = *EI;
1966      SmallVector<MachineOperand, 4> Cond;
1967      TII->removeBranch(*Prolog);
1968      std::optional<bool> StaticallyGreater =
1969          LoopInfo->createTripCountGreaterCondition(TC, *Prolog, Cond);
1970      if (!StaticallyGreater) {
1971        LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
1972        // Dynamically branch based on Cond.
1973        TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc());
1974      } else if (*StaticallyGreater == false) {
1975        LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n");
1976        // Prolog never falls through; branch to epilog and orphan interior
1977        // blocks. Leave it to unreachable-block-elim to clean up.
1978        Prolog->removeSuccessor(Fallthrough);
1979        for (MachineInstr &P : Fallthrough->phis()) {
1980          P.removeOperand(2);
1981          P.removeOperand(1);
1982        }
1983        TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc());
1984        KernelDisposed = true;
1985      } else {
1986        LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n");
1987        // Prolog always falls through; remove incoming values in epilog.
1988        Prolog->removeSuccessor(Epilog);
1989        for (MachineInstr &P : Epilog->phis()) {
1990          P.removeOperand(4);
1991          P.removeOperand(3);
1992        }
1993      }
1994    }
1995  
1996    if (!KernelDisposed) {
1997      LoopInfo->adjustTripCount(-(Schedule.getNumStages() - 1));
1998      LoopInfo->setPreheader(Prologs.back());
1999    } else {
2000      LoopInfo->disposed();
2001    }
2002  }
2003  
2004  void PeelingModuloScheduleExpander::rewriteKernel() {
2005    KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
2006    KR.rewrite();
2007  }
2008  
2009  void PeelingModuloScheduleExpander::expand() {
2010    BB = Schedule.getLoop()->getTopBlock();
2011    Preheader = Schedule.getLoop()->getLoopPreheader();
2012    LLVM_DEBUG(Schedule.dump());
2013    LoopInfo = TII->analyzeLoopForPipelining(BB);
2014    assert(LoopInfo);
2015  
2016    rewriteKernel();
2017    peelPrologAndEpilogs();
2018    fixupBranches();
2019  }
2020  
2021  void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
2022    BB = Schedule.getLoop()->getTopBlock();
2023    Preheader = Schedule.getLoop()->getLoopPreheader();
2024  
2025    // Dump the schedule before we invalidate and remap all its instructions.
2026    // Stash it in a string so we can print it if we found an error.
2027    std::string ScheduleDump;
2028    raw_string_ostream OS(ScheduleDump);
2029    Schedule.print(OS);
2030    OS.flush();
2031  
2032    // First, run the normal ModuloScheduleExpander. We don't support any
2033    // InstrChanges.
2034    assert(LIS && "Requires LiveIntervals!");
2035    ModuloScheduleExpander MSE(MF, Schedule, *LIS,
2036                               ModuloScheduleExpander::InstrChangesTy());
2037    MSE.expand();
2038    MachineBasicBlock *ExpandedKernel = MSE.getRewrittenKernel();
2039    if (!ExpandedKernel) {
2040      // The expander optimized away the kernel. We can't do any useful checking.
2041      MSE.cleanup();
2042      return;
2043    }
2044    // Before running the KernelRewriter, re-add BB into the CFG.
2045    Preheader->addSuccessor(BB);
2046  
2047    // Now run the new expansion algorithm.
2048    KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
2049    KR.rewrite();
2050    peelPrologAndEpilogs();
2051  
2052    // Collect all illegal phis that the new algorithm created. We'll give these
2053    // to KernelOperandInfo.
2054    SmallPtrSet<MachineInstr *, 4> IllegalPhis;
2055    for (auto NI = BB->getFirstNonPHI(); NI != BB->end(); ++NI) {
2056      if (NI->isPHI())
2057        IllegalPhis.insert(&*NI);
2058    }
2059  
2060    // Co-iterate across both kernels. We expect them to be identical apart from
2061    // phis and full COPYs (we look through both).
2062    SmallVector<std::pair<KernelOperandInfo, KernelOperandInfo>, 8> KOIs;
2063    auto OI = ExpandedKernel->begin();
2064    auto NI = BB->begin();
2065    for (; !OI->isTerminator() && !NI->isTerminator(); ++OI, ++NI) {
2066      while (OI->isPHI() || OI->isFullCopy())
2067        ++OI;
2068      while (NI->isPHI() || NI->isFullCopy())
2069        ++NI;
2070      assert(OI->getOpcode() == NI->getOpcode() && "Opcodes don't match?!");
2071      // Analyze every operand separately.
2072      for (auto OOpI = OI->operands_begin(), NOpI = NI->operands_begin();
2073           OOpI != OI->operands_end(); ++OOpI, ++NOpI)
2074        KOIs.emplace_back(KernelOperandInfo(&*OOpI, MRI, IllegalPhis),
2075                          KernelOperandInfo(&*NOpI, MRI, IllegalPhis));
2076    }
2077  
2078    bool Failed = false;
2079    for (auto &OldAndNew : KOIs) {
2080      if (OldAndNew.first == OldAndNew.second)
2081        continue;
2082      Failed = true;
2083      errs() << "Modulo kernel validation error: [\n";
2084      errs() << " [golden] ";
2085      OldAndNew.first.print(errs());
2086      errs() << "          ";
2087      OldAndNew.second.print(errs());
2088      errs() << "]\n";
2089    }
2090  
2091    if (Failed) {
2092      errs() << "Golden reference kernel:\n";
2093      ExpandedKernel->print(errs());
2094      errs() << "New kernel:\n";
2095      BB->print(errs());
2096      errs() << ScheduleDump;
2097      report_fatal_error(
2098          "Modulo kernel validation (-pipeliner-experimental-cg) failed");
2099    }
2100  
2101    // Cleanup by removing BB from the CFG again as the original
2102    // ModuloScheduleExpander intended.
2103    Preheader->removeSuccessor(BB);
2104    MSE.cleanup();
2105  }
2106  
2107  MachineInstr *ModuloScheduleExpanderMVE::cloneInstr(MachineInstr *OldMI) {
2108    MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
2109  
2110    // TODO: Offset information needs to be corrected.
2111    NewMI->dropMemRefs(MF);
2112  
2113    return NewMI;
2114  }
2115  
2116  /// Create a dedicated exit for Loop. Exit is the original exit for Loop.
2117  /// If it is already a dedicated exit, return it. Otherwise, insert a new
2118  /// block between them and return the new block.
2119  static MachineBasicBlock *createDedicatedExit(MachineBasicBlock *Loop,
2120                                                MachineBasicBlock *Exit) {
2121    if (Exit->pred_size() == 1)
2122      return Exit;
2123  
2124    MachineFunction *MF = Loop->getParent();
2125    const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
2126  
2127    MachineBasicBlock *NewExit =
2128        MF->CreateMachineBasicBlock(Loop->getBasicBlock());
2129    MF->insert(Loop->getIterator(), NewExit);
2130  
2131    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2132    SmallVector<MachineOperand, 4> Cond;
2133    TII->analyzeBranch(*Loop, TBB, FBB, Cond);
2134    if (TBB == Loop)
2135      FBB = NewExit;
2136    else if (FBB == Loop)
2137      TBB = NewExit;
2138    else
2139      llvm_unreachable("unexpected loop structure");
2140    TII->removeBranch(*Loop);
2141    TII->insertBranch(*Loop, TBB, FBB, Cond, DebugLoc());
2142    Loop->replaceSuccessor(Exit, NewExit);
2143    TII->insertUnconditionalBranch(*NewExit, Exit, DebugLoc());
2144    NewExit->addSuccessor(Exit);
2145  
2146    Exit->replacePhiUsesWith(Loop, NewExit);
2147  
2148    return NewExit;
2149  }
2150  
2151  /// Insert branch code into the end of MBB. It branches to GreaterThan if the
2152  /// remaining trip count for instructions in LastStage0Insts is greater than
2153  /// RequiredTC, and to Otherwise if not.
2154  void ModuloScheduleExpanderMVE::insertCondBranch(MachineBasicBlock &MBB,
2155                                                   int RequiredTC,
2156                                                   InstrMapTy &LastStage0Insts,
2157                                                   MachineBasicBlock &GreaterThan,
2158                                                   MachineBasicBlock &Otherwise) {
2159    SmallVector<MachineOperand, 4> Cond;
2160    LoopInfo->createRemainingIterationsGreaterCondition(RequiredTC, MBB, Cond,
2161                                                        LastStage0Insts);
2162  
2163    if (SwapBranchTargetsMVE) {
2164      // Set SwapBranchTargetsMVE to true if a target prefers to swap TBB and
2165      // FBB for optimal performance.
2166      if (TII->reverseBranchCondition(Cond))
2167        llvm_unreachable("can not reverse branch condition");
2168      TII->insertBranch(MBB, &Otherwise, &GreaterThan, Cond, DebugLoc());
2169    } else {
2170      TII->insertBranch(MBB, &GreaterThan, &Otherwise, Cond, DebugLoc());
2171    }
2172  }
2173  
2174  /// Generate a pipelined loop that is unrolled by using MVE algorithm and any
2175  /// other necessary blocks. The control flow is modified to execute the
2176  /// pipelined loop if the trip count satisfies the condition, otherwise the
2177  /// original loop. The original loop is also used to execute the remainder
2178  /// iterations which occur due to unrolling.
2179  void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
2180    // The control flow for pipelining with MVE:
2181    //
2182    // OrigPreheader:
2183    //   // The block that is originally the loop preheader
2184    //   goto Check
2185    //
2186    // Check:
2187    //   // Check whether the trip count satisfies the requirements to pipeline.
2188    //   if (LoopCounter > NumStages + NumUnroll - 2)
2189    //     // The minimum number of iterations to pipeline =
2190    //     //   iterations executed in prolog/epilog (NumStages-1) +
2191    //     //   iterations executed in one kernel run (NumUnroll)
2192    //     goto Prolog
2193    //   // fallback to the original loop
2194    //   goto NewPreheader
2195    //
2196    // Prolog:
2197    //   // All prolog stages. There are no direct branches to the epilogue.
2198    //   goto NewKernel
2199    //
2200    // NewKernel:
2201    //   // NumUnroll copies of the kernel
2202    //   if (LoopCounter > MVE-1)
2203    //     goto NewKernel
2204    //   goto Epilog
2205    //
2206    // Epilog:
2207    //   // All epilog stages.
2208    //   if (LoopCounter > 0)
2209    //     // The remainder is executed in the original loop
2210    //     goto NewPreheader
2211    //   goto NewExit
2212    //
2213    // NewPreheader:
2214    //   // Newly created preheader for the original loop.
2215    //   // The initial values of the phis in the loop are merged from two paths.
2216    //   NewInitVal = Phi OrigInitVal, Check, PipelineLastVal, Epilog
2217    //   goto OrigKernel
2218    //
2219    // OrigKernel:
2220    //   // The original loop block.
2221    //   if (LoopCounter != 0)
2222    //     goto OrigKernel
2223    //   goto NewExit
2224    //
2225    // NewExit:
2226    //   // Newly created dedicated exit for the original loop.
2227    //   // Merge values which are referenced after the loop
2228    //   Merged = Phi OrigVal, OrigKernel, PipelineVal, Epilog
2229    //   goto OrigExit
2230    //
2231    // OrigExit:
2232    //   // The block that is originally the loop exit.
2233    //   // If it is already a dedicated exit, NewExit is not created.
2234  
2235    // An example of where each stage is executed:
2236    // Assume #Stages 3, #MVE 4, #Iterations 12
2237    // Iter   0 1 2 3 4 5 6 7 8 9 10-11
2238    // -------------------------------------------------
2239    // Stage  0                          Prolog#0
2240    // Stage  1 0                        Prolog#1
2241    // Stage  2 1 0                      Kernel Unroll#0 Iter#0
2242    // Stage    2 1 0                    Kernel Unroll#1 Iter#0
2243    // Stage      2 1 0                  Kernel Unroll#2 Iter#0
2244    // Stage        2 1 0                Kernel Unroll#3 Iter#0
2245    // Stage          2 1 0              Kernel Unroll#0 Iter#1
2246    // Stage            2 1 0            Kernel Unroll#1 Iter#1
2247    // Stage              2 1 0          Kernel Unroll#2 Iter#1
2248    // Stage                2 1 0        Kernel Unroll#3 Iter#1
2249    // Stage                  2 1        Epilog#0
2250    // Stage                    2        Epilog#1
2251    // Stage                      0-2    OrigKernel
2252  
2253    LoopInfo = TII->analyzeLoopForPipelining(OrigKernel);
2254    assert(LoopInfo && "Must be able to analyze loop!");
2255  
2256    calcNumUnroll();
2257  
2258    Check = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2259    Prolog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2260    NewKernel = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2261    Epilog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2262    NewPreheader = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2263  
2264    MF.insert(OrigKernel->getIterator(), Check);
2265    MF.insert(OrigKernel->getIterator(), Prolog);
2266    MF.insert(OrigKernel->getIterator(), NewKernel);
2267    MF.insert(OrigKernel->getIterator(), Epilog);
2268    MF.insert(OrigKernel->getIterator(), NewPreheader);
2269  
2270    NewExit = createDedicatedExit(OrigKernel, OrigExit);
2271  
2272    NewPreheader->transferSuccessorsAndUpdatePHIs(OrigPreheader);
2273    TII->insertUnconditionalBranch(*NewPreheader, OrigKernel, DebugLoc());
2274  
2275    OrigPreheader->addSuccessor(Check);
2276    TII->removeBranch(*OrigPreheader);
2277    TII->insertUnconditionalBranch(*OrigPreheader, Check, DebugLoc());
2278  
2279    Check->addSuccessor(Prolog);
2280    Check->addSuccessor(NewPreheader);
2281  
2282    Prolog->addSuccessor(NewKernel);
2283  
2284    NewKernel->addSuccessor(NewKernel);
2285    NewKernel->addSuccessor(Epilog);
2286  
2287    Epilog->addSuccessor(NewPreheader);
2288    Epilog->addSuccessor(NewExit);
2289  
2290    InstrMapTy LastStage0Insts;
2291    insertCondBranch(*Check, Schedule.getNumStages() + NumUnroll - 2,
2292                     LastStage0Insts, *Prolog, *NewPreheader);
2293  
2294    // VRMaps map (prolog/kernel/epilog phase#, original register#) to new
2295    // register#
2296    SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap;
2297    generateProlog(PrologVRMap);
2298    generateKernel(PrologVRMap, KernelVRMap, LastStage0Insts);
2299    generateEpilog(KernelVRMap, EpilogVRMap, LastStage0Insts);
2300  }
2301  
2302  /// Replace MI's use operands according to the maps.
2303  void ModuloScheduleExpanderMVE::updateInstrUse(
2304      MachineInstr *MI, int StageNum, int PhaseNum,
2305      SmallVectorImpl<ValueMapTy> &CurVRMap,
2306      SmallVectorImpl<ValueMapTy> *PrevVRMap) {
2307    // If MI is in the prolog/kernel/epilog block, CurVRMap is
2308    // PrologVRMap/KernelVRMap/EpilogVRMap respectively.
2309    // PrevVRMap is nullptr/PhiVRMap/KernelVRMap respectively.
2310    // Refer to the appropriate map according to the stage difference between
2311    // MI and the definition of an operand.
2312  
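  // Illustrative example: a kernel instruction at unroll phase 2 and stage 1
  // using a stage-0 value (DiffStage 1) reads it from CurVRMap[1]; when
  // DiffStage exceeds the phase number, the value comes from *PrevVRMap
  // instead.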
2313    for (MachineOperand &UseMO : MI->uses()) {
2314      if (!UseMO.isReg() || !UseMO.getReg().isVirtual())
2315        continue;
2316      int DiffStage = 0;
2317      Register OrigReg = UseMO.getReg();
2318      MachineInstr *DefInst = MRI.getVRegDef(OrigReg);
2319      if (!DefInst || DefInst->getParent() != OrigKernel)
2320        continue;
2321      unsigned InitReg = 0;
2322      unsigned DefReg = OrigReg;
2323      if (DefInst->isPHI()) {
2324        ++DiffStage;
2325        unsigned LoopReg;
2326        getPhiRegs(*DefInst, OrigKernel, InitReg, LoopReg);
2327        // LoopReg is guaranteed to be defined within the loop by canApply()
2328        DefReg = LoopReg;
2329        DefInst = MRI.getVRegDef(LoopReg);
2330      }
2331      unsigned DefStageNum = Schedule.getStage(DefInst);
2332      DiffStage += StageNum - DefStageNum;
2333      Register NewReg;
2334      if (PhaseNum >= DiffStage && CurVRMap[PhaseNum - DiffStage].count(DefReg))
2335        // NewReg is defined in a previous phase of the same block
2336        NewReg = CurVRMap[PhaseNum - DiffStage][DefReg];
2337      else if (!PrevVRMap)
2338        // Since this is the first iteration, refer to the initial register of
2339        // the loop.
2340        NewReg = InitReg;
2341      else
2342        // Cases where DiffStage is larger than PhaseNum.
2343        // If MI is in the kernel block, the value is defined by the previous
2344        // iteration and PhiVRMap is referenced. If MI is in the epilog block, the
2345        // value is defined in the kernel block and KernelVRMap is referenced.
2346        NewReg = (*PrevVRMap)[PrevVRMap->size() - (DiffStage - PhaseNum)][DefReg];
2347  
2348      const TargetRegisterClass *NRC =
2349          MRI.constrainRegClass(NewReg, MRI.getRegClass(OrigReg));
2350      if (NRC)
2351        UseMO.setReg(NewReg);
2352      else {
2353        Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
2354        BuildMI(*OrigKernel, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY),
2355                SplitReg)
2356            .addReg(NewReg);
2357        UseMO.setReg(SplitReg);
2358      }
2359    }
2360  }
2361  
2362  /// Return the phi in Loop, if any, whose loop-carried value is Reg.
2363  /// canApply() guarantees that at most one such phi exists.
2364  static MachineInstr *getLoopPhiUser(Register Reg, MachineBasicBlock *Loop) {
2365    for (MachineInstr &Phi : Loop->phis()) {
2366      unsigned InitVal, LoopVal;
2367      getPhiRegs(Phi, Loop, InitVal, LoopVal);
2368      if (LoopVal == Reg)
2369        return &Phi;
2370    }
2371    return nullptr;
2372  }
2373  
2374  /// Generate phis for registers defined by OrigMI.
2375  void ModuloScheduleExpanderMVE::generatePhi(
2376      MachineInstr *OrigMI, int UnrollNum,
2377      SmallVectorImpl<ValueMapTy> &PrologVRMap,
2378      SmallVectorImpl<ValueMapTy> &KernelVRMap,
2379      SmallVectorImpl<ValueMapTy> &PhiVRMap) {
2380    int StageNum = Schedule.getStage(OrigMI);
2381    bool UsePrologReg;
2382    if (Schedule.getNumStages() - NumUnroll + UnrollNum - 1 >= StageNum)
2383      UsePrologReg = true;
2384    else if (Schedule.getNumStages() - NumUnroll + UnrollNum == StageNum)
2385      UsePrologReg = false;
2386    else
2387      return;
2388  
2389    // Examples that show which stages are merged by phi.
2390    // Meaning of the symbol following the stage number:
2391    //   a/b: Stages with the same letter are merged (UsePrologReg == true)
2392    //   +: Merged with the initial value (UsePrologReg == false)
2393    //   *: No phis required
2394    //
2395    // #Stages 3, #MVE 4
2396    // Iter   0 1 2 3 4 5 6 7 8
2397    // -----------------------------------------
2398    // Stage  0a                 Prolog#0
2399    // Stage  1a 0b              Prolog#1
2400    // Stage  2* 1* 0*           Kernel Unroll#0
2401    // Stage     2* 1* 0+        Kernel Unroll#1
2402    // Stage        2* 1+ 0a     Kernel Unroll#2
2403    // Stage           2+ 1a 0b  Kernel Unroll#3
2404    //
2405    // #Stages 3, #MVE 2
2406    // Iter   0 1 2 3 4 5 6 7 8
2407    // -----------------------------------------
2408    // Stage  0a                 Prolog#0
2409    // Stage  1a 0b              Prolog#1
2410    // Stage  2* 1+ 0a           Kernel Unroll#0
2411    // Stage     2+ 1a 0b        Kernel Unroll#1
2412    //
2413    // #Stages 3, #MVE 1
2414    // Iter   0 1 2 3 4 5 6 7 8
2415    // -----------------------------------------
2416    // Stage  0*                 Prolog#0
2417    // Stage  1a 0b              Prolog#1
2418    // Stage  2+ 1a 0b           Kernel Unroll#0
2419  
2420    for (MachineOperand &DefMO : OrigMI->defs()) {
2421      if (!DefMO.isReg() || DefMO.isDead())
2422        continue;
2423      Register OrigReg = DefMO.getReg();
2424      auto NewReg = KernelVRMap[UnrollNum].find(OrigReg);
2425      if (NewReg == KernelVRMap[UnrollNum].end())
2426        continue;
2427      Register CorrespondReg;
2428      if (UsePrologReg) {
2429        int PrologNum = Schedule.getNumStages() - NumUnroll + UnrollNum - 1;
2430        CorrespondReg = PrologVRMap[PrologNum][OrigReg];
2431      } else {
2432        MachineInstr *Phi = getLoopPhiUser(OrigReg, OrigKernel);
2433        if (!Phi)
2434          continue;
2435        CorrespondReg = getInitPhiReg(*Phi, OrigKernel);
2436      }
2437  
2438      assert(CorrespondReg.isValid());
2439      Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
2440      BuildMI(*NewKernel, NewKernel->getFirstNonPHI(), DebugLoc(),
2441              TII->get(TargetOpcode::PHI), PhiReg)
2442          .addReg(NewReg->second)
2443          .addMBB(NewKernel)
2444          .addReg(CorrespondReg)
2445          .addMBB(Prolog);
2446      PhiVRMap[UnrollNum][OrigReg] = PhiReg;
2447    }
2448  }
2449  
2450  static void replacePhiSrc(MachineInstr &Phi, Register OrigReg, Register NewReg,
2451                            MachineBasicBlock *NewMBB) {
2452    for (unsigned Idx = 1; Idx < Phi.getNumOperands(); Idx += 2) {
2453      if (Phi.getOperand(Idx).getReg() == OrigReg) {
2454        Phi.getOperand(Idx).setReg(NewReg);
2455        Phi.getOperand(Idx + 1).setMBB(NewMBB);
2456        return;
2457      }
2458    }
2459  }
2460  
2461  /// Generate phis that merge values from multiple routes
2462  void ModuloScheduleExpanderMVE::mergeRegUsesAfterPipeline(Register OrigReg,
2463                                                            Register NewReg) {
2464    SmallVector<MachineOperand *> UsesAfterLoop;
2465    SmallVector<MachineInstr *> LoopPhis;
2466    for (MachineRegisterInfo::use_iterator I = MRI.use_begin(OrigReg),
2467                                           E = MRI.use_end();
2468         I != E; ++I) {
2469      MachineOperand &O = *I;
2470      if (O.getParent()->getParent() != OrigKernel &&
2471          O.getParent()->getParent() != Prolog &&
2472          O.getParent()->getParent() != NewKernel &&
2473          O.getParent()->getParent() != Epilog)
2474        UsesAfterLoop.push_back(&O);
2475      if (O.getParent()->getParent() == OrigKernel && O.getParent()->isPHI())
2476        LoopPhis.push_back(O.getParent());
2477    }
2478  
2479    // Merge the route that only execute the pipelined loop (when there are no
2480    // Merge the route that only executes the pipelined loop (when there are no
2481    // remaining iterations) with the route that executes the original loop.
2482      Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
2483      BuildMI(*NewExit, NewExit->getFirstNonPHI(), DebugLoc(),
2484              TII->get(TargetOpcode::PHI), PhiReg)
2485          .addReg(OrigReg)
2486          .addMBB(OrigKernel)
2487          .addReg(NewReg)
2488          .addMBB(Epilog);
2489  
2490      for (MachineOperand *MO : UsesAfterLoop)
2491        MO->setReg(PhiReg);
2492  
2493      if (!LIS.hasInterval(PhiReg))
2494        LIS.createEmptyInterval(PhiReg);
2495    }
2496  
2497    // Merge routes from the pipelined loop and the bypassed route before the
2498    // original loop
2499    if (!LoopPhis.empty()) {
2500      for (MachineInstr *Phi : LoopPhis) {
2501        unsigned InitReg, LoopReg;
2502        getPhiRegs(*Phi, OrigKernel, InitReg, LoopReg);
2503        Register NewInit = MRI.createVirtualRegister(MRI.getRegClass(InitReg));
2504        BuildMI(*NewPreheader, NewPreheader->getFirstNonPHI(), Phi->getDebugLoc(),
2505                TII->get(TargetOpcode::PHI), NewInit)
2506            .addReg(InitReg)
2507            .addMBB(Check)
2508            .addReg(NewReg)
2509            .addMBB(Epilog);
2510        replacePhiSrc(*Phi, InitReg, NewInit, NewPreheader);
2511      }
2512    }
2513  }
2514  
2515  void ModuloScheduleExpanderMVE::generateProlog(
2516      SmallVectorImpl<ValueMapTy> &PrologVRMap) {
2517    PrologVRMap.clear();
2518    PrologVRMap.resize(Schedule.getNumStages() - 1);
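  // Illustrative example: with 3 stages, prolog phase 0 clones only stage-0
  // instructions and prolog phase 1 clones stage-0 and stage-1 instructions.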
2519    DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2520    for (int PrologNum = 0; PrologNum < Schedule.getNumStages() - 1;
2521         ++PrologNum) {
2522      for (MachineInstr *MI : Schedule.getInstructions()) {
2523        if (MI->isPHI())
2524          continue;
2525        int StageNum = Schedule.getStage(MI);
2526        if (StageNum > PrologNum)
2527          continue;
2528        MachineInstr *NewMI = cloneInstr(MI);
2529        updateInstrDef(NewMI, PrologVRMap[PrologNum], false);
2530        NewMIMap[NewMI] = {PrologNum, StageNum};
2531        Prolog->push_back(NewMI);
2532      }
2533    }
2534  
2535    for (auto I : NewMIMap) {
2536      MachineInstr *MI = I.first;
2537      int PrologNum = I.second.first;
2538      int StageNum = I.second.second;
2539      updateInstrUse(MI, StageNum, PrologNum, PrologVRMap, nullptr);
2540    }
2541  
2542    LLVM_DEBUG({
2543      dbgs() << "prolog:\n";
2544      Prolog->dump();
2545    });
2546  }
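
// For example, with a three-stage schedule the prolog receives
// Schedule.getNumStages() - 1 == 2 groups of copies: group 0 contains only the
// stage-0 instructions (the first iteration), and group 1 contains the stage-0
// and stage-1 instructions (the second and first iterations, respectively), so
// that every stage is in flight by the time control reaches the kernel.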
2547  
2548  void ModuloScheduleExpanderMVE::generateKernel(
2549      SmallVectorImpl<ValueMapTy> &PrologVRMap,
2550      SmallVectorImpl<ValueMapTy> &KernelVRMap, InstrMapTy &LastStage0Insts) {
2551    KernelVRMap.clear();
2552    KernelVRMap.resize(NumUnroll);
2553    SmallVector<ValueMapTy> PhiVRMap;
2554    PhiVRMap.resize(NumUnroll);
2555    DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2556    DenseMap<MachineInstr *, MachineInstr *> MIMapLastStage0;
2557    for (int UnrollNum = 0; UnrollNum < NumUnroll; ++UnrollNum) {
2558      for (MachineInstr *MI : Schedule.getInstructions()) {
2559        if (MI->isPHI())
2560          continue;
2561        int StageNum = Schedule.getStage(MI);
2562        MachineInstr *NewMI = cloneInstr(MI);
2563        if (UnrollNum == NumUnroll - 1)
2564          LastStage0Insts[MI] = NewMI;
2565        updateInstrDef(NewMI, KernelVRMap[UnrollNum],
2566                       (UnrollNum == NumUnroll - 1 && StageNum == 0));
2567        generatePhi(MI, UnrollNum, PrologVRMap, KernelVRMap, PhiVRMap);
2568        NewMIMap[NewMI] = {UnrollNum, StageNum};
2569        NewKernel->push_back(NewMI);
2570      }
2571    }
2572  
2573    for (auto I : NewMIMap) {
2574      MachineInstr *MI = I.first;
2575      int UnrollNum = I.second.first;
2576      int StageNum = I.second.second;
2577      updateInstrUse(MI, StageNum, UnrollNum, KernelVRMap, &PhiVRMap);
2578    }
2579  
2580    // The loop continues while the remaining trip count exceeds NumUnroll-1.
2581    insertCondBranch(*NewKernel, NumUnroll - 1, LastStage0Insts, *NewKernel,
2582                     *Epilog);
2583  
2584    LLVM_DEBUG({
2585      dbgs() << "kernel:\n";
2586      NewKernel->dump();
2587    });
2588  }
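
// Sketch of the kernel for NumUnroll == 2: the scheduled body is emitted twice
// into NewKernel, with the defs of copy 0 renamed through KernelVRMap[0] and
// those of copy 1 through KernelVRMap[1] (modulo variable expansion),
// cross-copy uses patched afterwards by updateInstrUse/generatePhi, and a
// conditional branch at the end that repeats NewKernel while more than
// NumUnroll - 1 iterations remain and otherwise falls through to Epilog.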
2589  
2590  void ModuloScheduleExpanderMVE::generateEpilog(
2591      SmallVectorImpl<ValueMapTy> &KernelVRMap,
2592      SmallVectorImpl<ValueMapTy> &EpilogVRMap, InstrMapTy &LastStage0Insts) {
2593    EpilogVRMap.clear();
2594    EpilogVRMap.resize(Schedule.getNumStages() - 1);
2595    DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2596    for (int EpilogNum = 0; EpilogNum < Schedule.getNumStages() - 1;
2597         ++EpilogNum) {
2598      for (MachineInstr *MI : Schedule.getInstructions()) {
2599        if (MI->isPHI())
2600          continue;
2601        int StageNum = Schedule.getStage(MI);
2602        if (StageNum <= EpilogNum)
2603          continue;
2604        MachineInstr *NewMI = cloneInstr(MI);
2605        updateInstrDef(NewMI, EpilogVRMap[EpilogNum], StageNum - 1 == EpilogNum);
2606        NewMIMap[NewMI] = {EpilogNum, StageNum};
2607        Epilog->push_back(NewMI);
2608      }
2609    }
2610  
2611    for (auto I : NewMIMap) {
2612      MachineInstr *MI = I.first;
2613      int EpilogNum = I.second.first;
2614      int StageNum = I.second.second;
2615      updateInstrUse(MI, StageNum, EpilogNum, EpilogVRMap, &KernelVRMap);
2616    }
2617  
2618    // If there are remaining iterations, they are executed in the original loop.
2619    // Instructions related to loop control, such as the loop counter comparison,
2620    // are indicated by shouldIgnoreForPipelining() and are assumed to be placed
2621    // in stage 0; LastStage0Insts thus maps them to the last unrolled kernel copy.
2622    insertCondBranch(*Epilog, 0, LastStage0Insts, *NewPreheader, *NewExit);
2623  
2624    LLVM_DEBUG({
2625      dbgs() << "epilog:\n";
2626      Epilog->dump();
2627    });
2628  }
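
// Taken together with the branches inserted above, the expanded code roughly
// forms the following control flow:
//   Check        -> Prolog (pipelined path), or NewPreheader when the
//                   pipelined loop is skipped
//   Prolog       -> NewKernel
//   NewKernel    -> NewKernel while iterations remain, otherwise Epilog
//   Epilog       -> NewPreheader when iterations remain for the original loop,
//                   otherwise NewExit
//   NewPreheader -> OrigKernel, which runs the leftover iterations and then
//                   exits to NewExit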
2629  
2630  /// Calculate the required unroll count and store it in NumUnroll.
2631  void ModuloScheduleExpanderMVE::calcNumUnroll() {
2632    DenseMap<MachineInstr *, unsigned> Inst2Idx;
2633    NumUnroll = 1;
2634    for (unsigned I = 0; I < Schedule.getInstructions().size(); ++I)
2635      Inst2Idx[Schedule.getInstructions()[I]] = I;
2636  
2637    for (MachineInstr *MI : Schedule.getInstructions()) {
2638      if (MI->isPHI())
2639        continue;
2640      int StageNum = Schedule.getStage(MI);
2641      for (const MachineOperand &MO : MI->uses()) {
2642        if (!MO.isReg() || !MO.getReg().isVirtual())
2643          continue;
2644        MachineInstr *DefMI = MRI.getVRegDef(MO.getReg());
2645        if (DefMI->getParent() != OrigKernel)
2646          continue;
2647  
2648        int NumUnrollLocal = 1;
2649        if (DefMI->isPHI()) {
2650          ++NumUnrollLocal;
2651          // canApply() guarantees that the new DefMI is not a phi and is an
2652          // instruction in the loop.
2653          DefMI = MRI.getVRegDef(getLoopPhiReg(*DefMI, OrigKernel));
2654        }
2655        NumUnrollLocal += StageNum - Schedule.getStage(DefMI);
2656        if (Inst2Idx[MI] <= Inst2Idx[DefMI])
2657          --NumUnrollLocal;
2658        NumUnroll = std::max(NumUnroll, NumUnrollLocal);
2659      }
2660    }
2661    LLVM_DEBUG(dbgs() << "NumUnroll: " << NumUnroll << "\n");
2662  }
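
// Worked example: suppose %d is defined by an instruction D scheduled in stage
// 0 and is used, through a loop phi, by an instruction U scheduled in stage 1,
// with U placed before D in the schedule order (names are illustrative).
// NumUnrollLocal starts at 1, becomes 2 because the use goes through a phi
// (the value crosses one kernel iteration), 3 after adding the stage distance
// 1 - 0, and drops back to 2 because U precedes D in the order, so two renamed
// copies of %d suffice.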
2663  
2664  /// Create new virtual registers for definitions of NewMI and update NewMI.
2665  /// If the definitions are referenced after the pipelined loop, phis are
2666  /// created to merge with other routes.
2667  void ModuloScheduleExpanderMVE::updateInstrDef(MachineInstr *NewMI,
2668                                                 ValueMapTy &VRMap,
2669                                                 bool LastDef) {
2670    for (MachineOperand &MO : NewMI->operands()) {
2671      if (!MO.isReg() || !MO.getReg().isVirtual() || !MO.isDef())
2672        continue;
2673      Register Reg = MO.getReg();
2674      const TargetRegisterClass *RC = MRI.getRegClass(Reg);
2675      Register NewReg = MRI.createVirtualRegister(RC);
2676      MO.setReg(NewReg);
2677      VRMap[Reg] = NewReg;
2678      if (LastDef)
2679        mergeRegUsesAfterPipeline(Reg, NewReg);
2680    }
2681  }
2682  
2683  void ModuloScheduleExpanderMVE::expand() {
2684    OrigKernel = Schedule.getLoop()->getTopBlock();
2685    OrigPreheader = Schedule.getLoop()->getLoopPreheader();
2686    OrigExit = Schedule.getLoop()->getExitBlock();
2687  
2688    LLVM_DEBUG(Schedule.dump());
2689  
2690    generatePipelinedLoop();
2691  }
2692  
2693  /// Check if ModuloScheduleExpanderMVE can be applied to L
2694  bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
2695    if (!L.getExitBlock()) {
2696      LLVM_DEBUG(
2697          dbgs() << "Can not apply MVE expander: No single exit block.\n";);
2698      return false;
2699    }
2700  
2701    MachineBasicBlock *BB = L.getTopBlock();
2702    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
2703  
2704    // Impose some constraints on the phi operands to simplify the
2705    // transformation.
2706    DenseSet<unsigned> UsedByPhi;
2707    for (MachineInstr &MI : BB->phis()) {
2708      // Registers defined by phis must be used only inside the loop and must
2709      // never be used by phis.
2710      for (MachineOperand &MO : MI.defs())
2711        if (MO.isReg())
2712          for (MachineInstr &Ref : MRI.use_instructions(MO.getReg()))
2713            if (Ref.getParent() != BB || Ref.isPHI()) {
2714              LLVM_DEBUG(dbgs()
2715                             << "Can not apply MVE expander: A phi result is "
2716                                "referenced outside of the loop or by phi.\n";);
2717              return false;
2718            }
2719  
2720      // A source register coming from the loop block must be defined inside the
2721      // loop. A register defined inside the loop must be referenced by at most
2722      // one phi.
2723      unsigned InitVal, LoopVal;
2724      getPhiRegs(MI, MI.getParent(), InitVal, LoopVal);
2725      if (!Register(LoopVal).isVirtual() ||
2726          MRI.getVRegDef(LoopVal)->getParent() != BB) {
2727        LLVM_DEBUG(
2728            dbgs() << "Can not apply MVE expander: A phi source value coming "
2729                      "from the loop is not defined in the loop.\n";);
2730        return false;
2731      }
2732      if (UsedByPhi.count(LoopVal)) {
2733        LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A value defined in the "
2734                             "loop is referenced by two or more phis.\n";);
2735        return false;
2736      }
2737      UsedByPhi.insert(LoopVal);
2738    }
2739  
2740    return true;
2741  }
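
// As an illustrative example of a loop body that satisfies these checks (names
// and opcodes are made up):
//   bb.kernel:
//     %p = PHI %init, %bb.preheader, %v, %bb.kernel
//     %v = ADD %p, ...   (defined in the loop; feeds exactly one phi)
//     %q = MUL %p, ...   (%p is used only inside the loop and never by a phi)
// A %p that is also read after the loop, a %v that feeds two different phis,
// or a loop-carried value defined outside bb.kernel would each cause
// canApply() to return false.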
2742  
2743  //===----------------------------------------------------------------------===//
2744  // ModuloScheduleTestPass implementation
2745  //===----------------------------------------------------------------------===//
2746  // This pass constructs a ModuloSchedule from the machine function and runs
2747  // ModuloScheduleExpander.
2748  //
2749  // The function is expected to contain a single-block analyzable loop.
2750  // The total order of instructions is taken from the loop as-is.
2751  // Instructions are expected to be annotated with a PostInstrSymbol.
2752  // This PostInstrSymbol must have the following format:
2753  //  "Stage-%d_Cycle-%d".
2754  //===----------------------------------------------------------------------===//
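
// For example, an instruction scheduled in stage 1 at cycle 4 carries a
// post-instruction symbol named "Stage-1_Cycle-4". parseSymbolString() below
// recovers Stage = 1 and Cycle = 4 by splitting the name on '_' and '-', and
// ModuloScheduleTestAnnotater::annotate() at the end of this file produces
// names in exactly this form.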
2755  
2756  namespace {
2757  class ModuloScheduleTest : public MachineFunctionPass {
2758  public:
2759    static char ID;
2760  
2761    ModuloScheduleTest() : MachineFunctionPass(ID) {
2762      initializeModuloScheduleTestPass(*PassRegistry::getPassRegistry());
2763    }
2764  
2765    bool runOnMachineFunction(MachineFunction &MF) override;
2766    void runOnLoop(MachineFunction &MF, MachineLoop &L);
2767  
2768    void getAnalysisUsage(AnalysisUsage &AU) const override {
2769      AU.addRequired<MachineLoopInfoWrapperPass>();
2770      AU.addRequired<LiveIntervalsWrapperPass>();
2771      MachineFunctionPass::getAnalysisUsage(AU);
2772    }
2773  };
2774  } // namespace
2775  
2776  char ModuloScheduleTest::ID = 0;
2777  
2778  INITIALIZE_PASS_BEGIN(ModuloScheduleTest, "modulo-schedule-test",
2779                        "Modulo Schedule test pass", false, false)
2780  INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
2781  INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
2782  INITIALIZE_PASS_END(ModuloScheduleTest, "modulo-schedule-test",
2783                      "Modulo Schedule test pass", false, false)
2784  
2785  bool ModuloScheduleTest::runOnMachineFunction(MachineFunction &MF) {
2786    MachineLoopInfo &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2787    for (auto *L : MLI) {
2788      if (L->getTopBlock() != L->getBottomBlock())
2789        continue;
2790      runOnLoop(MF, *L);
2791      return false;
2792    }
2793    return false;
2794  }
2795  
2796  static void parseSymbolString(StringRef S, int &Cycle, int &Stage) {
2797    std::pair<StringRef, StringRef> StageAndCycle = getToken(S, "_");
2798    std::pair<StringRef, StringRef> StageTokenAndValue =
2799        getToken(StageAndCycle.first, "-");
2800    std::pair<StringRef, StringRef> CycleTokenAndValue =
2801        getToken(StageAndCycle.second, "-");
2802    if (StageTokenAndValue.first != "Stage" ||
2803        CycleTokenAndValue.first != "_Cycle") {
2804      llvm_unreachable(
2805          "Bad post-instr symbol syntax: see comment in ModuloScheduleTest");
2806      return;
2807    }
2808  
2809    StageTokenAndValue.second.drop_front().getAsInteger(10, Stage);
2810    CycleTokenAndValue.second.drop_front().getAsInteger(10, Cycle);
2811  
2812    dbgs() << "  Stage=" << Stage << ", Cycle=" << Cycle << "\n";
2813  }
2814  
2815  void ModuloScheduleTest::runOnLoop(MachineFunction &MF, MachineLoop &L) {
2816    LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
2817    MachineBasicBlock *BB = L.getTopBlock();
2818    dbgs() << "--- ModuloScheduleTest running on BB#" << BB->getNumber() << "\n";
2819  
2820    DenseMap<MachineInstr *, int> Cycle, Stage;
2821    std::vector<MachineInstr *> Instrs;
2822    for (MachineInstr &MI : *BB) {
2823      if (MI.isTerminator())
2824        continue;
2825      Instrs.push_back(&MI);
2826      if (MCSymbol *Sym = MI.getPostInstrSymbol()) {
2827        dbgs() << "Parsing post-instr symbol for " << MI;
2828        parseSymbolString(Sym->getName(), Cycle[&MI], Stage[&MI]);
2829      }
2830    }
2831  
2832    ModuloSchedule MS(MF, &L, std::move(Instrs), std::move(Cycle),
2833                      std::move(Stage));
2834    ModuloScheduleExpander MSE(
2835        MF, MS, LIS, /*InstrChanges=*/ModuloScheduleExpander::InstrChangesTy());
2836    MSE.expand();
2837    MSE.cleanup();
2838  }
2839  
2840  //===----------------------------------------------------------------------===//
2841  // ModuloScheduleTestAnnotater implementation
2842  //===----------------------------------------------------------------------===//
2843  
2844  void ModuloScheduleTestAnnotater::annotate() {
2845    for (MachineInstr *MI : S.getInstructions()) {
2846      SmallVector<char, 16> SV;
2847      raw_svector_ostream OS(SV);
2848      OS << "Stage-" << S.getStage(MI) << "_Cycle-" << S.getCycle(MI);
2849      MCSymbol *Sym = MF.getContext().getOrCreateSymbol(OS.str());
2850      MI->setPostInstrSymbol(MF, Sym);
2851    }
2852  }
2853