xref: /freebsd/contrib/llvm-project/llvm/lib/CodeGen/WindowScheduler.cpp (revision 770cf0a5f02dc8983a89c6568d741fbc25baa999)
1 //======----------- WindowScheduler.cpp - window scheduler -------------======//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // An implementation of the Window Scheduling software pipelining algorithm.
10 //
11 // The fundamental concept of the window scheduling algorithm involves folding
12 // the original MBB at a specific position, followed by list scheduling on the
13 // folded MIs. The optimal scheduling result is then chosen from various folding
14 // positions as the final scheduling outcome.
15 //
16 // The primary challenge in this algorithm lies in generating the folded MIs and
17 // establishing their dependencies. We have innovatively employed a new MBB,
18 // created by copying the original MBB three times, known as TripleMBB. This
19 // TripleMBB enables the convenient implementation of MI folding and dependency
20 // establishment. To facilitate the algorithm's implementation, we have also
21 // devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle.
22 //
23 // Another challenge in the algorithm is the scheduling of phis. Semantically,
24 // it is difficult to place the phis in the window and perform list scheduling.
25 // Therefore, we schedule these phis separately after each list scheduling.
26 //
27 // The provided implementation is designed for use before the Register Allocator
28 // (RA). If the target requires implementation after RA, it is recommended to
29 // reimplement analyseII(), schedulePhi(), and expand(). Additionally,
30 // target-specific logic can be added in initialize(), preProcess(), and
31 // postProcess().
32 //
33 // Lastly, it is worth mentioning that getSearchIndexes() is an important
34 // function. We have experimented with more complex heuristics on downstream
35 // target and achieved favorable results.
36 //
37 //===----------------------------------------------------------------------===//
38 #include "llvm/CodeGen/WindowScheduler.h"
39 #include "llvm/ADT/Statistic.h"
40 #include "llvm/CodeGen/LiveIntervals.h"
41 #include "llvm/CodeGen/MachineLoopInfo.h"
42 #include "llvm/CodeGen/MachinePipeliner.h"
43 #include "llvm/CodeGen/ModuloSchedule.h"
44 #include "llvm/CodeGen/TargetPassConfig.h"
45 #include "llvm/Support/CommandLine.h"
46 #include "llvm/Support/Debug.h"
47 #include "llvm/Support/TimeProfiler.h"
48 #include "llvm/Target/TargetMachine.h"
49 
50 using namespace llvm;
51 
52 #define DEBUG_TYPE "pipeliner"
53 
54 namespace {
55 STATISTIC(NumTryWindowSchedule,
56           "Number of loops that we attempt to use window scheduling");
57 STATISTIC(NumTryWindowSearch,
58           "Number of times that we run list schedule in the window scheduling");
59 STATISTIC(NumWindowSchedule,
60           "Number of loops that we successfully use window scheduling");
61 STATISTIC(NumFailAnalyseII,
62           "Window scheduling abort due to the failure of the II analysis");
63 
64 cl::opt<unsigned>
65     WindowSearchNum("window-search-num",
66                     cl::desc("The number of searches per loop in the window "
67                              "algorithm. 0 means no search number limit."),
68                     cl::Hidden, cl::init(6));
69 
70 cl::opt<unsigned> WindowSearchRatio(
71     "window-search-ratio",
72     cl::desc("The ratio of searches per loop in the window algorithm. 100 "
73              "means search all positions in the loop, while 0 means not "
74              "performing any search."),
75     cl::Hidden, cl::init(40));
76 
77 cl::opt<unsigned> WindowIICoeff(
78     "window-ii-coeff",
79     cl::desc(
80         "The coefficient used when initializing II in the window algorithm."),
81     cl::Hidden, cl::init(5));
82 
83 cl::opt<unsigned> WindowRegionLimit(
84     "window-region-limit",
85     cl::desc(
86         "The lower limit of the scheduling region in the window algorithm."),
87     cl::Hidden, cl::init(3));
88 
89 cl::opt<unsigned> WindowDiffLimit(
90     "window-diff-limit",
91     cl::desc("The lower limit of the difference between best II and base II in "
92              "the window algorithm. If the difference is smaller than "
93              "this lower limit, window scheduling will not be performed."),
94     cl::Hidden, cl::init(2));
95 } // namespace
96 
97 // WindowIILimit serves as an indicator of abnormal scheduling results and could
98 // potentially be referenced by the derived target window scheduler.
99 static cl::opt<unsigned>
100     WindowIILimit("window-ii-limit",
101                   cl::desc("The upper limit of II in the window algorithm."),
102                   cl::Hidden, cl::init(1000));
103 
104 WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML)
105     : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML),
106       Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()),
107       TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) {
108   TripleDAG = std::unique_ptr<ScheduleDAGInstrs>(
109       createMachineScheduler(/*OnlyBuildGraph=*/true));
110 }
111 
112 bool WindowScheduler::run() {
113   if (!initialize()) {
114     LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n");
115     return false;
116   }
117   // The window algorithm is time-consuming, and its compilation time should be
118   // taken into consideration.
119   TimeTraceScope Scope("WindowSearch");
120   ++NumTryWindowSchedule;
121   // Performing the relevant processing before window scheduling.
122   preProcess();
123   // The main window scheduling begins.
124   std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler());
125   auto SearchIndexes = getSearchIndexes(WindowSearchNum, WindowSearchRatio);
126   for (unsigned Idx : SearchIndexes) {
127     OriToCycle.clear();
128     ++NumTryWindowSearch;
129     // The scheduling starts with non-phi instruction, so SchedPhiNum needs to
130     // be added to Idx.
131     unsigned Offset = Idx + SchedPhiNum;
132     auto Range = getScheduleRange(Offset, SchedInstrNum);
133     SchedDAG->startBlock(MBB);
134     SchedDAG->enterRegion(MBB, Range.begin(), Range.end(), SchedInstrNum);
135     SchedDAG->schedule();
136     LLVM_DEBUG(SchedDAG->dump());
137     unsigned II = analyseII(*SchedDAG, Offset);
138     if (II == WindowIILimit) {
139       restoreTripleMBB();
140       LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n");
141       ++NumFailAnalyseII;
142       continue;
143     }
144     schedulePhi(Offset, II);
145     updateScheduleResult(Offset, II);
146     restoreTripleMBB();
147     LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is "
148                       << II << ".\n");
149   }
150   // Performing the relevant processing after window scheduling.
151   postProcess();
152   // Check whether the scheduling result is valid.
153   if (!isScheduleValid()) {
154     LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n");
155     return false;
156   }
157   LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset
158                     << " and Best II is " << BestII << ".\n");
159   // Expand the scheduling result to prologue, kernel, and epilogue.
160   expand();
161   ++NumWindowSchedule;
162   return true;
163 }
164 
165 ScheduleDAGInstrs *
166 WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) {
167   return OnlyBuildGraph
168              ? new ScheduleDAGMI(
169                    Context, std::make_unique<PostGenericScheduler>(Context),
170                    true)
171              : Context->TM->createMachineScheduler(Context);
172 }
173 
174 bool WindowScheduler::initialize() {
175   if (!Subtarget->enableWindowScheduler()) {
176     LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n");
177     return false;
178   }
179   // Initialized the member variables used by window algorithm.
180   OriMIs.clear();
181   TriMIs.clear();
182   TriToOri.clear();
183   OriToCycle.clear();
184   SchedResult.clear();
185   SchedPhiNum = 0;
186   SchedInstrNum = 0;
187   BestII = UINT_MAX;
188   BestOffset = 0;
189   BaseII = 0;
190   // List scheduling used in the window algorithm depends on LiveIntervals.
191   if (!Context->LIS) {
192     LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n");
193     return false;
194   }
195   // Check each MI in MBB.
196   SmallSet<Register, 8> PrevDefs;
197   SmallSet<Register, 8> PrevUses;
198   auto IsLoopCarried = [&](MachineInstr &Phi) {
199     // Two cases are checked here: (1)The virtual register defined by the
200     // preceding phi is used by the succeeding phi;(2)The preceding phi uses the
201     // virtual register defined by the succeeding phi.
202     if (PrevUses.count(Phi.getOperand(0).getReg()))
203       return true;
204     PrevDefs.insert(Phi.getOperand(0).getReg());
205     for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) {
206       if (PrevDefs.count(Phi.getOperand(I).getReg()))
207         return true;
208       PrevUses.insert(Phi.getOperand(I).getReg());
209     }
210     return false;
211   };
212   auto PLI = TII->analyzeLoopForPipelining(MBB);
213   for (auto &MI : *MBB) {
214     if (MI.isMetaInstruction() || MI.isTerminator())
215       continue;
216     if (MI.isPHI()) {
217       if (IsLoopCarried(MI)) {
218         LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n");
219         return false;
220       }
221       ++SchedPhiNum;
222       ++BestOffset;
223     } else
224       ++SchedInstrNum;
225     if (TII->isSchedulingBoundary(MI, MBB, *MF)) {
226       LLVM_DEBUG(
227           dbgs() << "Boundary MI is not allowed in window scheduling!\n");
228       return false;
229     }
230     if (PLI->shouldIgnoreForPipelining(&MI)) {
231       LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in "
232                            "window scheduling!\n");
233       return false;
234     }
235     for (auto &Def : MI.all_defs())
236       if (Def.isReg() && Def.getReg().isPhysical()) {
237         LLVM_DEBUG(dbgs() << "Physical registers are not supported in "
238                              "window scheduling!\n");
239         return false;
240       }
241   }
242   if (SchedInstrNum <= WindowRegionLimit) {
243     LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
244     return false;
245   }
246   return true;
247 }
248 
249 void WindowScheduler::preProcess() {
250   // Prior to window scheduling, it's necessary to backup the original MBB,
251   // generate a new TripleMBB, and build a TripleDAG based on the TripleMBB.
252   backupMBB();
253   generateTripleMBB();
254   TripleDAG->startBlock(MBB);
255   TripleDAG->enterRegion(
256       MBB, MBB->begin(), MBB->getFirstTerminator(),
257       std::distance(MBB->begin(), MBB->getFirstTerminator()));
258   TripleDAG->buildSchedGraph(Context->AA);
259 }
260 
261 void WindowScheduler::postProcess() {
262   // After window scheduling, it's necessary to clear the TripleDAG and restore
263   // to the original MBB.
264   TripleDAG->exitRegion();
265   TripleDAG->finishBlock();
266   restoreMBB();
267 }
268 
269 void WindowScheduler::backupMBB() {
270   for (auto &MI : MBB->instrs())
271     OriMIs.push_back(&MI);
272   // Remove MIs and the corresponding live intervals.
273   for (auto &MI : make_early_inc_range(*MBB)) {
274     Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
275     MBB->remove(&MI);
276   }
277 }
278 
279 void WindowScheduler::restoreMBB() {
280   // Erase MIs and the corresponding live intervals.
281   for (auto &MI : make_early_inc_range(*MBB)) {
282     Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
283     MI.eraseFromParent();
284   }
285   // Restore MBB to the state before window scheduling.
286   llvm::append_range(*MBB, OriMIs);
287   updateLiveIntervals();
288 }
289 
290 void WindowScheduler::generateTripleMBB() {
291   const unsigned DuplicateNum = 3;
292   TriMIs.clear();
293   TriToOri.clear();
294   assert(OriMIs.size() > 0 && "The Original MIs were not backed up!");
295   // Step 1: Performing the first copy of MBB instructions, excluding
296   // terminators. At the same time, we back up the anti-register of phis.
297   // DefPairs hold the old and new define register pairs.
298   DenseMap<Register, Register> DefPairs;
299   for (auto *MI : OriMIs) {
300     if (MI->isMetaInstruction() || MI->isTerminator())
301       continue;
302     if (MI->isPHI())
303       if (Register AntiReg = getAntiRegister(MI))
304         DefPairs[MI->getOperand(0).getReg()] = AntiReg;
305     auto *NewMI = MF->CloneMachineInstr(MI);
306     MBB->push_back(NewMI);
307     TriMIs.push_back(NewMI);
308     TriToOri[NewMI] = MI;
309   }
310   // Step 2: Performing the remaining two copies of MBB instructions excluding
311   // phis, and the last one contains terminators. At the same time, registers
312   // are updated accordingly.
313   for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) {
314     for (auto *MI : OriMIs) {
315       if (MI->isPHI() || MI->isMetaInstruction() ||
316           (MI->isTerminator() && Cnt < DuplicateNum - 1))
317         continue;
318       auto *NewMI = MF->CloneMachineInstr(MI);
319       DenseMap<Register, Register> NewDefs;
320       // New defines are updated.
321       for (auto MO : NewMI->all_defs())
322         if (MO.isReg() && MO.getReg().isVirtual()) {
323           Register NewDef =
324               MRI->createVirtualRegister(MRI->getRegClass(MO.getReg()));
325           NewMI->substituteRegister(MO.getReg(), NewDef, 0, *TRI);
326           NewDefs[MO.getReg()] = NewDef;
327         }
328       // New uses are updated.
329       for (auto DefRegPair : DefPairs)
330         if (NewMI->readsRegister(DefRegPair.first, TRI)) {
331           Register NewUse = DefRegPair.second;
332           // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3':
333           //
334           // BB.3:                                  DefPairs
335           // ==================================
336           // %1 = phi i32 [%2, %BB.1], [%7, %BB.3]  (%1,%7)
337           // ...
338           // ==================================
339           // ...
340           // %4 = sub i32 %1, %3
341           // ...
342           // %7 = add i32 %5, %6
343           // ...
344           // ----------------------------------
345           // ...
346           // %8 = sub i32 %7, %3                    (%1,%7),(%4,%8)
347           // ...
348           // %9 = add i32 %5, %6                    (%1,%7),(%4,%8),(%7,%9)
349           // ...
350           // ----------------------------------
351           // ...
352           // %10 = sub i32 %9, %3                   (%1,%7),(%4,%10),(%7,%9)
353           // ...            ^
354           // %11 = add i32 %5, %6                   (%1,%7),(%4,%10),(%7,%11)
355           // ...
356           // ==================================
357           //          < Terminators >
358           // ==================================
359           if (auto It = DefPairs.find(NewUse); It != DefPairs.end())
360             NewUse = It->second;
361           NewMI->substituteRegister(DefRegPair.first, NewUse, 0, *TRI);
362         }
363       // DefPairs is updated at last.
364       for (auto &NewDef : NewDefs)
365         DefPairs[NewDef.first] = NewDef.second;
366       MBB->push_back(NewMI);
367       TriMIs.push_back(NewMI);
368       TriToOri[NewMI] = MI;
369     }
370   }
371   // Step 3: The registers used by phis are updated, and they are generated in
372   // the third copy of MBB.
373   // In the privious example, the old phi is:
374   // %1 = phi i32 [%2, %BB.1], [%7, %BB.3]
375   // The new phi is:
376   // %1 = phi i32 [%2, %BB.1], [%11, %BB.3]
377   for (auto &Phi : MBB->phis()) {
378     for (auto DefRegPair : DefPairs)
379       if (Phi.readsRegister(DefRegPair.first, TRI))
380         Phi.substituteRegister(DefRegPair.first, DefRegPair.second, 0, *TRI);
381   }
382   updateLiveIntervals();
383 }
384 
385 void WindowScheduler::restoreTripleMBB() {
386   // After list scheduling, the MBB is restored in one traversal.
387   for (size_t I = 0; I < TriMIs.size(); ++I) {
388     auto *MI = TriMIs[I];
389     auto OldPos = MBB->begin();
390     std::advance(OldPos, I);
391     auto CurPos = MI->getIterator();
392     if (CurPos != OldPos) {
393       MBB->splice(OldPos, MBB, CurPos);
394       Context->LIS->handleMove(*MI, /*UpdateFlags=*/false);
395     }
396   }
397 }
398 
399 SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum,
400                                                         unsigned SearchRatio) {
401   // We use SearchRatio to get the index range, and then evenly get the indexes
402   // according to the SearchNum. This is a simple huristic. Depending on the
403   // characteristics of the target, more complex algorithms can be used for both
404   // performance and compilation time.
405   assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!");
406   unsigned MaxIdx = SchedInstrNum * SearchRatio / 100;
407   unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? MaxIdx / SearchNum : 1;
408   SmallVector<unsigned> SearchIndexes;
409   for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step)
410     SearchIndexes.push_back(Idx);
411   return SearchIndexes;
412 }
413 
414 int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) {
415   // Sometimes MaxDepth is 0, so it should be limited to the minimum of 1.
416   unsigned MaxDepth = 1;
417   for (auto &SU : DAG.SUnits)
418     MaxDepth = std::max(SU.getDepth() + SU.Latency, MaxDepth);
419   return MaxDepth * WindowIICoeff;
420 }
421 
422 int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
423                                        unsigned Offset) {
424   int InitII = getEstimatedII(DAG);
425   ResourceManager RM(Subtarget, &DAG);
426   RM.init(InitII);
427   // ResourceManager and DAG are used to calculate the maximum cycle for the
428   // scheduled MIs. Since MIs in the Region have already been scheduled, the
429   // emit cycles can be estimated in order here.
430   int CurCycle = 0;
431   auto Range = getScheduleRange(Offset, SchedInstrNum);
432   for (auto &MI : Range) {
433     auto *SU = DAG.getSUnit(&MI);
434     int ExpectCycle = CurCycle;
435     // The predecessors of current MI determine its earliest issue cycle.
436     for (auto &Pred : SU->Preds) {
437       if (Pred.isWeak())
438         continue;
439       auto *PredMI = Pred.getSUnit()->getInstr();
440       int PredCycle = getOriCycle(PredMI);
441       ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency());
442     }
443     // Zero cost instructions do not need to check resource.
444     if (!TII->isZeroCost(MI.getOpcode())) {
445       // ResourceManager can be used to detect resource conflicts between the
446       // current MI and the previously inserted MIs.
447       while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
448         ++CurCycle;
449         if (CurCycle == (int)WindowIILimit)
450           return CurCycle;
451       }
452       RM.reserveResources(*SU, CurCycle);
453     }
454     OriToCycle[getOriMI(&MI)] = CurCycle;
455     LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S."
456                       << getOriStage(getOriMI(&MI), Offset) << "]: " << MI);
457   }
458   LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n");
459   return CurCycle;
460 }
461 
462 // By utilizing TripleDAG, we can easily establish dependencies between A and B.
463 // Based on the MaxCycle and the issue cycle of A and B, we can determine
464 // whether it is necessary to add a stall cycle. This is because, without
465 // inserting the stall cycle, the latency constraint between A and B cannot be
466 // satisfied. The details are as follows:
467 //
468 // New MBB:
469 // ========================================
470 //                 < Phis >
471 // ========================================     (sliding direction)
472 // MBB copy 1                                            |
473 //                                                       V
474 //
475 // ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~  ----schedule window-----
476 //                    |                                  |
477 // ===================V====================              |
478 // MBB copy 2      < MI B >                              |
479 //                                                       |
480 //                 < MI A >                              V
481 // ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~  ------------------------
482 //                    :
483 // ===================V====================
484 // MBB copy 3      < MI B'>
485 //
486 //
487 //
488 //
489 // ========================================
490 //              < Terminators >
491 // ========================================
492 int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
493   int MaxStallCycle = 0;
494   int CurrentII = MaxCycle + 1;
495   auto Range = getScheduleRange(Offset, SchedInstrNum);
496   for (auto &MI : Range) {
497     auto *SU = TripleDAG->getSUnit(&MI);
498     int DefCycle = getOriCycle(&MI);
499     for (auto &Succ : SU->Succs) {
500       if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU)
501         continue;
502       // If the expected cycle does not exceed CurrentII, no check is needed.
503       if (DefCycle + (int)Succ.getLatency() <= CurrentII)
504         continue;
505       // If the cycle of the scheduled MI A is less than that of the scheduled
506       // MI B, the scheduling will fail because the lifetime of the
507       // corresponding register exceeds II.
508       auto *SuccMI = Succ.getSUnit()->getInstr();
509       int UseCycle = getOriCycle(SuccMI);
510       if (DefCycle < UseCycle)
511         return WindowIILimit;
512       // Get the stall cycle introduced by the register between two trips.
513       int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle;
514       MaxStallCycle = std::max(MaxStallCycle, StallCycle);
515     }
516   }
517   LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n");
518   return MaxStallCycle;
519 }
520 
521 unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) {
522   LLVM_DEBUG(dbgs() << "Start analyzing II:\n");
523   int MaxCycle = calculateMaxCycle(DAG, Offset);
524   if (MaxCycle == (int)WindowIILimit)
525     return MaxCycle;
526   int StallCycle = calculateStallCycle(Offset, MaxCycle);
527   if (StallCycle == (int)WindowIILimit)
528     return StallCycle;
529   // The value of II is equal to the maximum execution cycle plus 1.
530   return MaxCycle + StallCycle + 1;
531 }
532 
533 void WindowScheduler::schedulePhi(int Offset, unsigned &II) {
534   LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n");
535   for (auto &Phi : MBB->phis()) {
536     int LateCycle = INT_MAX;
537     auto *SU = TripleDAG->getSUnit(&Phi);
538     for (auto &Succ : SU->Succs) {
539       // Phi doesn't have any Anti successors.
540       if (Succ.getKind() != SDep::Data)
541         continue;
542       // Phi is scheduled before the successor of stage 0. The issue cycle of
543       // phi is the latest cycle in this interval.
544       auto *SuccMI = Succ.getSUnit()->getInstr();
545       int Cycle = getOriCycle(SuccMI);
546       if (getOriStage(getOriMI(SuccMI), Offset) == 0)
547         LateCycle = std::min(LateCycle, Cycle);
548     }
549     // The anti-dependency of phi need to be handled separately in the same way.
550     if (Register AntiReg = getAntiRegister(&Phi)) {
551       auto *AntiMI = MRI->getVRegDef(AntiReg);
552       // AntiReg may be defined outside the kernel MBB.
553       if (AntiMI->getParent() == MBB) {
554         auto AntiCycle = getOriCycle(AntiMI);
555         if (getOriStage(getOriMI(AntiMI), Offset) == 0)
556           LateCycle = std::min(LateCycle, AntiCycle);
557       }
558     }
559     // If there is no limit to the late cycle, a default value is given.
560     if (LateCycle == INT_MAX)
561       LateCycle = (int)(II - 1);
562     LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi);
563     // The issue cycle of phi is set to the latest cycle in the interval.
564     auto *OriPhi = getOriMI(&Phi);
565     OriToCycle[OriPhi] = LateCycle;
566   }
567 }
568 
569 DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset,
570                                                              unsigned II) {
571   // At each issue cycle, phi is placed before MIs in stage 0. So the simplest
572   // way is to put phi at the beginning of the current cycle.
573   DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs;
574   auto Range = getScheduleRange(Offset, SchedInstrNum);
575   for (auto &Phi : MBB->phis())
576     CycleToMIs[getOriCycle(&Phi)].push_back(getOriMI(&Phi));
577   for (auto &MI : Range)
578     CycleToMIs[getOriCycle(&MI)].push_back(getOriMI(&MI));
579   // Each MI is assigned a separate ordered Id, which is used as a sort marker
580   // in the following expand process.
581   DenseMap<MachineInstr *, int> IssueOrder;
582   int Id = 0;
583   for (int Cycle = 0; Cycle < (int)II; ++Cycle) {
584     auto It = CycleToMIs.find(Cycle);
585     if (It == CycleToMIs.end())
586       continue;
587     for (auto *MI : It->second)
588       IssueOrder[MI] = Id++;
589   }
590   return IssueOrder;
591 }
592 
593 void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) {
594   // At the first update, Offset is equal to SchedPhiNum. At this time, only
595   // BestII, BestOffset, and BaseII need to be updated.
596   if (Offset == SchedPhiNum) {
597     BestII = II;
598     BestOffset = SchedPhiNum;
599     BaseII = II;
600     return;
601   }
602   // The update will only continue if the II is smaller than BestII and the II
603   // is sufficiently small.
604   if ((II >= BestII) || (II + WindowDiffLimit > BaseII))
605     return;
606   BestII = II;
607   BestOffset = Offset;
608   // Record the result of the current list scheduling, noting that each MI is
609   // stored unordered in SchedResult.
610   SchedResult.clear();
611   auto IssueOrder = getIssueOrder(Offset, II);
612   for (auto &Pair : OriToCycle) {
613     assert(IssueOrder.count(Pair.first) && "Cannot find original MI!");
614     SchedResult.push_back(std::make_tuple(Pair.first, Pair.second,
615                                           getOriStage(Pair.first, Offset),
616                                           IssueOrder[Pair.first]));
617   }
618 }
619 
620 void WindowScheduler::expand() {
621   // The MIs in the SchedResult are sorted by the issue order ID.
622   llvm::stable_sort(SchedResult,
623                     [](const std::tuple<MachineInstr *, int, int, int> &A,
624                        const std::tuple<MachineInstr *, int, int, int> &B) {
625                       return std::get<3>(A) < std::get<3>(B);
626                     });
627   // Use the scheduling infrastructure for expansion, noting that InstrChanges
628   // is not supported here.
629   DenseMap<MachineInstr *, int> Cycles, Stages;
630   std::vector<MachineInstr *> OrderedInsts;
631   for (auto &Info : SchedResult) {
632     auto *MI = std::get<0>(Info);
633     OrderedInsts.push_back(MI);
634     Cycles[MI] = std::get<1>(Info);
635     Stages[MI] = std::get<2>(Info);
636     LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI]
637                       << "]: " << *MI);
638   }
639   ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
640                     std::move(Stages));
641   ModuloScheduleExpander MSE(*MF, MS, *Context->LIS,
642                              ModuloScheduleExpander::InstrChangesTy());
643   MSE.expand();
644   MSE.cleanup();
645 }
646 
647 void WindowScheduler::updateLiveIntervals() {
648   SmallVector<Register, 128> UsedRegs;
649   for (MachineInstr &MI : *MBB)
650     for (const MachineOperand &MO : MI.operands()) {
651       if (!MO.isReg() || MO.getReg() == 0)
652         continue;
653       Register Reg = MO.getReg();
654       if (!is_contained(UsedRegs, Reg))
655         UsedRegs.push_back(Reg);
656     }
657   Context->LIS->repairIntervalsInRange(MBB, MBB->begin(), MBB->end(), UsedRegs);
658 }
659 
660 iterator_range<MachineBasicBlock::iterator>
661 WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) {
662   auto RegionBegin = MBB->begin();
663   std::advance(RegionBegin, Offset);
664   auto RegionEnd = RegionBegin;
665   std::advance(RegionEnd, Num);
666   return make_range(RegionBegin, RegionEnd);
667 }
668 
669 int WindowScheduler::getOriCycle(MachineInstr *NewMI) {
670   assert(TriToOri.count(NewMI) && "Cannot find original MI!");
671   auto *OriMI = TriToOri[NewMI];
672   assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!");
673   return OriToCycle[OriMI];
674 }
675 
676 MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) {
677   assert(TriToOri.count(NewMI) && "Cannot find original MI!");
678   return TriToOri[NewMI];
679 }
680 
681 unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) {
682   assert(llvm::is_contained(OriMIs, OriMI) && "Cannot find OriMI in OriMIs!");
683   // If there is no instruction fold, all MI stages are 0.
684   if (Offset == SchedPhiNum)
685     return 0;
686   // For those MIs with an ID less than the Offset, their stages are set to 0,
687   // while the rest are set to 1.
688   unsigned Id = 0;
689   for (auto *MI : OriMIs) {
690     if (MI->isMetaInstruction())
691       continue;
692     if (MI == OriMI)
693       break;
694     ++Id;
695   }
696   return Id >= (size_t)Offset ? 1 : 0;
697 }
698 
699 Register WindowScheduler::getAntiRegister(MachineInstr *Phi) {
700   assert(Phi->isPHI() && "Expecting PHI!");
701   Register AntiReg;
702   for (auto MO : Phi->uses()) {
703     if (MO.isReg())
704       AntiReg = MO.getReg();
705     else if (MO.isMBB() && MO.getMBB() == MBB)
706       return AntiReg;
707   }
708   return 0;
709 }
710