1 //======----------- WindowScheduler.cpp - window scheduler -------------======//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // An implementation of the Window Scheduling software pipelining algorithm.
10 //
11 // The fundamental concept of the window scheduling algorithm involves folding
12 // the original MBB at a specific position, followed by list scheduling on the
13 // folded MIs. The optimal scheduling result is then chosen from various folding
14 // positions as the final scheduling outcome.
15 //
16 // The primary challenge in this algorithm lies in generating the folded MIs and
17 // establishing their dependencies. We have innovatively employed a new MBB,
18 // created by copying the original MBB three times, known as TripleMBB. This
19 // TripleMBB enables the convenient implementation of MI folding and dependency
20 // establishment. To facilitate the algorithm's implementation, we have also
21 // devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle.
22 //
23 // Another challenge in the algorithm is the scheduling of phis. Semantically,
24 // it is difficult to place the phis in the window and perform list scheduling.
25 // Therefore, we schedule these phis separately after each list scheduling.
26 //
27 // The provided implementation is designed for use before the Register Allocator
28 // (RA). If the target requires implementation after RA, it is recommended to
29 // reimplement analyseII(), schedulePhi(), and expand(). Additionally,
30 // target-specific logic can be added in initialize(), preProcess(), and
31 // postProcess().
32 //
33 // Lastly, it is worth mentioning that getSearchIndexes() is an important
34 // function. We have experimented with more complex heuristics on downstream
35 // target and achieved favorable results.
36 //
37 //===----------------------------------------------------------------------===//
38 #include "llvm/CodeGen/WindowScheduler.h"
39 #include "llvm/ADT/Statistic.h"
40 #include "llvm/CodeGen/LiveIntervals.h"
41 #include "llvm/CodeGen/MachineLoopInfo.h"
42 #include "llvm/CodeGen/MachinePipeliner.h"
43 #include "llvm/CodeGen/ModuloSchedule.h"
44 #include "llvm/CodeGen/TargetPassConfig.h"
45 #include "llvm/Support/CommandLine.h"
46 #include "llvm/Support/Debug.h"
47 #include "llvm/Support/TimeProfiler.h"
48
49 using namespace llvm;
50
51 #define DEBUG_TYPE "pipeliner"
52
53 namespace {
54 STATISTIC(NumTryWindowSchedule,
55 "Number of loops that we attempt to use window scheduling");
56 STATISTIC(NumTryWindowSearch,
57 "Number of times that we run list schedule in the window scheduling");
58 STATISTIC(NumWindowSchedule,
59 "Number of loops that we successfully use window scheduling");
60 STATISTIC(NumFailAnalyseII,
61 "Window scheduling abort due to the failure of the II analysis");
62
63 cl::opt<unsigned>
64 WindowSearchNum("window-search-num",
65 cl::desc("The number of searches per loop in the window "
66 "algorithm. 0 means no search number limit."),
67 cl::Hidden, cl::init(6));
68
69 cl::opt<unsigned> WindowSearchRatio(
70 "window-search-ratio",
71 cl::desc("The ratio of searches per loop in the window algorithm. 100 "
72 "means search all positions in the loop, while 0 means not "
73 "performing any search."),
74 cl::Hidden, cl::init(40));
75
76 cl::opt<unsigned> WindowIICoeff(
77 "window-ii-coeff",
78 cl::desc(
79 "The coefficient used when initializing II in the window algorithm."),
80 cl::Hidden, cl::init(5));
81
82 cl::opt<unsigned> WindowRegionLimit(
83 "window-region-limit",
84 cl::desc(
85 "The lower limit of the scheduling region in the window algorithm."),
86 cl::Hidden, cl::init(3));
87
88 cl::opt<unsigned> WindowDiffLimit(
89 "window-diff-limit",
90 cl::desc("The lower limit of the difference between best II and base II in "
91 "the window algorithm. If the difference is smaller than "
92 "this lower limit, window scheduling will not be performed."),
93 cl::Hidden, cl::init(2));
94 } // namespace
95
96 // WindowIILimit serves as an indicator of abnormal scheduling results and could
97 // potentially be referenced by the derived target window scheduler.
98 cl::opt<unsigned>
99 WindowIILimit("window-ii-limit",
100 cl::desc("The upper limit of II in the window algorithm."),
101 cl::Hidden, cl::init(1000));
102
WindowScheduler(MachineSchedContext * C,MachineLoop & ML)103 WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML)
104 : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML),
105 Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()),
106 TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) {
107 TripleDAG = std::unique_ptr<ScheduleDAGInstrs>(
108 createMachineScheduler(/*OnlyBuildGraph=*/true));
109 }
110
run()111 bool WindowScheduler::run() {
112 if (!initialize()) {
113 LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n");
114 return false;
115 }
116 // The window algorithm is time-consuming, and its compilation time should be
117 // taken into consideration.
118 TimeTraceScope Scope("WindowSearch");
119 ++NumTryWindowSchedule;
120 // Performing the relevant processing before window scheduling.
121 preProcess();
122 // The main window scheduling begins.
123 std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler());
124 auto SearchIndexes = getSearchIndexes(WindowSearchNum, WindowSearchRatio);
125 for (unsigned Idx : SearchIndexes) {
126 OriToCycle.clear();
127 ++NumTryWindowSearch;
128 // The scheduling starts with non-phi instruction, so SchedPhiNum needs to
129 // be added to Idx.
130 unsigned Offset = Idx + SchedPhiNum;
131 auto Range = getScheduleRange(Offset, SchedInstrNum);
132 SchedDAG->startBlock(MBB);
133 SchedDAG->enterRegion(MBB, Range.begin(), Range.end(), SchedInstrNum);
134 SchedDAG->schedule();
135 LLVM_DEBUG(SchedDAG->dump());
136 unsigned II = analyseII(*SchedDAG, Offset);
137 if (II == WindowIILimit) {
138 restoreTripleMBB();
139 LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n");
140 ++NumFailAnalyseII;
141 continue;
142 }
143 schedulePhi(Offset, II);
144 updateScheduleResult(Offset, II);
145 restoreTripleMBB();
146 LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is "
147 << II << ".\n");
148 }
149 // Performing the relevant processing after window scheduling.
150 postProcess();
151 // Check whether the scheduling result is valid.
152 if (!isScheduleValid()) {
153 LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n");
154 return false;
155 }
156 LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset
157 << " and Best II is " << BestII << ".\n");
158 // Expand the scheduling result to prologue, kernel, and epilogue.
159 expand();
160 ++NumWindowSchedule;
161 return true;
162 }
163
164 ScheduleDAGInstrs *
createMachineScheduler(bool OnlyBuildGraph)165 WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) {
166 return OnlyBuildGraph
167 ? new ScheduleDAGMI(
168 Context, std::make_unique<PostGenericScheduler>(Context),
169 true)
170 : Context->PassConfig->createMachineScheduler(Context);
171 }
172
initialize()173 bool WindowScheduler::initialize() {
174 if (!Subtarget->enableWindowScheduler()) {
175 LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n");
176 return false;
177 }
178 // Initialized the member variables used by window algorithm.
179 OriMIs.clear();
180 TriMIs.clear();
181 TriToOri.clear();
182 OriToCycle.clear();
183 SchedResult.clear();
184 SchedPhiNum = 0;
185 SchedInstrNum = 0;
186 BestII = UINT_MAX;
187 BestOffset = 0;
188 BaseII = 0;
189 // List scheduling used in the window algorithm depends on LiveIntervals.
190 if (!Context->LIS) {
191 LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n");
192 return false;
193 }
194 // Check each MI in MBB.
195 SmallSet<Register, 8> PrevDefs;
196 SmallSet<Register, 8> PrevUses;
197 auto IsLoopCarried = [&](MachineInstr &Phi) {
198 // Two cases are checked here: (1)The virtual register defined by the
199 // preceding phi is used by the succeeding phi;(2)The preceding phi uses the
200 // virtual register defined by the succeeding phi.
201 if (PrevUses.count(Phi.getOperand(0).getReg()))
202 return true;
203 PrevDefs.insert(Phi.getOperand(0).getReg());
204 for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) {
205 if (PrevDefs.count(Phi.getOperand(I).getReg()))
206 return true;
207 PrevUses.insert(Phi.getOperand(I).getReg());
208 }
209 return false;
210 };
211 auto PLI = TII->analyzeLoopForPipelining(MBB);
212 for (auto &MI : *MBB) {
213 if (MI.isMetaInstruction() || MI.isTerminator())
214 continue;
215 if (MI.isPHI()) {
216 if (IsLoopCarried(MI)) {
217 LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n");
218 return false;
219 }
220 ++SchedPhiNum;
221 ++BestOffset;
222 } else
223 ++SchedInstrNum;
224 if (TII->isSchedulingBoundary(MI, MBB, *MF)) {
225 LLVM_DEBUG(
226 dbgs() << "Boundary MI is not allowed in window scheduling!\n");
227 return false;
228 }
229 if (PLI->shouldIgnoreForPipelining(&MI)) {
230 LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in "
231 "window scheduling!\n");
232 return false;
233 }
234 for (auto &Def : MI.all_defs())
235 if (Def.isReg() && Def.getReg().isPhysical()) {
236 LLVM_DEBUG(dbgs() << "Physical registers are not supported in "
237 "window scheduling!\n");
238 return false;
239 }
240 }
241 if (SchedInstrNum <= WindowRegionLimit) {
242 LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
243 return false;
244 }
245 return true;
246 }
247
preProcess()248 void WindowScheduler::preProcess() {
249 // Prior to window scheduling, it's necessary to backup the original MBB,
250 // generate a new TripleMBB, and build a TripleDAG based on the TripleMBB.
251 backupMBB();
252 generateTripleMBB();
253 TripleDAG->startBlock(MBB);
254 TripleDAG->enterRegion(
255 MBB, MBB->begin(), MBB->getFirstTerminator(),
256 std::distance(MBB->begin(), MBB->getFirstTerminator()));
257 TripleDAG->buildSchedGraph(Context->AA);
258 }
259
postProcess()260 void WindowScheduler::postProcess() {
261 // After window scheduling, it's necessary to clear the TripleDAG and restore
262 // to the original MBB.
263 TripleDAG->exitRegion();
264 TripleDAG->finishBlock();
265 restoreMBB();
266 }
267
backupMBB()268 void WindowScheduler::backupMBB() {
269 for (auto &MI : MBB->instrs())
270 OriMIs.push_back(&MI);
271 // Remove MIs and the corresponding live intervals.
272 for (auto &MI : make_early_inc_range(*MBB)) {
273 Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
274 MBB->remove(&MI);
275 }
276 }
277
restoreMBB()278 void WindowScheduler::restoreMBB() {
279 // Erase MIs and the corresponding live intervals.
280 for (auto &MI : make_early_inc_range(*MBB)) {
281 Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
282 MI.eraseFromParent();
283 }
284 // Restore MBB to the state before window scheduling.
285 for (auto *MI : OriMIs)
286 MBB->push_back(MI);
287 updateLiveIntervals();
288 }
289
generateTripleMBB()290 void WindowScheduler::generateTripleMBB() {
291 const unsigned DuplicateNum = 3;
292 TriMIs.clear();
293 TriToOri.clear();
294 assert(OriMIs.size() > 0 && "The Original MIs were not backed up!");
295 // Step 1: Performing the first copy of MBB instructions, excluding
296 // terminators. At the same time, we back up the anti-register of phis.
297 // DefPairs hold the old and new define register pairs.
298 DenseMap<Register, Register> DefPairs;
299 for (auto *MI : OriMIs) {
300 if (MI->isMetaInstruction() || MI->isTerminator())
301 continue;
302 if (MI->isPHI())
303 if (Register AntiReg = getAntiRegister(MI))
304 DefPairs[MI->getOperand(0).getReg()] = AntiReg;
305 auto *NewMI = MF->CloneMachineInstr(MI);
306 MBB->push_back(NewMI);
307 TriMIs.push_back(NewMI);
308 TriToOri[NewMI] = MI;
309 }
310 // Step 2: Performing the remaining two copies of MBB instructions excluding
311 // phis, and the last one contains terminators. At the same time, registers
312 // are updated accordingly.
313 for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) {
314 for (auto *MI : OriMIs) {
315 if (MI->isPHI() || MI->isMetaInstruction() ||
316 (MI->isTerminator() && Cnt < DuplicateNum - 1))
317 continue;
318 auto *NewMI = MF->CloneMachineInstr(MI);
319 DenseMap<Register, Register> NewDefs;
320 // New defines are updated.
321 for (auto MO : NewMI->all_defs())
322 if (MO.isReg() && MO.getReg().isVirtual()) {
323 Register NewDef =
324 MRI->createVirtualRegister(MRI->getRegClass(MO.getReg()));
325 NewMI->substituteRegister(MO.getReg(), NewDef, 0, *TRI);
326 NewDefs[MO.getReg()] = NewDef;
327 }
328 // New uses are updated.
329 for (auto DefRegPair : DefPairs)
330 if (NewMI->readsRegister(DefRegPair.first, TRI)) {
331 Register NewUse = DefRegPair.second;
332 // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3':
333 //
334 // BB.3: DefPairs
335 // ==================================
336 // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] (%1,%7)
337 // ...
338 // ==================================
339 // ...
340 // %4 = sub i32 %1, %3
341 // ...
342 // %7 = add i32 %5, %6
343 // ...
344 // ----------------------------------
345 // ...
346 // %8 = sub i32 %7, %3 (%1,%7),(%4,%8)
347 // ...
348 // %9 = add i32 %5, %6 (%1,%7),(%4,%8),(%7,%9)
349 // ...
350 // ----------------------------------
351 // ...
352 // %10 = sub i32 %9, %3 (%1,%7),(%4,%10),(%7,%9)
353 // ... ^
354 // %11 = add i32 %5, %6 (%1,%7),(%4,%10),(%7,%11)
355 // ...
356 // ==================================
357 // < Terminators >
358 // ==================================
359 if (DefPairs.count(NewUse))
360 NewUse = DefPairs[NewUse];
361 NewMI->substituteRegister(DefRegPair.first, NewUse, 0, *TRI);
362 }
363 // DefPairs is updated at last.
364 for (auto &NewDef : NewDefs)
365 DefPairs[NewDef.first] = NewDef.second;
366 MBB->push_back(NewMI);
367 TriMIs.push_back(NewMI);
368 TriToOri[NewMI] = MI;
369 }
370 }
371 // Step 3: The registers used by phis are updated, and they are generated in
372 // the third copy of MBB.
373 // In the privious example, the old phi is:
374 // %1 = phi i32 [%2, %BB.1], [%7, %BB.3]
375 // The new phi is:
376 // %1 = phi i32 [%2, %BB.1], [%11, %BB.3]
377 for (auto &Phi : MBB->phis()) {
378 for (auto DefRegPair : DefPairs)
379 if (Phi.readsRegister(DefRegPair.first, TRI))
380 Phi.substituteRegister(DefRegPair.first, DefRegPair.second, 0, *TRI);
381 }
382 updateLiveIntervals();
383 }
384
restoreTripleMBB()385 void WindowScheduler::restoreTripleMBB() {
386 // After list scheduling, the MBB is restored in one traversal.
387 for (size_t I = 0; I < TriMIs.size(); ++I) {
388 auto *MI = TriMIs[I];
389 auto OldPos = MBB->begin();
390 std::advance(OldPos, I);
391 auto CurPos = MI->getIterator();
392 if (CurPos != OldPos) {
393 MBB->splice(OldPos, MBB, CurPos);
394 Context->LIS->handleMove(*MI, /*UpdateFlags=*/false);
395 }
396 }
397 }
398
getSearchIndexes(unsigned SearchNum,unsigned SearchRatio)399 SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum,
400 unsigned SearchRatio) {
401 // We use SearchRatio to get the index range, and then evenly get the indexes
402 // according to the SearchNum. This is a simple huristic. Depending on the
403 // characteristics of the target, more complex algorithms can be used for both
404 // performance and compilation time.
405 assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!");
406 unsigned MaxIdx = SchedInstrNum * SearchRatio / 100;
407 unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? MaxIdx / SearchNum : 1;
408 SmallVector<unsigned> SearchIndexes;
409 for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step)
410 SearchIndexes.push_back(Idx);
411 return SearchIndexes;
412 }
413
getEstimatedII(ScheduleDAGInstrs & DAG)414 int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) {
415 // Sometimes MaxDepth is 0, so it should be limited to the minimum of 1.
416 unsigned MaxDepth = 1;
417 for (auto &SU : DAG.SUnits)
418 MaxDepth = std::max(SU.getDepth() + SU.Latency, MaxDepth);
419 return MaxDepth * WindowIICoeff;
420 }
421
calculateMaxCycle(ScheduleDAGInstrs & DAG,unsigned Offset)422 int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
423 unsigned Offset) {
424 int InitII = getEstimatedII(DAG);
425 ResourceManager RM(Subtarget, &DAG);
426 RM.init(InitII);
427 // ResourceManager and DAG are used to calculate the maximum cycle for the
428 // scheduled MIs. Since MIs in the Region have already been scheduled, the
429 // emit cycles can be estimated in order here.
430 int CurCycle = 0;
431 auto Range = getScheduleRange(Offset, SchedInstrNum);
432 for (auto &MI : Range) {
433 auto *SU = DAG.getSUnit(&MI);
434 int ExpectCycle = CurCycle;
435 // The predecessors of current MI determine its earliest issue cycle.
436 for (auto &Pred : SU->Preds) {
437 if (Pred.isWeak())
438 continue;
439 auto *PredMI = Pred.getSUnit()->getInstr();
440 int PredCycle = getOriCycle(PredMI);
441 ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency());
442 }
443 // Zero cost instructions do not need to check resource.
444 if (!TII->isZeroCost(MI.getOpcode())) {
445 // ResourceManager can be used to detect resource conflicts between the
446 // current MI and the previously inserted MIs.
447 while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
448 ++CurCycle;
449 if (CurCycle == (int)WindowIILimit)
450 return CurCycle;
451 }
452 RM.reserveResources(*SU, CurCycle);
453 }
454 OriToCycle[getOriMI(&MI)] = CurCycle;
455 LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S."
456 << getOriStage(getOriMI(&MI), Offset) << "]: " << MI);
457 }
458 LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n");
459 return CurCycle;
460 }
461
462 // By utilizing TripleDAG, we can easily establish dependencies between A and B.
463 // Based on the MaxCycle and the issue cycle of A and B, we can determine
464 // whether it is necessary to add a stall cycle. This is because, without
465 // inserting the stall cycle, the latency constraint between A and B cannot be
466 // satisfied. The details are as follows:
467 //
468 // New MBB:
469 // ========================================
470 // < Phis >
471 // ======================================== (sliding direction)
472 // MBB copy 1 |
473 // V
474 //
475 // ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~ ----schedule window-----
476 // | |
477 // ===================V==================== |
478 // MBB copy 2 < MI B > |
479 // |
480 // < MI A > V
481 // ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~ ------------------------
482 // :
483 // ===================V====================
484 // MBB copy 3 < MI B'>
485 //
486 //
487 //
488 //
489 // ========================================
490 // < Terminators >
491 // ========================================
calculateStallCycle(unsigned Offset,int MaxCycle)492 int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
493 int MaxStallCycle = 0;
494 int CurrentII = MaxCycle + 1;
495 auto Range = getScheduleRange(Offset, SchedInstrNum);
496 for (auto &MI : Range) {
497 auto *SU = TripleDAG->getSUnit(&MI);
498 int DefCycle = getOriCycle(&MI);
499 for (auto &Succ : SU->Succs) {
500 if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU)
501 continue;
502 // If the expected cycle does not exceed CurrentII, no check is needed.
503 if (DefCycle + (int)Succ.getLatency() <= CurrentII)
504 continue;
505 // If the cycle of the scheduled MI A is less than that of the scheduled
506 // MI B, the scheduling will fail because the lifetime of the
507 // corresponding register exceeds II.
508 auto *SuccMI = Succ.getSUnit()->getInstr();
509 int UseCycle = getOriCycle(SuccMI);
510 if (DefCycle < UseCycle)
511 return WindowIILimit;
512 // Get the stall cycle introduced by the register between two trips.
513 int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle;
514 MaxStallCycle = std::max(MaxStallCycle, StallCycle);
515 }
516 }
517 LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n");
518 return MaxStallCycle;
519 }
520
analyseII(ScheduleDAGInstrs & DAG,unsigned Offset)521 unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) {
522 LLVM_DEBUG(dbgs() << "Start analyzing II:\n");
523 int MaxCycle = calculateMaxCycle(DAG, Offset);
524 if (MaxCycle == (int)WindowIILimit)
525 return MaxCycle;
526 int StallCycle = calculateStallCycle(Offset, MaxCycle);
527 if (StallCycle == (int)WindowIILimit)
528 return StallCycle;
529 // The value of II is equal to the maximum execution cycle plus 1.
530 return MaxCycle + StallCycle + 1;
531 }
532
schedulePhi(int Offset,unsigned & II)533 void WindowScheduler::schedulePhi(int Offset, unsigned &II) {
534 LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n");
535 for (auto &Phi : MBB->phis()) {
536 int LateCycle = INT_MAX;
537 auto *SU = TripleDAG->getSUnit(&Phi);
538 for (auto &Succ : SU->Succs) {
539 // Phi doesn't have any Anti successors.
540 if (Succ.getKind() != SDep::Data)
541 continue;
542 // Phi is scheduled before the successor of stage 0. The issue cycle of
543 // phi is the latest cycle in this interval.
544 auto *SuccMI = Succ.getSUnit()->getInstr();
545 int Cycle = getOriCycle(SuccMI);
546 if (getOriStage(getOriMI(SuccMI), Offset) == 0)
547 LateCycle = std::min(LateCycle, Cycle);
548 }
549 // The anti-dependency of phi need to be handled separately in the same way.
550 if (Register AntiReg = getAntiRegister(&Phi)) {
551 auto *AntiMI = MRI->getVRegDef(AntiReg);
552 // AntiReg may be defined outside the kernel MBB.
553 if (AntiMI->getParent() == MBB) {
554 auto AntiCycle = getOriCycle(AntiMI);
555 if (getOriStage(getOriMI(AntiMI), Offset) == 0)
556 LateCycle = std::min(LateCycle, AntiCycle);
557 }
558 }
559 // If there is no limit to the late cycle, a default value is given.
560 if (LateCycle == INT_MAX)
561 LateCycle = (int)(II - 1);
562 LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi);
563 // The issue cycle of phi is set to the latest cycle in the interval.
564 auto *OriPhi = getOriMI(&Phi);
565 OriToCycle[OriPhi] = LateCycle;
566 }
567 }
568
getIssueOrder(unsigned Offset,unsigned II)569 DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset,
570 unsigned II) {
571 // At each issue cycle, phi is placed before MIs in stage 0. So the simplest
572 // way is to put phi at the beginning of the current cycle.
573 DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs;
574 auto Range = getScheduleRange(Offset, SchedInstrNum);
575 for (auto &Phi : MBB->phis())
576 CycleToMIs[getOriCycle(&Phi)].push_back(getOriMI(&Phi));
577 for (auto &MI : Range)
578 CycleToMIs[getOriCycle(&MI)].push_back(getOriMI(&MI));
579 // Each MI is assigned a separate ordered Id, which is used as a sort marker
580 // in the following expand process.
581 DenseMap<MachineInstr *, int> IssueOrder;
582 int Id = 0;
583 for (int Cycle = 0; Cycle < (int)II; ++Cycle) {
584 if (!CycleToMIs.count(Cycle))
585 continue;
586 for (auto *MI : CycleToMIs[Cycle])
587 IssueOrder[MI] = Id++;
588 }
589 return IssueOrder;
590 }
591
updateScheduleResult(unsigned Offset,unsigned II)592 void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) {
593 // At the first update, Offset is equal to SchedPhiNum. At this time, only
594 // BestII, BestOffset, and BaseII need to be updated.
595 if (Offset == SchedPhiNum) {
596 BestII = II;
597 BestOffset = SchedPhiNum;
598 BaseII = II;
599 return;
600 }
601 // The update will only continue if the II is smaller than BestII and the II
602 // is sufficiently small.
603 if ((II >= BestII) || (II + WindowDiffLimit > BaseII))
604 return;
605 BestII = II;
606 BestOffset = Offset;
607 // Record the result of the current list scheduling, noting that each MI is
608 // stored unordered in SchedResult.
609 SchedResult.clear();
610 auto IssueOrder = getIssueOrder(Offset, II);
611 for (auto &Pair : OriToCycle) {
612 assert(IssueOrder.count(Pair.first) && "Cannot find original MI!");
613 SchedResult.push_back(std::make_tuple(Pair.first, Pair.second,
614 getOriStage(Pair.first, Offset),
615 IssueOrder[Pair.first]));
616 }
617 }
618
expand()619 void WindowScheduler::expand() {
620 // The MIs in the SchedResult are sorted by the issue order ID.
621 llvm::stable_sort(SchedResult,
622 [](const std::tuple<MachineInstr *, int, int, int> &A,
623 const std::tuple<MachineInstr *, int, int, int> &B) {
624 return std::get<3>(A) < std::get<3>(B);
625 });
626 // Use the scheduling infrastructure for expansion, noting that InstrChanges
627 // is not supported here.
628 DenseMap<MachineInstr *, int> Cycles, Stages;
629 std::vector<MachineInstr *> OrderedInsts;
630 for (auto &Info : SchedResult) {
631 auto *MI = std::get<0>(Info);
632 OrderedInsts.push_back(MI);
633 Cycles[MI] = std::get<1>(Info);
634 Stages[MI] = std::get<2>(Info);
635 LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI]
636 << "]: " << *MI);
637 }
638 ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
639 std::move(Stages));
640 ModuloScheduleExpander MSE(*MF, MS, *Context->LIS,
641 ModuloScheduleExpander::InstrChangesTy());
642 MSE.expand();
643 MSE.cleanup();
644 }
645
updateLiveIntervals()646 void WindowScheduler::updateLiveIntervals() {
647 SmallVector<Register, 128> UsedRegs;
648 for (MachineInstr &MI : *MBB)
649 for (const MachineOperand &MO : MI.operands()) {
650 if (!MO.isReg() || MO.getReg() == 0)
651 continue;
652 Register Reg = MO.getReg();
653 if (!is_contained(UsedRegs, Reg))
654 UsedRegs.push_back(Reg);
655 }
656 Context->LIS->repairIntervalsInRange(MBB, MBB->begin(), MBB->end(), UsedRegs);
657 }
658
659 iterator_range<MachineBasicBlock::iterator>
getScheduleRange(unsigned Offset,unsigned Num)660 WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) {
661 auto RegionBegin = MBB->begin();
662 std::advance(RegionBegin, Offset);
663 auto RegionEnd = RegionBegin;
664 std::advance(RegionEnd, Num);
665 return make_range(RegionBegin, RegionEnd);
666 }
667
getOriCycle(MachineInstr * NewMI)668 int WindowScheduler::getOriCycle(MachineInstr *NewMI) {
669 assert(TriToOri.count(NewMI) && "Cannot find original MI!");
670 auto *OriMI = TriToOri[NewMI];
671 assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!");
672 return OriToCycle[OriMI];
673 }
674
getOriMI(MachineInstr * NewMI)675 MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) {
676 assert(TriToOri.count(NewMI) && "Cannot find original MI!");
677 return TriToOri[NewMI];
678 }
679
getOriStage(MachineInstr * OriMI,unsigned Offset)680 unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) {
681 assert(llvm::find(OriMIs, OriMI) != OriMIs.end() &&
682 "Cannot find OriMI in OriMIs!");
683 // If there is no instruction fold, all MI stages are 0.
684 if (Offset == SchedPhiNum)
685 return 0;
686 // For those MIs with an ID less than the Offset, their stages are set to 0,
687 // while the rest are set to 1.
688 unsigned Id = 0;
689 for (auto *MI : OriMIs) {
690 if (MI->isMetaInstruction())
691 continue;
692 if (MI == OriMI)
693 break;
694 ++Id;
695 }
696 return Id >= (size_t)Offset ? 1 : 0;
697 }
698
getAntiRegister(MachineInstr * Phi)699 Register WindowScheduler::getAntiRegister(MachineInstr *Phi) {
700 assert(Phi->isPHI() && "Expecting PHI!");
701 Register AntiReg;
702 for (auto MO : Phi->uses()) {
703 if (MO.isReg())
704 AntiReg = MO.getReg();
705 else if (MO.isMBB() && MO.getMBB() == MBB)
706 return AntiReg;
707 }
708 return 0;
709 }
710