xref: /freebsd/contrib/llvm-project/llvm/lib/CodeGen/MachineScheduler.cpp (revision a4a491e2238b12ccd64d3faf9e6401487f6f1f1b)
1 //===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // MachineScheduler schedules machine instructions after phi elimination. It
10 // preserves LiveIntervals so it can be invoked before register allocation.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/CodeGen/MachineScheduler.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/BitVector.h"
17 #include "llvm/ADT/DenseMap.h"
18 #include "llvm/ADT/PriorityQueue.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/ADT/iterator_range.h"
23 #include "llvm/Analysis/AliasAnalysis.h"
24 #include "llvm/CodeGen/LiveInterval.h"
25 #include "llvm/CodeGen/LiveIntervals.h"
26 #include "llvm/CodeGen/MachineBasicBlock.h"
27 #include "llvm/CodeGen/MachineDominators.h"
28 #include "llvm/CodeGen/MachineFunction.h"
29 #include "llvm/CodeGen/MachineFunctionPass.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineLoopInfo.h"
32 #include "llvm/CodeGen/MachineOperand.h"
33 #include "llvm/CodeGen/MachinePassRegistry.h"
34 #include "llvm/CodeGen/MachineRegisterInfo.h"
35 #include "llvm/CodeGen/RegisterClassInfo.h"
36 #include "llvm/CodeGen/RegisterPressure.h"
37 #include "llvm/CodeGen/ScheduleDAG.h"
38 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
39 #include "llvm/CodeGen/ScheduleDAGMutation.h"
40 #include "llvm/CodeGen/ScheduleDFS.h"
41 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
42 #include "llvm/CodeGen/SlotIndexes.h"
43 #include "llvm/CodeGen/TargetFrameLowering.h"
44 #include "llvm/CodeGen/TargetInstrInfo.h"
45 #include "llvm/CodeGen/TargetLowering.h"
46 #include "llvm/CodeGen/TargetPassConfig.h"
47 #include "llvm/CodeGen/TargetRegisterInfo.h"
48 #include "llvm/CodeGen/TargetSchedule.h"
49 #include "llvm/CodeGen/TargetSubtargetInfo.h"
50 #include "llvm/Config/llvm-config.h"
51 #include "llvm/InitializePasses.h"
52 #include "llvm/MC/LaneBitmask.h"
53 #include "llvm/Pass.h"
54 #include "llvm/Support/CommandLine.h"
55 #include "llvm/Support/Compiler.h"
56 #include "llvm/Support/Debug.h"
57 #include "llvm/Support/ErrorHandling.h"
58 #include "llvm/Support/GraphWriter.h"
59 #include "llvm/Support/MachineValueType.h"
60 #include "llvm/Support/raw_ostream.h"
61 #include <algorithm>
62 #include <cassert>
63 #include <cstdint>
64 #include <iterator>
65 #include <limits>
66 #include <memory>
67 #include <string>
68 #include <tuple>
69 #include <utility>
70 #include <vector>
71 
72 using namespace llvm;
73 
74 #define DEBUG_TYPE "machine-scheduler"
75 
76 STATISTIC(NumClustered, "Number of load/store pairs clustered");
77 
78 namespace llvm {
79 
80 cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
81                            cl::desc("Force top-down list scheduling"));
82 cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
83                             cl::desc("Force bottom-up list scheduling"));
84 cl::opt<bool>
85 DumpCriticalPathLength("misched-dcpl", cl::Hidden,
86                        cl::desc("Print critical path length to stderr"));
87 
88 cl::opt<bool> VerifyScheduling(
89     "verify-misched", cl::Hidden,
90     cl::desc("Verify machine instrs before and after machine scheduling"));
91 
92 #ifndef NDEBUG
93 cl::opt<bool> ViewMISchedDAGs(
94     "view-misched-dags", cl::Hidden,
95     cl::desc("Pop up a window to show MISched dags after they are processed"));
96 cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden,
97                         cl::desc("Print schedule DAGs"));
98 #else
99 const bool ViewMISchedDAGs = false;
100 const bool PrintDAGs = false;
101 #endif // NDEBUG
102 
103 } // end namespace llvm
104 
105 #ifndef NDEBUG
106 /// In some situations a few uninteresting nodes depend on nearly all other
107 /// nodes in the graph; provide a cutoff to hide them.
108 static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff", cl::Hidden,
109   cl::desc("Hide nodes with more predecessor/successor than cutoff"));
110   cl::desc("Hide nodes with more predecessors/successors than cutoff"));
111 static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
112   cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
113 
114 static cl::opt<std::string> SchedOnlyFunc("misched-only-func", cl::Hidden,
115   cl::desc("Only schedule this function"));
116 static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden,
117                                         cl::desc("Only schedule this MBB#"));
118 #endif // NDEBUG
119 
120 /// Avoid quadratic complexity in unusually large basic blocks by limiting the
121 /// size of the ready lists.
122 static cl::opt<unsigned> ReadyListLimit("misched-limit", cl::Hidden,
123   cl::desc("Limit ready list to N instructions"), cl::init(256));
124 
125 static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
126   cl::desc("Enable register pressure scheduling."), cl::init(true));
127 
128 static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
129   cl::desc("Enable cyclic critical path analysis."), cl::init(true));
130 
131 static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
132                                         cl::desc("Enable memop clustering."),
133                                         cl::init(true));
134 static cl::opt<bool>
135     ForceFastCluster("force-fast-cluster", cl::Hidden,
136                      cl::desc("Switch to fast cluster algorithm with the lost "
137                      cl::desc("Switch to the fast cluster algorithm, losing "
138                               "some fusion opportunities"),
139 static cl::opt<unsigned>
140     FastClusterThreshold("fast-cluster-threshold", cl::Hidden,
141                          cl::desc("The threshold for fast cluster"),
142                          cl::init(1000));
143 
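// Editorial note (not part of the original source): the options above are
// ordinary cl::opt flags, so they can be passed to an llc-style driver for
// experiments. Illustrative invocations, assuming a build that compiles in the
// NDEBUG-only options:
//   llc -misched-topdown -misched-cluster=false foo.ll
//   llc -misched-cutoff=20 -misched-only-func=foo foo.ll
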
144 // DAG subtrees must have at least this many nodes.
145 static const unsigned MinSubtreeSize = 8;
146 
147 // Pin the vtables to this file.
148 void MachineSchedStrategy::anchor() {}
149 
150 void ScheduleDAGMutation::anchor() {}
151 
152 //===----------------------------------------------------------------------===//
153 // Machine Instruction Scheduling Pass and Registry
154 //===----------------------------------------------------------------------===//
155 
156 MachineSchedContext::MachineSchedContext() {
157   RegClassInfo = new RegisterClassInfo();
158 }
159 
160 MachineSchedContext::~MachineSchedContext() {
161   delete RegClassInfo;
162 }
163 
164 namespace {
165 
166 /// Base class for a machine scheduler that can run at any point.
167 class MachineSchedulerBase : public MachineSchedContext,
168                              public MachineFunctionPass {
169 public:
170   MachineSchedulerBase(char &ID): MachineFunctionPass(ID) {}
171 
172   void print(raw_ostream &O, const Module* = nullptr) const override;
173 
174 protected:
175   void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags);
176 };
177 
178 /// MachineScheduler runs after coalescing and before register allocation.
179 class MachineScheduler : public MachineSchedulerBase {
180 public:
181   MachineScheduler();
182 
183   void getAnalysisUsage(AnalysisUsage &AU) const override;
184 
185   bool runOnMachineFunction(MachineFunction&) override;
186 
187   static char ID; // Class identification, replacement for typeinfo
188 
189 protected:
190   ScheduleDAGInstrs *createMachineScheduler();
191 };
192 
193 /// PostMachineScheduler runs shortly before code emission.
194 class PostMachineScheduler : public MachineSchedulerBase {
195 public:
196   PostMachineScheduler();
197 
198   void getAnalysisUsage(AnalysisUsage &AU) const override;
199 
200   bool runOnMachineFunction(MachineFunction&) override;
201 
202   static char ID; // Class identification, replacement for typeinfo
203 
204 protected:
205   ScheduleDAGInstrs *createPostMachineScheduler();
206 };
207 
208 } // end anonymous namespace
209 
210 char MachineScheduler::ID = 0;
211 
212 char &llvm::MachineSchedulerID = MachineScheduler::ID;
213 
214 INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE,
215                       "Machine Instruction Scheduler", false, false)
216 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
217 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
218 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
219 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
220 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
221 INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE,
222                     "Machine Instruction Scheduler", false, false)
223 
224 MachineScheduler::MachineScheduler() : MachineSchedulerBase(ID) {
225   initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
226 }
227 
228 void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
229   AU.setPreservesCFG();
230   AU.addRequired<MachineDominatorTree>();
231   AU.addRequired<MachineLoopInfo>();
232   AU.addRequired<AAResultsWrapperPass>();
233   AU.addRequired<TargetPassConfig>();
234   AU.addRequired<SlotIndexes>();
235   AU.addPreserved<SlotIndexes>();
236   AU.addRequired<LiveIntervals>();
237   AU.addPreserved<LiveIntervals>();
238   MachineFunctionPass::getAnalysisUsage(AU);
239 }
240 
241 char PostMachineScheduler::ID = 0;
242 
243 char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID;
244 
245 INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched",
246                       "PostRA Machine Instruction Scheduler", false, false)
247 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
248 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
249 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
250 INITIALIZE_PASS_END(PostMachineScheduler, "postmisched",
251                     "PostRA Machine Instruction Scheduler", false, false)
252 
253 PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) {
254   initializePostMachineSchedulerPass(*PassRegistry::getPassRegistry());
255 }
256 
257 void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
258   AU.setPreservesCFG();
259   AU.addRequired<MachineDominatorTree>();
260   AU.addRequired<MachineLoopInfo>();
261   AU.addRequired<AAResultsWrapperPass>();
262   AU.addRequired<TargetPassConfig>();
263   MachineFunctionPass::getAnalysisUsage(AU);
264 }
265 
266 MachinePassRegistry<MachineSchedRegistry::ScheduleDAGCtor>
267     MachineSchedRegistry::Registry;
268 
269 /// A dummy default scheduler factory; it acts as a sentinel for detecting
270 /// whether the scheduler has been overridden on the command line.
271 static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {
272   return nullptr;
273 }
274 
275 /// MachineSchedOpt allows command line selection of the scheduler.
276 static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
277                RegisterPassParser<MachineSchedRegistry>>
278 MachineSchedOpt("misched",
279                 cl::init(&useDefaultMachineSched), cl::Hidden,
280                 cl::desc("Machine instruction scheduler to use"));
281 
282 static MachineSchedRegistry
283 DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
284                      useDefaultMachineSched);
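
// Editorial sketch (not part of the original source): a target or plugin can
// hook into the registry above by pairing a factory with a static registration
// object; "MyStrategy" below is a hypothetical MachineSchedStrategy subclass.
//
//   static ScheduleDAGInstrs *createMySched(MachineSchedContext *C) {
//     // Reuse the live-interval-preserving DAG with a custom strategy.
//     return new ScheduleDAGMILive(C, std::make_unique<MyStrategy>(C));
//   }
//   static MachineSchedRegistry
//   MySchedRegistry("my-sched", "Run my custom scheduler.", createMySched);
//
// The registered name then becomes selectable with -misched=my-sched, just
// like "default" above.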
285 
286 static cl::opt<bool> EnableMachineSched(
287     "enable-misched",
288     cl::desc("Enable the machine instruction scheduling pass."), cl::init(true),
289     cl::Hidden);
290 
291 static cl::opt<bool> EnablePostRAMachineSched(
292     "enable-post-misched",
293     cl::desc("Enable the post-ra machine instruction scheduling pass."),
294     cl::init(true), cl::Hidden);
295 
296 /// Decrement this iterator until reaching the top or a non-debug instr.
297 static MachineBasicBlock::const_iterator
298 priorNonDebug(MachineBasicBlock::const_iterator I,
299               MachineBasicBlock::const_iterator Beg) {
300   assert(I != Beg && "reached the top of the region, cannot decrement");
301   while (--I != Beg) {
302     if (!I->isDebugOrPseudoInstr())
303       break;
304   }
305   return I;
306 }
307 
308 /// Non-const version.
309 static MachineBasicBlock::iterator
310 priorNonDebug(MachineBasicBlock::iterator I,
311               MachineBasicBlock::const_iterator Beg) {
312   return priorNonDebug(MachineBasicBlock::const_iterator(I), Beg)
313       .getNonConstIterator();
314 }
315 
316 /// If this iterator is a debug value, increment until reaching the End or a
317 /// non-debug instruction.
318 static MachineBasicBlock::const_iterator
319 nextIfDebug(MachineBasicBlock::const_iterator I,
320             MachineBasicBlock::const_iterator End) {
321   for(; I != End; ++I) {
322     if (!I->isDebugOrPseudoInstr())
323       break;
324   }
325   return I;
326 }
327 
328 /// Non-const version.
329 static MachineBasicBlock::iterator
330 nextIfDebug(MachineBasicBlock::iterator I,
331             MachineBasicBlock::const_iterator End) {
332   return nextIfDebug(MachineBasicBlock::const_iterator(I), End)
333       .getNonConstIterator();
334 }
335 
336 /// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
337 ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() {
338   // Select the scheduler, or set the default.
339   MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
340   if (Ctor != useDefaultMachineSched)
341     return Ctor(this);
342 
343   // Get the default scheduler set by the target for this function.
344   ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(this);
345   if (Scheduler)
346     return Scheduler;
347 
348   // Default to GenericScheduler.
349   return createGenericSchedLive(this);
350 }
351 
352 /// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by
353 /// the caller. We don't have a command line option to override the postRA
354 /// scheduler. The Target must configure it.
355 ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() {
356   // Get the postRA scheduler set by the target for this function.
357   ScheduleDAGInstrs *Scheduler = PassConfig->createPostMachineScheduler(this);
358   if (Scheduler)
359     return Scheduler;
360 
361   // Default to GenericScheduler.
362   return createGenericSchedPostRA(this);
363 }
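
// Editorial note (not part of the original source): a target typically opts in
// to this pass by returning true from its subtarget's
// enablePostRAMachineScheduler() hook (checked in runOnMachineFunction below)
// and, when it wants a custom DAG or strategy, by overriding
// TargetPassConfig::createPostMachineScheduler(); otherwise the generic
// post-RA scheduler from createGenericSchedPostRA() is used.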
364 
365 /// Top-level MachineScheduler pass driver.
366 ///
367 /// Visit blocks in function order. Divide each block into scheduling regions
368 /// and visit them bottom-up. Visiting regions bottom-up is not required, but is
369 /// consistent with the DAG builder, which traverses the interior of the
370 /// scheduling regions bottom-up.
371 ///
372 /// This design avoids exposing scheduling boundaries to the DAG builder,
373 /// simplifying the DAG builder's support for "special" target instructions.
374 /// At the same time the design allows target schedulers to operate across
375 /// scheduling boundaries, for example to bundle the boundary instructions
376 /// without reordering them. This creates complexity, because the target
377 /// scheduler must update the RegionBegin and RegionEnd positions cached by
378 /// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler
379 /// design would be to split blocks at scheduling boundaries, but LLVM has a
380 /// general bias against block splitting purely for implementation simplicity.
381 bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
382   if (skipFunction(mf.getFunction()))
383     return false;
384 
385   if (EnableMachineSched.getNumOccurrences()) {
386     if (!EnableMachineSched)
387       return false;
388   } else if (!mf.getSubtarget().enableMachineScheduler())
389     return false;
390 
391   LLVM_DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs()));
392 
393   // Initialize the context of the pass.
394   MF = &mf;
395   MLI = &getAnalysis<MachineLoopInfo>();
396   MDT = &getAnalysis<MachineDominatorTree>();
397   PassConfig = &getAnalysis<TargetPassConfig>();
398   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
399 
400   LIS = &getAnalysis<LiveIntervals>();
401 
402   if (VerifyScheduling) {
403     LLVM_DEBUG(LIS->dump());
404     MF->verify(this, "Before machine scheduling.");
405   }
406   RegClassInfo->runOnMachineFunction(*MF);
407 
408   // Instantiate the selected scheduler for this target, function, and
409   // optimization level.
410   std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
411   scheduleRegions(*Scheduler, false);
412 
413   LLVM_DEBUG(LIS->dump());
414   if (VerifyScheduling)
415     MF->verify(this, "After machine scheduling.");
416   return true;
417 }
418 
419 bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
420   if (skipFunction(mf.getFunction()))
421     return false;
422 
423   if (EnablePostRAMachineSched.getNumOccurrences()) {
424     if (!EnablePostRAMachineSched)
425       return false;
426   } else if (!mf.getSubtarget().enablePostRAMachineScheduler()) {
427     LLVM_DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");
428     return false;
429   }
430   LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs()));
431 
432   // Initialize the context of the pass.
433   MF = &mf;
434   MLI = &getAnalysis<MachineLoopInfo>();
435   PassConfig = &getAnalysis<TargetPassConfig>();
436   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
437 
438   if (VerifyScheduling)
439     MF->verify(this, "Before post machine scheduling.");
440 
441   // Instantiate the selected scheduler for this target, function, and
442   // optimization level.
443   std::unique_ptr<ScheduleDAGInstrs> Scheduler(createPostMachineScheduler());
444   scheduleRegions(*Scheduler, true);
445 
446   if (VerifyScheduling)
447     MF->verify(this, "After post machine scheduling.");
448   return true;
449 }
450 
451 /// Return true if the given instruction should not be included in a scheduling
452 /// region.
453 ///
454 /// MachineScheduler does not currently support scheduling across calls. To
455 /// handle calls, the DAG builder needs to be modified to create register
456 /// anti/output dependencies on the registers clobbered by the call's regmask
457 /// operand. In PreRA scheduling, the stack pointer adjustment already prevents
458 /// scheduling across calls. In PostRA scheduling, we need the isCall to enforce
459 /// the boundary, but there would be no benefit to postRA scheduling across
460 /// calls this late anyway.
461 static bool isSchedBoundary(MachineBasicBlock::iterator MI,
462                             MachineBasicBlock *MBB,
463                             MachineFunction *MF,
464                             const TargetInstrInfo *TII) {
465   return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF);
466 }
467 
468 /// A region of an MBB for scheduling.
469 namespace {
470 struct SchedRegion {
471   /// RegionBegin is the first instruction in the scheduling region, and
472   /// RegionEnd is either MBB->end() or the scheduling boundary after the
473   /// last instruction in the scheduling region. These iterators cannot refer
474   /// to instructions outside of the identified scheduling region because
475   /// those may be reordered before scheduling this region.
476   MachineBasicBlock::iterator RegionBegin;
477   MachineBasicBlock::iterator RegionEnd;
478   unsigned NumRegionInstrs;
479 
480   SchedRegion(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E,
481               unsigned N) :
482     RegionBegin(B), RegionEnd(E), NumRegionInstrs(N) {}
483 };
484 } // end anonymous namespace
485 
486 using MBBRegionsVector = SmallVector<SchedRegion, 16>;
487 
488 static void
489 getSchedRegions(MachineBasicBlock *MBB,
490                 MBBRegionsVector &Regions,
491                 bool RegionsTopDown) {
492   MachineFunction *MF = MBB->getParent();
493   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
494 
495   MachineBasicBlock::iterator I = nullptr;
496   for(MachineBasicBlock::iterator RegionEnd = MBB->end();
497       RegionEnd != MBB->begin(); RegionEnd = I) {
498 
499     // Avoid decrementing RegionEnd for blocks with no terminator.
500     if (RegionEnd != MBB->end() ||
501         isSchedBoundary(&*std::prev(RegionEnd), &*MBB, MF, TII)) {
502       --RegionEnd;
503     }
504 
505     // The next region starts above the previous region. Look backward in the
506     // instruction stream until we find the nearest boundary.
507     unsigned NumRegionInstrs = 0;
508     I = RegionEnd;
509     for (;I != MBB->begin(); --I) {
510       MachineInstr &MI = *std::prev(I);
511       if (isSchedBoundary(&MI, &*MBB, MF, TII))
512         break;
513       if (!MI.isDebugOrPseudoInstr()) {
514         // MBB::size() uses instr_iterator to count. Here we need a bundle to
515         // count as a single instruction.
516         ++NumRegionInstrs;
517       }
518     }
519 
520     // It's possible we found a scheduling region that only has debug
521     // instructions. Don't bother scheduling these.
522     if (NumRegionInstrs != 0)
523       Regions.push_back(SchedRegion(I, RegionEnd, NumRegionInstrs));
524   }
525 
526   if (RegionsTopDown)
527     std::reverse(Regions.begin(), Regions.end());
528 }
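
// Illustrative example (editorial, not from the original source): for a block
// of the form
//   %a = ...
//   %b = ...
//   CALL @f
//   %c = ...
//   %d = ...
// the call is a scheduling boundary (see isSchedBoundary above), so the block
// yields two regions, roughly {%a, %b} and {%c, %d}. The call itself is never
// reordered and does not become part of either region's DAG.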
529 
530 /// Main driver for both MachineScheduler and PostMachineScheduler.
531 void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
532                                            bool FixKillFlags) {
533   // Visit all machine basic blocks.
534   //
535   // TODO: Visit blocks in global postorder or postorder within the bottom-up
536   // loop tree. Then we can optionally compute global RegPressure.
537   for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
538        MBB != MBBEnd; ++MBB) {
539 
540     Scheduler.startBlock(&*MBB);
541 
542 #ifndef NDEBUG
543     if (SchedOnlyFunc.getNumOccurrences() && SchedOnlyFunc != MF->getName())
544       continue;
545     if (SchedOnlyBlock.getNumOccurrences()
546         && (int)SchedOnlyBlock != MBB->getNumber())
547       continue;
548 #endif
549 
550     // Break the block into scheduling regions [I, RegionEnd). RegionEnd
551     // points to the scheduling boundary at the bottom of the region. The DAG
552     // does not include RegionEnd, but the region does (i.e. the next
553     // RegionEnd is above the previous RegionBegin). If the current block has
554     // no terminator then RegionEnd == MBB->end() for the bottom region.
555     //
556     // All the regions of MBB are first found and stored in MBBRegions; they are
557     // processed top-down within the MBB if doMBBSchedRegionsTopDown() is true.
558     //
559     // The Scheduler may insert instructions during either schedule() or
560     // exitRegion(), even for empty regions. So the local iterators 'I' and
561     // 'RegionEnd' are invalid across these calls. Instructions must not be
562     // added to regions other than the current one without updating MBBRegions.
563 
564     MBBRegionsVector MBBRegions;
565     getSchedRegions(&*MBB, MBBRegions, Scheduler.doMBBSchedRegionsTopDown());
566     for (const SchedRegion &R : MBBRegions) {
567       MachineBasicBlock::iterator I = R.RegionBegin;
568       MachineBasicBlock::iterator RegionEnd = R.RegionEnd;
569       unsigned NumRegionInstrs = R.NumRegionInstrs;
570 
571       // Notify the scheduler of the region, even if we may skip scheduling
572       // it. Perhaps it still needs to be bundled.
573       Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs);
574 
575       // Skip empty scheduling regions (0 or 1 schedulable instructions).
576       if (I == RegionEnd || I == std::prev(RegionEnd)) {
577         // Close the current region. Bundle the terminator if needed.
578         // This invalidates 'RegionEnd' and 'I'.
579         Scheduler.exitRegion();
580         continue;
581       }
582       LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
583       LLVM_DEBUG(dbgs() << MF->getName() << ":" << printMBBReference(*MBB)
584                         << " " << MBB->getName() << "\n  From: " << *I
585                         << "    To: ";
586                  if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
587                  else dbgs() << "End\n";
588                  dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
589       if (DumpCriticalPathLength) {
590         errs() << MF->getName();
591         errs() << ":%bb. " << MBB->getNumber();
592         errs() << " " << MBB->getName() << " \n";
593       }
594 
595       // Schedule a region: possibly reorder instructions.
596       // This invalidates the original region iterators.
597       Scheduler.schedule();
598 
599       // Close the current region.
600       Scheduler.exitRegion();
601     }
602     Scheduler.finishBlock();
603     // FIXME: Ideally, no further passes should rely on kill flags. However,
604     // thumb2 size reduction is currently an exception, so the PostMIScheduler
605     // needs to do this.
606     if (FixKillFlags)
607       Scheduler.fixupKills(*MBB);
608   }
609   Scheduler.finalizeSchedule();
610 }
611 
612 void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {
613   // unimplemented
614 }
615 
616 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
617 LLVM_DUMP_METHOD void ReadyQueue::dump() const {
618   dbgs() << "Queue " << Name << ": ";
619   for (const SUnit *SU : Queue)
620     dbgs() << SU->NodeNum << " ";
621   dbgs() << "\n";
622 }
623 #endif
624 
625 //===----------------------------------------------------------------------===//
626 // ScheduleDAGMI - Basic machine instruction scheduling. This is
627 // independent of PreRA/PostRA scheduling and involves no extra book-keeping for
628 // virtual registers.
629 //===----------------------------------------------------------------------===//
630 
631 // Provide a vtable anchor.
632 ScheduleDAGMI::~ScheduleDAGMI() = default;
633 
634 /// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
635 /// NumPredsLeft reaches zero, release the successor node.
636 ///
637 /// FIXME: Adjust SuccSU height based on MinLatency.
638 void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
639   SUnit *SuccSU = SuccEdge->getSUnit();
640 
641   if (SuccEdge->isWeak()) {
642     --SuccSU->WeakPredsLeft;
643     if (SuccEdge->isCluster())
644       NextClusterSucc = SuccSU;
645     return;
646   }
647 #ifndef NDEBUG
648   if (SuccSU->NumPredsLeft == 0) {
649     dbgs() << "*** Scheduling failed! ***\n";
650     dumpNode(*SuccSU);
651     dbgs() << " has been released too many times!\n";
652     llvm_unreachable(nullptr);
653   }
654 #endif
655   // SU->TopReadyCycle was set to CurrCycle when it was scheduled. However,
656   // CurrCycle may have advanced since then.
657   if (SuccSU->TopReadyCycle < SU->TopReadyCycle + SuccEdge->getLatency())
658     SuccSU->TopReadyCycle = SU->TopReadyCycle + SuccEdge->getLatency();
659 
660   --SuccSU->NumPredsLeft;
661   if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
662     SchedImpl->releaseTopNode(SuccSU);
663 }
664 
665 /// releaseSuccessors - Call releaseSucc on each of SU's successors.
666 void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
667   for (SDep &Succ : SU->Succs)
668     releaseSucc(SU, &Succ);
669 }
670 
671 /// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
672 /// NumSuccsLeft reaches zero, release the predecessor node.
673 ///
674 /// FIXME: Adjust PredSU height based on MinLatency.
675 void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
676   SUnit *PredSU = PredEdge->getSUnit();
677 
678   if (PredEdge->isWeak()) {
679     --PredSU->WeakSuccsLeft;
680     if (PredEdge->isCluster())
681       NextClusterPred = PredSU;
682     return;
683   }
684 #ifndef NDEBUG
685   if (PredSU->NumSuccsLeft == 0) {
686     dbgs() << "*** Scheduling failed! ***\n";
687     dumpNode(*PredSU);
688     dbgs() << " has been released too many times!\n";
689     llvm_unreachable(nullptr);
690   }
691 #endif
692   // SU->BotReadyCycle was set to CurrCycle when it was scheduled. However,
693   // CurrCycle may have advanced since then.
694   if (PredSU->BotReadyCycle < SU->BotReadyCycle + PredEdge->getLatency())
695     PredSU->BotReadyCycle = SU->BotReadyCycle + PredEdge->getLatency();
696 
697   --PredSU->NumSuccsLeft;
698   if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU)
699     SchedImpl->releaseBottomNode(PredSU);
700 }
701 
702 /// releasePredecessors - Call releasePred on each of SU's predecessors.
703 void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
704   for (SDep &Pred : SU->Preds)
705     releasePred(SU, &Pred);
706 }
707 
708 void ScheduleDAGMI::startBlock(MachineBasicBlock *bb) {
709   ScheduleDAGInstrs::startBlock(bb);
710   SchedImpl->enterMBB(bb);
711 }
712 
713 void ScheduleDAGMI::finishBlock() {
714   SchedImpl->leaveMBB();
715   ScheduleDAGInstrs::finishBlock();
716 }
717 
718 /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
719 /// crossing a scheduling boundary. [begin, end) includes all instructions in
720 /// the region, including the boundary itself and single-instruction regions
721 /// that don't get scheduled.
722 void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
723                                      MachineBasicBlock::iterator begin,
724                                      MachineBasicBlock::iterator end,
725                                      unsigned regioninstrs)
726 {
727   ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
728 
729   SchedImpl->initPolicy(begin, end, regioninstrs);
730 }
731 
732 /// This is normally called from the main scheduler loop but may also be invoked
733 /// by the scheduling strategy to perform additional code motion.
734 void ScheduleDAGMI::moveInstruction(
735   MachineInstr *MI, MachineBasicBlock::iterator InsertPos) {
736   // Advance RegionBegin if the first instruction moves down.
737   if (&*RegionBegin == MI)
738     ++RegionBegin;
739 
740   // Update the instruction stream.
741   BB->splice(InsertPos, BB, MI);
742 
743   // Update LiveIntervals
744   if (LIS)
745     LIS->handleMove(*MI, /*UpdateFlags=*/true);
746 
747   // Recede RegionBegin if an instruction moves above the first.
748   if (RegionBegin == InsertPos)
749     RegionBegin = MI;
750 }
751 
752 bool ScheduleDAGMI::checkSchedLimit() {
753 #if LLVM_ENABLE_ABI_BREAKING_CHECKS && !defined(NDEBUG)
754   if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) {
755     CurrentTop = CurrentBottom;
756     return false;
757   }
758   ++NumInstrsScheduled;
759 #endif
760   return true;
761 }
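
// Editorial note (not part of the original source): the cutoff check above is
// only compiled in when ABI-breaking checks are enabled, so -misched-cutoff=N
// is a debug-build tool. It can be used to bisect scheduler-induced problems:
// once the limit is reached, CurrentTop collapses to CurrentBottom and the
// remaining instructions in the region keep their original order.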
762 
763 /// Per-region scheduling driver, called back from
764 /// MachineScheduler::runOnMachineFunction. This is a simplified driver that
765 /// does not consider liveness or register pressure. It is useful for PostRA
766 /// scheduling and potentially other custom schedulers.
767 void ScheduleDAGMI::schedule() {
768   LLVM_DEBUG(dbgs() << "ScheduleDAGMI::schedule starting\n");
769   LLVM_DEBUG(SchedImpl->dumpPolicy());
770 
771   // Build the DAG.
772   buildSchedGraph(AA);
773 
774   postprocessDAG();
775 
776   SmallVector<SUnit*, 8> TopRoots, BotRoots;
777   findRootsAndBiasEdges(TopRoots, BotRoots);
778 
779   LLVM_DEBUG(dump());
780   if (PrintDAGs) dump();
781   if (ViewMISchedDAGs) viewGraph();
782 
783   // Initialize the strategy before modifying the DAG.
784   // This may initialize a DFSResult to be used for queue priority.
785   SchedImpl->initialize(this);
786 
787   // Initialize ready queues now that the DAG and priority data are finalized.
788   initQueues(TopRoots, BotRoots);
789 
790   bool IsTopNode = false;
791   while (true) {
792     LLVM_DEBUG(dbgs() << "** ScheduleDAGMI::schedule picking next node\n");
793     SUnit *SU = SchedImpl->pickNode(IsTopNode);
794     if (!SU) break;
795 
796     assert(!SU->isScheduled && "Node already scheduled");
797     if (!checkSchedLimit())
798       break;
799 
800     MachineInstr *MI = SU->getInstr();
801     if (IsTopNode) {
802       assert(SU->isTopReady() && "node still has unscheduled dependencies");
803       if (&*CurrentTop == MI)
804         CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
805       else
806         moveInstruction(MI, CurrentTop);
807     } else {
808       assert(SU->isBottomReady() && "node still has unscheduled dependencies");
809       MachineBasicBlock::iterator priorII =
810         priorNonDebug(CurrentBottom, CurrentTop);
811       if (&*priorII == MI)
812         CurrentBottom = priorII;
813       else {
814         if (&*CurrentTop == MI)
815           CurrentTop = nextIfDebug(++CurrentTop, priorII);
816         moveInstruction(MI, CurrentBottom);
817         CurrentBottom = MI;
818       }
819     }
820     // Notify the scheduling strategy before updating the DAG.
821     // This sets the scheduled node's ReadyCycle to CurrCycle. When updateQueues
822     // runs, it can then use the accurate ReadyCycle time to determine whether
823     // newly released nodes can move to the readyQ.
824     SchedImpl->schedNode(SU, IsTopNode);
825 
826     updateQueues(SU, IsTopNode);
827   }
828   assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
829 
830   placeDebugValues();
831 
832   LLVM_DEBUG({
833     dbgs() << "*** Final schedule for "
834            << printMBBReference(*begin()->getParent()) << " ***\n";
835     dumpSchedule();
836     dbgs() << '\n';
837   });
838 }
839 
840 /// Apply each ScheduleDAGMutation step in order.
841 void ScheduleDAGMI::postprocessDAG() {
842   for (auto &m : Mutations)
843     m->apply(this);
844 }
845 
846 void ScheduleDAGMI::
847 findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
848                       SmallVectorImpl<SUnit*> &BotRoots) {
849   for (SUnit &SU : SUnits) {
850     assert(!SU.isBoundaryNode() && "Boundary node should not be in SUnits");
851 
852     // Order predecessors so DFSResult follows the critical path.
853     SU.biasCriticalPath();
854 
855     // A SUnit is ready to top schedule if it has no predecessors.
856     if (!SU.NumPredsLeft)
857       TopRoots.push_back(&SU);
858     // A SUnit is ready to bottom schedule if it has no successors.
859     if (!SU.NumSuccsLeft)
860       BotRoots.push_back(&SU);
861   }
862   ExitSU.biasCriticalPath();
863 }
864 
865 /// Identify DAG roots and set up scheduler queues.
866 void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
867                                ArrayRef<SUnit*> BotRoots) {
868   NextClusterSucc = nullptr;
869   NextClusterPred = nullptr;
870 
871   // Release all DAG roots for scheduling, not including EntrySU/ExitSU.
872   //
873   // Nodes with unreleased weak edges can still be roots.
874   // Release top roots in forward order.
875   for (SUnit *SU : TopRoots)
876     SchedImpl->releaseTopNode(SU);
877 
878   // Release bottom roots in reverse order so the higher priority nodes appear
879   // first. This is more natural and slightly more efficient.
880   for (SmallVectorImpl<SUnit*>::const_reverse_iterator
881          I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) {
882     SchedImpl->releaseBottomNode(*I);
883   }
884 
885   releaseSuccessors(&EntrySU);
886   releasePredecessors(&ExitSU);
887 
888   SchedImpl->registerRoots();
889 
890   // Advance past initial DebugValues.
891   CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
892   CurrentBottom = RegionEnd;
893 }
894 
895 /// Update scheduler queues after scheduling an instruction.
896 void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) {
897   // Release dependent instructions for scheduling.
898   if (IsTopNode)
899     releaseSuccessors(SU);
900   else
901     releasePredecessors(SU);
902 
903   SU->isScheduled = true;
904 }
905 
906 /// Reinsert any remaining debug_values, just like the PostRA scheduler.
907 void ScheduleDAGMI::placeDebugValues() {
908   // If first instruction was a DBG_VALUE then put it back.
909   if (FirstDbgValue) {
910     BB->splice(RegionBegin, BB, FirstDbgValue);
911     RegionBegin = FirstDbgValue;
912   }
913 
914   for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator
915          DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
916     std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI);
917     MachineInstr *DbgValue = P.first;
918     MachineBasicBlock::iterator OrigPrevMI = P.second;
919     if (&*RegionBegin == DbgValue)
920       ++RegionBegin;
921     BB->splice(std::next(OrigPrevMI), BB, DbgValue);
922     if (RegionEnd != BB->end() && OrigPrevMI == &*RegionEnd)
923       RegionEnd = DbgValue;
924   }
925 }
926 
927 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
928 LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const {
929   for (MachineInstr &MI : *this) {
930     if (SUnit *SU = getSUnit(&MI))
931       dumpNode(*SU);
932     else
933       dbgs() << "Missing SUnit\n";
934   }
935 }
936 #endif
937 
938 //===----------------------------------------------------------------------===//
939 // ScheduleDAGMILive - Base class for MachineInstr scheduling with LiveIntervals
940 // preservation.
941 //===----------------------------------------------------------------------===//
942 
943 ScheduleDAGMILive::~ScheduleDAGMILive() {
944   delete DFSResult;
945 }
946 
947 void ScheduleDAGMILive::collectVRegUses(SUnit &SU) {
948   const MachineInstr &MI = *SU.getInstr();
949   for (const MachineOperand &MO : MI.operands()) {
950     if (!MO.isReg())
951       continue;
952     if (!MO.readsReg())
953       continue;
954     if (TrackLaneMasks && !MO.isUse())
955       continue;
956 
957     Register Reg = MO.getReg();
958     if (!Register::isVirtualRegister(Reg))
959       continue;
960 
961     // Ignore re-defs.
962     if (TrackLaneMasks) {
963       bool FoundDef = false;
964       for (const MachineOperand &MO2 : MI.operands()) {
965         if (MO2.isReg() && MO2.isDef() && MO2.getReg() == Reg && !MO2.isDead()) {
966           FoundDef = true;
967           break;
968         }
969       }
970       if (FoundDef)
971         continue;
972     }
973 
974     // Record this local VReg use.
975     VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg);
976     for (; UI != VRegUses.end(); ++UI) {
977       if (UI->SU == &SU)
978         break;
979     }
980     if (UI == VRegUses.end())
981       VRegUses.insert(VReg2SUnit(Reg, LaneBitmask::getNone(), &SU));
982   }
983 }
984 
985 /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
986 /// crossing a scheduling boundary. [begin, end) includes all instructions in
987 /// the region, including the boundary itself and single-instruction regions
988 /// that don't get scheduled.
989 void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb,
990                                 MachineBasicBlock::iterator begin,
991                                 MachineBasicBlock::iterator end,
992                                 unsigned regioninstrs)
993 {
994   // ScheduleDAGMI initializes SchedImpl's per-region policy.
995   ScheduleDAGMI::enterRegion(bb, begin, end, regioninstrs);
996 
997   // For convenience remember the end of the liveness region.
998   LiveRegionEnd = (RegionEnd == bb->end()) ? RegionEnd : std::next(RegionEnd);
999 
1000   SUPressureDiffs.clear();
1001 
1002   ShouldTrackPressure = SchedImpl->shouldTrackPressure();
1003   ShouldTrackLaneMasks = SchedImpl->shouldTrackLaneMasks();
1004 
1005   assert((!ShouldTrackLaneMasks || ShouldTrackPressure) &&
1006          "ShouldTrackLaneMasks requires ShouldTrackPressure");
1007 }
1008 
1009 // Set up the register pressure trackers for the top scheduled and bottom
1010 // scheduled regions.
1011 void ScheduleDAGMILive::initRegPressure() {
1012   VRegUses.clear();
1013   VRegUses.setUniverse(MRI.getNumVirtRegs());
1014   for (SUnit &SU : SUnits)
1015     collectVRegUses(SU);
1016 
1017   TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin,
1018                     ShouldTrackLaneMasks, false);
1019   BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
1020                     ShouldTrackLaneMasks, false);
1021 
1022   // Close the RPTracker to finalize live ins.
1023   RPTracker.closeRegion();
1024 
1025   LLVM_DEBUG(RPTracker.dump());
1026 
1027   // Initialize the live ins and live outs.
1028   TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
1029   BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
1030 
1031   // Close one end of the tracker so we can call
1032   // getMaxUpward/DownwardPressureDelta before advancing across any
1033   // instructions. This converts currently live regs into live ins/outs.
1034   TopRPTracker.closeTop();
1035   BotRPTracker.closeBottom();
1036 
1037   BotRPTracker.initLiveThru(RPTracker);
1038   if (!BotRPTracker.getLiveThru().empty()) {
1039     TopRPTracker.initLiveThru(BotRPTracker.getLiveThru());
1040     LLVM_DEBUG(dbgs() << "Live Thru: ";
1041                dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
1042   }
1043 
1044   // For each live out vreg reduce the pressure change associated with other
1045   // uses of the same vreg below the live-out reaching def.
1046   updatePressureDiffs(RPTracker.getPressure().LiveOutRegs);
1047 
1048   // Account for liveness generated by the region boundary.
1049   if (LiveRegionEnd != RegionEnd) {
1050     SmallVector<RegisterMaskPair, 8> LiveUses;
1051     BotRPTracker.recede(&LiveUses);
1052     updatePressureDiffs(LiveUses);
1053   }
1054 
1055   LLVM_DEBUG(dbgs() << "Top Pressure:\n";
1056              dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI);
1057              dbgs() << "Bottom Pressure:\n";
1058              dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI););
1059 
1060   assert((BotRPTracker.getPos() == RegionEnd ||
1061           (RegionEnd->isDebugInstr() &&
1062            BotRPTracker.getPos() == priorNonDebug(RegionEnd, RegionBegin))) &&
1063          "Can't find the region bottom");
1064 
1065   // Cache the list of excess pressure sets in this region. This will also track
1066   // the max pressure in the scheduled code for these sets.
1067   RegionCriticalPSets.clear();
1068   const std::vector<unsigned> &RegionPressure =
1069     RPTracker.getPressure().MaxSetPressure;
1070   for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
1071     unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
1072     if (RegionPressure[i] > Limit) {
1073       LLVM_DEBUG(dbgs() << TRI->getRegPressureSetName(i) << " Limit " << Limit
1074                         << " Actual " << RegionPressure[i] << "\n");
1075       RegionCriticalPSets.push_back(PressureChange(i));
1076     }
1077   }
1078   LLVM_DEBUG(dbgs() << "Excess PSets: ";
1079              for (const PressureChange &RCPS
1080                   : RegionCriticalPSets) dbgs()
1081              << TRI->getRegPressureSetName(RCPS.getPSet()) << " ";
1082              dbgs() << "\n");
1083 }
1084 
1085 void ScheduleDAGMILive::
1086 updateScheduledPressure(const SUnit *SU,
1087                         const std::vector<unsigned> &NewMaxPressure) {
1088   const PressureDiff &PDiff = getPressureDiff(SU);
1089   unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size();
1090   for (const PressureChange &PC : PDiff) {
1091     if (!PC.isValid())
1092       break;
1093     unsigned ID = PC.getPSet();
1094     while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID)
1095       ++CritIdx;
1096     if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
1097       if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
1098           && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max())
1099         RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
1100     }
1101     unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
1102     if (NewMaxPressure[ID] >= Limit - 2) {
1103       LLVM_DEBUG(dbgs() << "  " << TRI->getRegPressureSetName(ID) << ": "
1104                         << NewMaxPressure[ID]
1105                         << ((NewMaxPressure[ID] > Limit) ? " > " : " <= ")
1106                         << Limit << "(+ " << BotRPTracker.getLiveThru()[ID]
1107                         << " livethru)\n");
1108     }
1109   }
1110 }
1111 
1112 /// Update the PressureDiff array for liveness after scheduling this
1113 /// instruction.
1114 void ScheduleDAGMILive::updatePressureDiffs(
1115     ArrayRef<RegisterMaskPair> LiveUses) {
1116   for (const RegisterMaskPair &P : LiveUses) {
1117     Register Reg = P.RegUnit;
1118     /// FIXME: Currently assuming single-use physregs.
1119     if (!Register::isVirtualRegister(Reg))
1120       continue;
1121 
1122     if (ShouldTrackLaneMasks) {
1123       // If the register has just become live then other uses won't change
1124       // this fact anymore => decrement pressure.
1125       // If the register has just become dead then other uses make it come
1126       // back to life => increment pressure.
1127       bool Decrement = P.LaneMask.any();
1128 
1129       for (const VReg2SUnit &V2SU
1130            : make_range(VRegUses.find(Reg), VRegUses.end())) {
1131         SUnit &SU = *V2SU.SU;
1132         if (SU.isScheduled || &SU == &ExitSU)
1133           continue;
1134 
1135         PressureDiff &PDiff = getPressureDiff(&SU);
1136         PDiff.addPressureChange(Reg, Decrement, &MRI);
1137         LLVM_DEBUG(dbgs() << "  UpdateRegP: SU(" << SU.NodeNum << ") "
1138                           << printReg(Reg, TRI) << ':'
1139                           << PrintLaneMask(P.LaneMask) << ' ' << *SU.getInstr();
1140                    dbgs() << "              to "; PDiff.dump(*TRI););
1141       }
1142     } else {
1143       assert(P.LaneMask.any());
1144       LLVM_DEBUG(dbgs() << "  LiveReg: " << printVRegOrUnit(Reg, TRI) << "\n");
1145       // This may be called before CurrentBottom has been initialized. However,
1146       // BotRPTracker must have a valid position. We want the value live into the
1147       // instruction or live out of the block, so ask for the previous
1148       // instruction's live-out.
1149       const LiveInterval &LI = LIS->getInterval(Reg);
1150       VNInfo *VNI;
1151       MachineBasicBlock::const_iterator I =
1152         nextIfDebug(BotRPTracker.getPos(), BB->end());
1153       if (I == BB->end())
1154         VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
1155       else {
1156         LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*I));
1157         VNI = LRQ.valueIn();
1158       }
1159       // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
1160       assert(VNI && "No live value at use.");
1161       for (const VReg2SUnit &V2SU
1162            : make_range(VRegUses.find(Reg), VRegUses.end())) {
1163         SUnit *SU = V2SU.SU;
1164         // If this use comes before the reaching def, it cannot be a last use,
1165         // so decrease its pressure change.
1166         if (!SU->isScheduled && SU != &ExitSU) {
1167           LiveQueryResult LRQ =
1168               LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
1169           if (LRQ.valueIn() == VNI) {
1170             PressureDiff &PDiff = getPressureDiff(SU);
1171             PDiff.addPressureChange(Reg, true, &MRI);
1172             LLVM_DEBUG(dbgs() << "  UpdateRegP: SU(" << SU->NodeNum << ") "
1173                               << *SU->getInstr();
1174                        dbgs() << "              to "; PDiff.dump(*TRI););
1175           }
1176         }
1177       }
1178     }
1179   }
1180 }
1181 
1182 void ScheduleDAGMILive::dump() const {
1183 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1184   if (EntrySU.getInstr() != nullptr)
1185     dumpNodeAll(EntrySU);
1186   for (const SUnit &SU : SUnits) {
1187     dumpNodeAll(SU);
1188     if (ShouldTrackPressure) {
1189       dbgs() << "  Pressure Diff      : ";
1190       getPressureDiff(&SU).dump(*TRI);
1191     }
1192     dbgs() << "  Single Issue       : ";
1193     if (SchedModel.mustBeginGroup(SU.getInstr()) &&
1194         SchedModel.mustEndGroup(SU.getInstr()))
1195       dbgs() << "true;";
1196     else
1197       dbgs() << "false;";
1198     dbgs() << '\n';
1199   }
1200   if (ExitSU.getInstr() != nullptr)
1201     dumpNodeAll(ExitSU);
1202 #endif
1203 }
1204 
1205 /// schedule - Called back from MachineScheduler::runOnMachineFunction
1206 /// after setting up the current scheduling region. [RegionBegin, RegionEnd)
1207 /// only includes instructions that have DAG nodes, not scheduling boundaries.
1208 ///
1209 /// This is a skeletal driver, with all the functionality pushed into helpers,
1210 /// so that it can be easily extended by experimental schedulers. Generally,
1211 /// implementing MachineSchedStrategy should be sufficient to implement a new
1212 /// scheduling algorithm. However, if a scheduler further subclasses
1213 /// ScheduleDAGMILive then it will want to override this virtual method in order
1214 /// to update any specialized state.
1215 void ScheduleDAGMILive::schedule() {
1216   LLVM_DEBUG(dbgs() << "ScheduleDAGMILive::schedule starting\n");
1217   LLVM_DEBUG(SchedImpl->dumpPolicy());
1218   buildDAGWithRegPressure();
1219 
1220   postprocessDAG();
1221 
1222   SmallVector<SUnit*, 8> TopRoots, BotRoots;
1223   findRootsAndBiasEdges(TopRoots, BotRoots);
1224 
1225   // Initialize the strategy before modifying the DAG.
1226   // This may initialize a DFSResult to be used for queue priority.
1227   SchedImpl->initialize(this);
1228 
1229   LLVM_DEBUG(dump());
1230   if (PrintDAGs) dump();
1231   if (ViewMISchedDAGs) viewGraph();
1232 
1233   // Initialize ready queues now that the DAG and priority data are finalized.
1234   initQueues(TopRoots, BotRoots);
1235 
1236   bool IsTopNode = false;
1237   while (true) {
1238     LLVM_DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n");
1239     SUnit *SU = SchedImpl->pickNode(IsTopNode);
1240     if (!SU) break;
1241 
1242     assert(!SU->isScheduled && "Node already scheduled");
1243     if (!checkSchedLimit())
1244       break;
1245 
1246     scheduleMI(SU, IsTopNode);
1247 
1248     if (DFSResult) {
1249       unsigned SubtreeID = DFSResult->getSubtreeID(SU);
1250       if (!ScheduledTrees.test(SubtreeID)) {
1251         ScheduledTrees.set(SubtreeID);
1252         DFSResult->scheduleTree(SubtreeID);
1253         SchedImpl->scheduleTree(SubtreeID);
1254       }
1255     }
1256 
1257     // Notify the scheduling strategy after updating the DAG.
1258     SchedImpl->schedNode(SU, IsTopNode);
1259 
1260     updateQueues(SU, IsTopNode);
1261   }
1262   assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
1263 
1264   placeDebugValues();
1265 
1266   LLVM_DEBUG({
1267     dbgs() << "*** Final schedule for "
1268            << printMBBReference(*begin()->getParent()) << " ***\n";
1269     dumpSchedule();
1270     dbgs() << '\n';
1271   });
1272 }
1273 
1274 /// Build the DAG and set up three register pressure trackers.
1275 void ScheduleDAGMILive::buildDAGWithRegPressure() {
1276   if (!ShouldTrackPressure) {
1277     RPTracker.reset();
1278     RegionCriticalPSets.clear();
1279     buildSchedGraph(AA);
1280     return;
1281   }
1282 
1283   // Initialize the register pressure tracker used by buildSchedGraph.
1284   RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
1285                  ShouldTrackLaneMasks, /*TrackUntiedDefs=*/true);
1286 
1287   // Account for liveness generated by the region boundary.
1288   if (LiveRegionEnd != RegionEnd)
1289     RPTracker.recede();
1290 
1291   // Build the DAG, and compute current register pressure.
1292   buildSchedGraph(AA, &RPTracker, &SUPressureDiffs, LIS, ShouldTrackLaneMasks);
1293 
1294   // Initialize top/bottom trackers after computing region pressure.
1295   initRegPressure();
1296 }
1297 
1298 void ScheduleDAGMILive::computeDFSResult() {
1299   if (!DFSResult)
1300     DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
1301   DFSResult->clear();
1302   ScheduledTrees.clear();
1303   DFSResult->resize(SUnits.size());
1304   DFSResult->compute(SUnits);
1305   ScheduledTrees.resize(DFSResult->getNumSubtrees());
1306 }
1307 
1308 /// Compute the max cyclic critical path through the DAG. The scheduling DAG
1309 /// only provides the critical path for single block loops. To handle loops that
1310 /// span blocks, we could use the vreg path latencies provided by
1311 /// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
1312 /// available for use in the scheduler.
1313 ///
1314 /// The cyclic path estimation identifies a def-use pair that crosses the back
1315 /// edge and considers the depth and height of the nodes. For example, consider
1316 /// the following instruction sequence where each instruction has unit latency
1317 /// and defines an eponymous virtual register:
1318 ///
1319 /// a->b(a,c)->c(b)->d(c)->exit
1320 ///
1321 /// The cyclic critical path is two cycles: b->c->b
1322 /// The acyclic critical path is four cycles: a->b->c->d->exit
1323 /// LiveOutHeight = height(c) = len(c->d->exit) = 2
1324 /// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
1325 /// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
1326 /// LiveInDepth = depth(b) = len(a->b) = 1
1327 ///
1328 /// LiveOutDepth - LiveInDepth = 3 - 1 = 2
1329 /// LiveInHeight - LiveOutHeight = 4 - 2 = 2
1330 /// CyclicCriticalPath = min(2, 2) = 2
1331 ///
1332 /// This could be relevant to PostRA scheduling, but is currently implemented
1333 /// assuming LiveIntervals.
1334 unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
1335   // This only applies to single-block loops.
1336   if (!BB->isSuccessor(BB))
1337     return 0;
1338 
1339   unsigned MaxCyclicLatency = 0;
1340   // Visit each live out vreg def to find def/use pairs that cross iterations.
1341   for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) {
1342     Register Reg = P.RegUnit;
1343     if (!Register::isVirtualRegister(Reg))
1344       continue;
1345     const LiveInterval &LI = LIS->getInterval(Reg);
1346     const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
1347     if (!DefVNI)
1348       continue;
1349 
1350     MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
1351     const SUnit *DefSU = getSUnit(DefMI);
1352     if (!DefSU)
1353       continue;
1354 
1355     unsigned LiveOutHeight = DefSU->getHeight();
1356     unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
1357     // Visit all local users of the vreg def.
1358     for (const VReg2SUnit &V2SU
1359          : make_range(VRegUses.find(Reg), VRegUses.end())) {
1360       SUnit *SU = V2SU.SU;
1361       if (SU == &ExitSU)
1362         continue;
1363 
1364       // Only consider uses of the phi.
1365       LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
1366       if (!LRQ.valueIn()->isPHIDef())
1367         continue;
1368 
1369       // Assume that a path spanning two iterations is a cycle, which could
1370       // overestimate in strange cases. This allows cyclic latency to be
1371       // estimated as the minimum slack of the vreg's depth or height.
1372       unsigned CyclicLatency = 0;
1373       if (LiveOutDepth > SU->getDepth())
1374         CyclicLatency = LiveOutDepth - SU->getDepth();
1375 
1376       unsigned LiveInHeight = SU->getHeight() + DefSU->Latency;
1377       if (LiveInHeight > LiveOutHeight) {
1378         if (LiveInHeight - LiveOutHeight < CyclicLatency)
1379           CyclicLatency = LiveInHeight - LiveOutHeight;
1380       } else
1381         CyclicLatency = 0;
1382 
1383       LLVM_DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
1384                         << SU->NodeNum << ") = " << CyclicLatency << "c\n");
1385       if (CyclicLatency > MaxCyclicLatency)
1386         MaxCyclicLatency = CyclicLatency;
1387     }
1388   }
1389   LLVM_DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
1390   return MaxCyclicLatency;
1391 }
1392 
1393 /// Release ExitSU predecessors and set up scheduler queues. Re-position
1394 /// the Top RP tracker in case the region beginning has changed.
1395 void ScheduleDAGMILive::initQueues(ArrayRef<SUnit*> TopRoots,
1396                                    ArrayRef<SUnit*> BotRoots) {
1397   ScheduleDAGMI::initQueues(TopRoots, BotRoots);
1398   if (ShouldTrackPressure) {
1399     assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
1400     TopRPTracker.setPos(CurrentTop);
1401   }
1402 }
1403 
1404 /// Move an instruction and update register pressure.
1405 void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
1406   // Move the instruction to its new location in the instruction stream.
1407   MachineInstr *MI = SU->getInstr();
1408 
1409   if (IsTopNode) {
1410     assert(SU->isTopReady() && "node still has unscheduled dependencies");
1411     if (&*CurrentTop == MI)
1412       CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
1413     else {
1414       moveInstruction(MI, CurrentTop);
1415       TopRPTracker.setPos(MI);
1416     }
1417 
1418     if (ShouldTrackPressure) {
1419       // Update top scheduled pressure.
1420       RegisterOperands RegOpers;
1421       RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
1422       if (ShouldTrackLaneMasks) {
1423         // Adjust liveness and add missing dead+read-undef flags.
1424         SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
1425         RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
1426       } else {
1427         // Adjust for missing dead-def flags.
1428         RegOpers.detectDeadDefs(*MI, *LIS);
1429       }
1430 
1431       TopRPTracker.advance(RegOpers);
1432       assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
1433       LLVM_DEBUG(dbgs() << "Top Pressure:\n"; dumpRegSetPressure(
1434                      TopRPTracker.getRegSetPressureAtPos(), TRI););
1435 
1436       updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
1437     }
1438   } else {
1439     assert(SU->isBottomReady() && "node still has unscheduled dependencies");
1440     MachineBasicBlock::iterator priorII =
1441       priorNonDebug(CurrentBottom, CurrentTop);
1442     if (&*priorII == MI)
1443       CurrentBottom = priorII;
1444     else {
1445       if (&*CurrentTop == MI) {
1446         CurrentTop = nextIfDebug(++CurrentTop, priorII);
1447         TopRPTracker.setPos(CurrentTop);
1448       }
1449       moveInstruction(MI, CurrentBottom);
1450       CurrentBottom = MI;
1451       BotRPTracker.setPos(CurrentBottom);
1452     }
1453     if (ShouldTrackPressure) {
1454       RegisterOperands RegOpers;
1455       RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
1456       if (ShouldTrackLaneMasks) {
1457         // Adjust liveness and add missing dead+read-undef flags.
1458         SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
1459         RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
1460       } else {
1461         // Adjust for missing dead-def flags.
1462         RegOpers.detectDeadDefs(*MI, *LIS);
1463       }
1464 
1465       if (BotRPTracker.getPos() != CurrentBottom)
1466         BotRPTracker.recedeSkipDebugValues();
1467       SmallVector<RegisterMaskPair, 8> LiveUses;
1468       BotRPTracker.recede(RegOpers, &LiveUses);
1469       assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
1470       LLVM_DEBUG(dbgs() << "Bottom Pressure:\n"; dumpRegSetPressure(
1471                      BotRPTracker.getRegSetPressureAtPos(), TRI););
1472 
1473       updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
1474       updatePressureDiffs(LiveUses);
1475     }
1476   }
1477 }
1478 
1479 //===----------------------------------------------------------------------===//
1480 // BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores.
1481 //===----------------------------------------------------------------------===//
1482 
1483 namespace {
1484 
1485 /// Post-process the DAG to create cluster edges between neighboring
1486 /// loads or between neighboring stores.
1487 class BaseMemOpClusterMutation : public ScheduleDAGMutation {
1488   struct MemOpInfo {
1489     SUnit *SU;
1490     SmallVector<const MachineOperand *, 4> BaseOps;
1491     int64_t Offset;
1492     unsigned Width;
1493 
1494     MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
1495               int64_t Offset, unsigned Width)
1496         : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
1497           Width(Width) {}
1498 
1499     static bool Compare(const MachineOperand *const &A,
1500                         const MachineOperand *const &B) {
1501       if (A->getType() != B->getType())
1502         return A->getType() < B->getType();
1503       if (A->isReg())
1504         return A->getReg() < B->getReg();
1505       if (A->isFI()) {
1506         const MachineFunction &MF = *A->getParent()->getParent()->getParent();
1507         const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
1508         bool StackGrowsDown = TFI.getStackGrowthDirection() ==
1509                               TargetFrameLowering::StackGrowsDown;
1510         return StackGrowsDown ? A->getIndex() > B->getIndex()
1511                               : A->getIndex() < B->getIndex();
1512       }
1513 
1514       llvm_unreachable("MemOpClusterMutation only supports register or frame "
1515                        "index bases.");
1516     }
1517 
1518     bool operator<(const MemOpInfo &RHS) const {
1519       // FIXME: Don't compare everything twice. Maybe use C++20 three way
1520       // comparison instead when it's available.
1521       if (std::lexicographical_compare(BaseOps.begin(), BaseOps.end(),
1522                                        RHS.BaseOps.begin(), RHS.BaseOps.end(),
1523                                        Compare))
1524         return true;
1525       if (std::lexicographical_compare(RHS.BaseOps.begin(), RHS.BaseOps.end(),
1526                                        BaseOps.begin(), BaseOps.end(), Compare))
1527         return false;
1528       if (Offset != RHS.Offset)
1529         return Offset < RHS.Offset;
1530       return SU->NodeNum < RHS.SU->NodeNum;
1531     }
1532   };
1533 
1534   const TargetInstrInfo *TII;
1535   const TargetRegisterInfo *TRI;
1536   bool IsLoad;
1537 
1538 public:
1539   BaseMemOpClusterMutation(const TargetInstrInfo *tii,
1540                            const TargetRegisterInfo *tri, bool IsLoad)
1541       : TII(tii), TRI(tri), IsLoad(IsLoad) {}
1542 
1543   void apply(ScheduleDAGInstrs *DAGInstrs) override;
1544 
1545 protected:
1546   void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps, bool FastCluster,
1547                                 ScheduleDAGInstrs *DAG);
1548   void collectMemOpRecords(std::vector<SUnit> &SUnits,
1549                            SmallVectorImpl<MemOpInfo> &MemOpRecords);
1550   bool groupMemOps(ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
1551                    DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups);
1552 };
1553 
1554 class StoreClusterMutation : public BaseMemOpClusterMutation {
1555 public:
1556   StoreClusterMutation(const TargetInstrInfo *tii,
1557                        const TargetRegisterInfo *tri)
1558       : BaseMemOpClusterMutation(tii, tri, false) {}
1559 };
1560 
1561 class LoadClusterMutation : public BaseMemOpClusterMutation {
1562 public:
1563   LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
1564       : BaseMemOpClusterMutation(tii, tri, true) {}
1565 };
1566 
1567 } // end anonymous namespace
1568 
1569 namespace llvm {
1570 
1571 std::unique_ptr<ScheduleDAGMutation>
1572 createLoadClusterDAGMutation(const TargetInstrInfo *TII,
1573                              const TargetRegisterInfo *TRI) {
1574   return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(TII, TRI)
1575                             : nullptr;
1576 }
1577 
1578 std::unique_ptr<ScheduleDAGMutation>
1579 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
1580                               const TargetRegisterInfo *TRI) {
1581   return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(TII, TRI)
1582                             : nullptr;
1583 }
1584 
1585 } // end namespace llvm
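
// For illustration only: a target that wants this clustering typically
// attaches these mutations from its createMachineScheduler hook, roughly as in
// the sketch below (details vary per target):
//
//   ScheduleDAGMILive *DAG = createGenericSchedLive(C);
//   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
//   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
//   return DAG;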
1586 
1587 // Sort all the loads/stores first. Then, for each load/store, check the
1588 // following loads/stores one by one until we reach the first non-dependent one
1589 // and call the target hook to see if they can be clustered.
1590 // If FastCluster is enabled, we assume that all the loads/stores have been
1591 // preprocessed and no longer have dependencies on each other.
1592 void BaseMemOpClusterMutation::clusterNeighboringMemOps(
1593     ArrayRef<MemOpInfo> MemOpRecords, bool FastCluster,
1594     ScheduleDAGInstrs *DAG) {
1595   // Keep track of the current cluster length and bytes for each SUnit.
1596   DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
1597 
1598   // At this point, the `MemOpRecords` array must hold at least two mem ops.
1599   // Try to cluster the mem ops collected within the `MemOpRecords` array.
1600   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
1601     // Whether to cluster mem ops is decided by target-dependent logic.
1602     auto MemOpa = MemOpRecords[Idx];
1603 
1604     // Seek the next load/store with which to form a cluster.
1605     unsigned NextIdx = Idx + 1;
1606     for (; NextIdx < End; ++NextIdx)
1607       // Skip if MemOpb has already been clustered or has a dependency on
1608       // MemOpa.
1609       if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
1610           (FastCluster ||
1611            (!DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
1612             !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))))
1613         break;
1614     if (NextIdx == End)
1615       continue;
1616 
1617     auto MemOpb = MemOpRecords[NextIdx];
1618     unsigned ClusterLength = 2;
1619     unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
1620     if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
1621       ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
1622       CurrentClusterBytes =
1623           SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
1624     }
1625 
1626     if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
1627                                   CurrentClusterBytes))
1628       continue;
1629 
1630     SUnit *SUa = MemOpa.SU;
1631     SUnit *SUb = MemOpb.SU;
1632     if (SUa->NodeNum > SUb->NodeNum)
1633       std::swap(SUa, SUb);
1634 
1635     // FIXME: Is this check really required?
1636     if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
1637       continue;
1638 
1639     LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
1640                       << SUb->NodeNum << ")\n");
1641     ++NumClustered;
1642 
1643     if (IsLoad) {
1644       // Copy successor edges from SUa to SUb. Interleaving computation
1645       // dependent on SUa can prevent load combining due to register reuse.
1646       // Predecessor edges do not need to be copied from SUb to SUa since
1647       // nearby loads should have effectively the same inputs.
1648       for (const SDep &Succ : SUa->Succs) {
1649         if (Succ.getSUnit() == SUb)
1650           continue;
1651         LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
1652                           << ")\n");
1653         DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
1654       }
1655     } else {
1656       // Copy predecessor edges from SUb to SUa to avoid the SUnits that SUb
1657       // depends on being scheduled in between SUb and SUa. Successor edges do
1658       // not need to be copied from SUa to SUb since no one will depend on
1659       // stores.
1660       // Note that we don't need to care about memory dependencies, as we
1661       // won't try to cluster the mem ops if they have any memory dependency.
1662       for (const SDep &Pred : SUb->Preds) {
1663         if (Pred.getSUnit() == SUa)
1664           continue;
1665         LLVM_DEBUG(dbgs() << "  Copy Pred SU(" << Pred.getSUnit()->NodeNum
1666                           << ")\n");
1667         DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
1668       }
1669     }
1670 
1671     SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
1672                                              CurrentClusterBytes};
1673 
1674     LLVM_DEBUG(dbgs() << "  Curr cluster length: " << ClusterLength
1675                       << ", Curr cluster bytes: " << CurrentClusterBytes
1676                       << "\n");
1677   }
1678 }
1679 
1680 void BaseMemOpClusterMutation::collectMemOpRecords(
1681     std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
1682   for (auto &SU : SUnits) {
1683     if ((IsLoad && !SU.getInstr()->mayLoad()) ||
1684         (!IsLoad && !SU.getInstr()->mayStore()))
1685       continue;
1686 
1687     const MachineInstr &MI = *SU.getInstr();
1688     SmallVector<const MachineOperand *, 4> BaseOps;
1689     int64_t Offset;
1690     bool OffsetIsScalable;
1691     unsigned Width;
1692     if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
1693                                            OffsetIsScalable, Width, TRI)) {
1694       MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
1695 
1696       LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
1697                         << Offset << ", OffsetIsScalable: " << OffsetIsScalable
1698                         << ", Width: " << Width << "\n");
1699     }
1700 #ifndef NDEBUG
1701     for (const auto *Op : BaseOps)
1702       assert(Op);
1703 #endif
1704   }
1705 }
1706 
1707 bool BaseMemOpClusterMutation::groupMemOps(
1708     ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
1709     DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups) {
1710   bool FastCluster =
1711       ForceFastCluster ||
1712       MemOps.size() * DAG->SUnits.size() / 1000 > FastClusterThreshold;
1713 
1714   for (const auto &MemOp : MemOps) {
1715     unsigned ChainPredID = DAG->SUnits.size();
1716     if (FastCluster) {
1717       for (const SDep &Pred : MemOp.SU->Preds) {
1718         // We only want to cluster mem ops that have the same ctrl (non-data) pred
1719         // so that they don't have ctrl dependencies on each other. But for store
1720         // instrs, we can still cluster them if the pred is a load instr.
1721         if ((Pred.isCtrl() &&
1722              (IsLoad ||
1723               (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
1724             !Pred.isArtificial()) {
1725           ChainPredID = Pred.getSUnit()->NodeNum;
1726           break;
1727         }
1728       }
1729     } else
1730       ChainPredID = 0;
1731 
1732     Groups[ChainPredID].push_back(MemOp);
1733   }
1734   return FastCluster;
1735 }
1736 
1737 /// Callback from DAG postProcessing to create cluster edges for loads/stores.
1738 void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
1739   // Collect all the clusterable loads/stores
1740   SmallVector<MemOpInfo, 32> MemOpRecords;
1741   collectMemOpRecords(DAG->SUnits, MemOpRecords);
1742 
1743   if (MemOpRecords.size() < 2)
1744     return;
1745 
1746   // If the DAG is too complex, use a heuristic to put loads/stores without
1747   // dependencies into the same group, to avoid a compile-time blow up.
1748   // Note that some fusion pairs could be lost with this.
1749   DenseMap<unsigned, SmallVector<MemOpInfo, 32>> Groups;
1750   bool FastCluster = groupMemOps(MemOpRecords, DAG, Groups);
1751 
1752   for (auto &Group : Groups) {
1753     // Sort the loads/stores so that we can stop clustering as early as
1754     // possible.
1755     llvm::sort(Group.second);
1756 
1757     // Try to cluster all the neighboring loads/stores.
1758     clusterNeighboringMemOps(Group.second, FastCluster, DAG);
1759   }
1760 }
1761 
1762 //===----------------------------------------------------------------------===//
1763 // CopyConstrain - DAG post-processing to encourage copy elimination.
1764 //===----------------------------------------------------------------------===//
1765 
1766 namespace {
1767 
1768 /// Post-process the DAG to create weak edges from all uses of a copy to
1769 /// the one use that defines the copy's source vreg, most likely an induction
1770 /// variable increment.
1771 class CopyConstrain : public ScheduleDAGMutation {
1772   // Transient state.
1773   SlotIndex RegionBeginIdx;
1774 
1775   // RegionEndIdx is the slot index of the last non-debug instruction in the
1776   // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
1777   SlotIndex RegionEndIdx;
1778 
1779 public:
1780   CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
1781 
1782   void apply(ScheduleDAGInstrs *DAGInstrs) override;
1783 
1784 protected:
1785   void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG);
1786 };
1787 
1788 } // end anonymous namespace
1789 
1790 namespace llvm {
1791 
1792 std::unique_ptr<ScheduleDAGMutation>
1793 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
1794                                const TargetRegisterInfo *TRI) {
1795   return std::make_unique<CopyConstrain>(TII, TRI);
1796 }
1797 
1798 } // end namespace llvm
1799 
1800 /// constrainLocalCopy handles two possibilities:
1801 /// 1) Local src:
1802 /// I0:     = dst
1803 /// I1: src = ...
1804 /// I2:     = dst
1805 /// I3: dst = src (copy)
1806 /// (create pred->succ edges I0->I1, I2->I1)
1807 ///
1808 /// 2) Local copy:
1809 /// I0: dst = src (copy)
1810 /// I1:     = dst
1811 /// I2: src = ...
1812 /// I3:     = dst
1813 /// (create pred->succ edges I1->I2, I3->I2)
1814 ///
1815 /// Although the MachineScheduler is currently constrained to single blocks,
1816 /// this algorithm should handle extended blocks. An EBB is a set of
1817 /// contiguously numbered blocks such that the previous block in the EBB is
1818 /// always the single predecessor.
1819 void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
1820   LiveIntervals *LIS = DAG->getLIS();
1821   MachineInstr *Copy = CopySU->getInstr();
1822 
1823   // Check for pure vreg copies.
1824   const MachineOperand &SrcOp = Copy->getOperand(1);
1825   Register SrcReg = SrcOp.getReg();
1826   if (!Register::isVirtualRegister(SrcReg) || !SrcOp.readsReg())
1827     return;
1828 
1829   const MachineOperand &DstOp = Copy->getOperand(0);
1830   Register DstReg = DstOp.getReg();
1831   if (!Register::isVirtualRegister(DstReg) || DstOp.isDead())
1832     return;
1833 
1834   // Check if either the dest or source is local. If it's live across a back
1835   // edge, it's not local. Note that if both vregs are live across the back
1836   // edge, we cannot successfully constrain the copy without cyclic scheduling.
1837   // If both the copy's source and dest are local live intervals, then we
1838   // should treat the dest as the global for the purpose of adding
1839   // constraints. This adds edges from source's other uses to the copy.
1840   unsigned LocalReg = SrcReg;
1841   unsigned GlobalReg = DstReg;
1842   LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
1843   if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
1844     LocalReg = DstReg;
1845     GlobalReg = SrcReg;
1846     LocalLI = &LIS->getInterval(LocalReg);
1847     if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
1848       return;
1849   }
1850   LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);
1851 
1852   // Find the global segment after the start of the local LI.
1853   LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
1854   // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
1855   // local live range. We could create edges from other global uses to the local
1856   // start, but the coalescer should have already eliminated these cases, so
1857   // don't bother dealing with it.
1858   if (GlobalSegment == GlobalLI->end())
1859     return;
1860 
1861   // If GlobalSegment is killed at the LocalLI->start, the call to find()
1862   // returned the next global segment. But if GlobalSegment overlaps with
1863   // LocalLI->start, then advance to the next segment. If a hole in GlobalLI
1864   // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
1865   if (GlobalSegment->contains(LocalLI->beginIndex()))
1866     ++GlobalSegment;
1867 
1868   if (GlobalSegment == GlobalLI->end())
1869     return;
1870 
1871   // Check if GlobalLI contains a hole in the vicinity of LocalLI.
1872   if (GlobalSegment != GlobalLI->begin()) {
1873     // Two address defs have no hole.
1874     if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->end,
1875                                GlobalSegment->start)) {
1876       return;
1877     }
1878     // If the prior global segment may be defined by the same two-address
1879     // instruction that also defines LocalLI, then can't make a hole here.
1880     if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->start,
1881                                LocalLI->beginIndex())) {
1882       return;
1883     }
1884     // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
1885     // it would be a disconnected component in the live range.
1886     assert(std::prev(GlobalSegment)->start < LocalLI->beginIndex() &&
1887            "Disconnected LRG within the scheduling region.");
1888   }
1889   MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start);
1890   if (!GlobalDef)
1891     return;
1892 
1893   SUnit *GlobalSU = DAG->getSUnit(GlobalDef);
1894   if (!GlobalSU)
1895     return;
1896 
1897   // GlobalDef is the bottom of the GlobalLI hole. Open the hole by
1898   // constraining the uses of the last local def to precede GlobalDef.
1899   SmallVector<SUnit*,8> LocalUses;
1900   const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
1901   MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
1902   SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
1903   for (const SDep &Succ : LastLocalSU->Succs) {
1904     if (Succ.getKind() != SDep::Data || Succ.getReg() != LocalReg)
1905       continue;
1906     if (Succ.getSUnit() == GlobalSU)
1907       continue;
1908     if (!DAG->canAddEdge(GlobalSU, Succ.getSUnit()))
1909       return;
1910     LocalUses.push_back(Succ.getSUnit());
1911   }
1912   // Open the top of the GlobalLI hole by constraining any earlier global uses
1913   // to precede the start of LocalLI.
1914   SmallVector<SUnit*,8> GlobalUses;
1915   MachineInstr *FirstLocalDef =
1916     LIS->getInstructionFromIndex(LocalLI->beginIndex());
1917   SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
1918   for (const SDep &Pred : GlobalSU->Preds) {
1919     if (Pred.getKind() != SDep::Anti || Pred.getReg() != GlobalReg)
1920       continue;
1921     if (Pred.getSUnit() == FirstLocalSU)
1922       continue;
1923     if (!DAG->canAddEdge(FirstLocalSU, Pred.getSUnit()))
1924       return;
1925     GlobalUses.push_back(Pred.getSUnit());
1926   }
1927   LLVM_DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
1928   // Add the weak edges.
1929   for (SUnit *LU : LocalUses) {
1930     LLVM_DEBUG(dbgs() << "  Local use SU(" << LU->NodeNum << ") -> SU("
1931                       << GlobalSU->NodeNum << ")\n");
1932     DAG->addEdge(GlobalSU, SDep(LU, SDep::Weak));
1933   }
1934   for (SUnit *GU : GlobalUses) {
1935     LLVM_DEBUG(dbgs() << "  Global use SU(" << GU->NodeNum << ") -> SU("
1936                       << FirstLocalSU->NodeNum << ")\n");
1937     DAG->addEdge(FirstLocalSU, SDep(GU, SDep::Weak));
1938   }
1939 }
1940 
1941 /// Callback from DAG postProcessing to create weak edges to encourage
1942 /// copy elimination.
1943 void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) {
1944   ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
1945   assert(DAG->hasVRegLiveness() && "Expect VRegs with LiveIntervals");
1946 
1947   MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end());
1948   if (FirstPos == DAG->end())
1949     return;
1950   RegionBeginIdx = DAG->getLIS()->getInstructionIndex(*FirstPos);
1951   RegionEndIdx = DAG->getLIS()->getInstructionIndex(
1952       *priorNonDebug(DAG->end(), DAG->begin()));
1953 
1954   for (SUnit &SU : DAG->SUnits) {
1955     if (!SU.getInstr()->isCopy())
1956       continue;
1957 
1958     constrainLocalCopy(&SU, static_cast<ScheduleDAGMILive*>(DAG));
1959   }
1960 }
1961 
1962 //===----------------------------------------------------------------------===//
1963 // MachineSchedStrategy helpers used by GenericScheduler, GenericPostScheduler
1964 // and possibly other custom schedulers.
1965 //===----------------------------------------------------------------------===//
1966 
1967 static const unsigned InvalidCycle = ~0U;
1968 
1969 SchedBoundary::~SchedBoundary() { delete HazardRec; }
1970 
1971 /// Given a Count of resource usage and a Latency value, return true if a
1972 /// SchedBoundary becomes resource limited.
1973 /// If we are checking after scheduling a node, we should return true when
1974 /// we just reach the resource limit.
1975 static bool checkResourceLimit(unsigned LFactor, unsigned Count,
1976                                unsigned Latency, bool AfterSchedNode) {
1977   int ResCntFactor = (int)(Count - (Latency * LFactor));
1978   if (AfterSchedNode)
1979     return ResCntFactor >= (int)LFactor;
1980   else
1981     return ResCntFactor > (int)LFactor;
1982 }
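// For example (illustrative numbers only): with LFactor == 2, Count == 10 and
// Latency == 4, ResCntFactor == 10 - 4 * 2 == 2. Checking after scheduling a
// node reports resource limited (2 >= 2), while checking beforehand does not
// (2 > 2 is false).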
1983 
1984 void SchedBoundary::reset() {
1985   // A new HazardRec is created for each DAG and owned by SchedBoundary.
1986   // Destroying and reconstructing it is very expensive though. So keep
1987   // invalid, placeholder HazardRecs.
1988   if (HazardRec && HazardRec->isEnabled()) {
1989     delete HazardRec;
1990     HazardRec = nullptr;
1991   }
1992   Available.clear();
1993   Pending.clear();
1994   CheckPending = false;
1995   CurrCycle = 0;
1996   CurrMOps = 0;
1997   MinReadyCycle = std::numeric_limits<unsigned>::max();
1998   ExpectedLatency = 0;
1999   DependentLatency = 0;
2000   RetiredMOps = 0;
2001   MaxExecutedResCount = 0;
2002   ZoneCritResIdx = 0;
2003   IsResourceLimited = false;
2004   ReservedCycles.clear();
2005   ReservedCyclesIndex.clear();
2006   ResourceGroupSubUnitMasks.clear();
2007 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
2008   // Track the maximum number of stall cycles that could arise either from the
2009   // latency of a DAG edge or the number of cycles that a processor resource is
2010   // reserved (SchedBoundary::ReservedCycles).
2011   MaxObservedStall = 0;
2012 #endif
2013   // Reserve a zero-count for invalid CritResIdx.
2014   ExecutedResCounts.resize(1);
2015   assert(!ExecutedResCounts[0] && "nonzero count for bad resource");
2016 }
2017 
2018 void SchedRemainder::
2019 init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
2020   reset();
2021   if (!SchedModel->hasInstrSchedModel())
2022     return;
2023   RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
2024   for (SUnit &SU : DAG->SUnits) {
2025     const MCSchedClassDesc *SC = DAG->getSchedClass(&SU);
2026     RemIssueCount += SchedModel->getNumMicroOps(SU.getInstr(), SC)
2027       * SchedModel->getMicroOpFactor();
2028     for (TargetSchedModel::ProcResIter
2029            PI = SchedModel->getWriteProcResBegin(SC),
2030            PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2031       unsigned PIdx = PI->ProcResourceIdx;
2032       unsigned Factor = SchedModel->getResourceFactor(PIdx);
2033       RemainingCounts[PIdx] += (Factor * PI->Cycles);
2034     }
2035   }
2036 }
2037 
2038 void SchedBoundary::
2039 init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
2040   reset();
2041   DAG = dag;
2042   SchedModel = smodel;
2043   Rem = rem;
2044   if (SchedModel->hasInstrSchedModel()) {
2045     unsigned ResourceCount = SchedModel->getNumProcResourceKinds();
2046     ReservedCyclesIndex.resize(ResourceCount);
2047     ExecutedResCounts.resize(ResourceCount);
2048     ResourceGroupSubUnitMasks.resize(ResourceCount, APInt(ResourceCount, 0));
2049     unsigned NumUnits = 0;
2050 
2051     for (unsigned i = 0; i < ResourceCount; ++i) {
2052       ReservedCyclesIndex[i] = NumUnits;
2053       NumUnits += SchedModel->getProcResource(i)->NumUnits;
2054       if (isUnbufferedGroup(i)) {
2055         auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin;
2056         for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits;
2057              U != UE; ++U)
2058           ResourceGroupSubUnitMasks[i].setBit(SubUnits[U]);
2059       }
2060     }
2061 
2062     ReservedCycles.resize(NumUnits, InvalidCycle);
2063   }
2064 }
2065 
2066 /// Compute the stall cycles based on this SUnit's ready time. Heuristics treat
2067 /// these "soft stalls" differently than the hard stall cycles based on CPU
2068 /// resources and computed by checkHazard(). A fully in-order model
2069 /// (MicroOpBufferSize==0) will not make use of this since instructions are not
2070 /// available for scheduling until they are ready. However, a weaker in-order
2071 /// model may use this for heuristics. For example, if a processor has in-order
2072 /// behavior when reading certain resources, this may come into play.
2073 unsigned SchedBoundary::getLatencyStallCycles(SUnit *SU) {
2074   if (!SU->isUnbuffered)
2075     return 0;
2076 
2077   unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
2078   if (ReadyCycle > CurrCycle)
2079     return ReadyCycle - CurrCycle;
2080   return 0;
2081 }
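// For example (illustrative numbers only): an unbuffered SU whose ready cycle
// is 7 queried when CurrCycle == 5 reports 2 stall cycles; a buffered SU
// always reports 0.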
2082 
2083 /// Compute the next cycle at which the given processor resource unit
2084 /// can be scheduled.
2085 unsigned SchedBoundary::getNextResourceCycleByInstance(unsigned InstanceIdx,
2086                                                        unsigned Cycles) {
2087   unsigned NextUnreserved = ReservedCycles[InstanceIdx];
2088   // If this resource has never been used, always return cycle zero.
2089   if (NextUnreserved == InvalidCycle)
2090     return 0;
2091   // For bottom-up scheduling add the cycles needed for the current operation.
2092   if (!isTop())
2093     NextUnreserved += Cycles;
2094   return NextUnreserved;
2095 }
2096 
2097 /// Compute the next cycle at which the given processor resource can be
2098 /// scheduled.  Returns the next cycle and the index of the processor resource
2099 /// instance in the reserved cycles vector.
2100 std::pair<unsigned, unsigned>
2101 SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
2102                                     unsigned Cycles) {
2103 
2104   unsigned MinNextUnreserved = InvalidCycle;
2105   unsigned InstanceIdx = 0;
2106   unsigned StartIndex = ReservedCyclesIndex[PIdx];
2107   unsigned NumberOfInstances = SchedModel->getProcResource(PIdx)->NumUnits;
2108   assert(NumberOfInstances > 0 &&
2109          "Cannot have zero instances of a ProcResource");
2110 
2111   if (isUnbufferedGroup(PIdx)) {
2112     // If any subunits are used by the instruction, report that the resource
2113     // group is available at 0, effectively removing the group record from
2114     // hazarding and basing the hazarding decisions on the subunit records.
2115     // Otherwise, choose the first available instance from among the subunits.
2116     // Specifications which assign cycles to both the subunits and the group or
2117     // which use an unbuffered group with buffered subunits will appear to
2118     // schedule strangely. In the first case, the additional cycles for the
2119     // group will be ignored.  In the second, the group will be ignored
2120     // entirely.
2121     for (const MCWriteProcResEntry &PE :
2122          make_range(SchedModel->getWriteProcResBegin(SC),
2123                     SchedModel->getWriteProcResEnd(SC)))
2124       if (ResourceGroupSubUnitMasks[PIdx][PE.ProcResourceIdx])
2125         return std::make_pair(0u, StartIndex);
2126 
2127     auto SubUnits = SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin;
2128     for (unsigned I = 0, End = NumberOfInstances; I < End; ++I) {
2129       unsigned NextUnreserved, NextInstanceIdx;
2130       std::tie(NextUnreserved, NextInstanceIdx) =
2131           getNextResourceCycle(SC, SubUnits[I], Cycles);
2132       if (MinNextUnreserved > NextUnreserved) {
2133         InstanceIdx = NextInstanceIdx;
2134         MinNextUnreserved = NextUnreserved;
2135       }
2136     }
2137     return std::make_pair(MinNextUnreserved, InstanceIdx);
2138   }
2139 
2140   for (unsigned I = StartIndex, End = StartIndex + NumberOfInstances; I < End;
2141        ++I) {
2142     unsigned NextUnreserved = getNextResourceCycleByInstance(I, Cycles);
2143     if (MinNextUnreserved > NextUnreserved) {
2144       InstanceIdx = I;
2145       MinNextUnreserved = NextUnreserved;
2146     }
2147   }
2148   return std::make_pair(MinNextUnreserved, InstanceIdx);
2149 }
2150 
2151 /// Does this SU have a hazard within the current instruction group.
2152 ///
2153 /// The scheduler supports two modes of hazard recognition. The first is the
2154 /// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
2155 /// supports highly complicated in-order reservation tables
2156 /// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
2157 ///
2158 /// The second is a streamlined mechanism that checks for hazards based on
2159 /// simple counters that the scheduler itself maintains. It explicitly checks
2160 /// for instruction dispatch limitations, including the number of micro-ops that
2161 /// can dispatch per cycle.
2162 ///
2163 /// TODO: Also check whether the SU must start a new group.
2164 bool SchedBoundary::checkHazard(SUnit *SU) {
2165   if (HazardRec->isEnabled()
2166       && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
2167     return true;
2168   }
2169 
2170   unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
2171   if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
2172     LLVM_DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") uops="
2173                       << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
2174     return true;
2175   }
2176 
2177   if (CurrMOps > 0 &&
2178       ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) ||
2179        (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) {
2180     LLVM_DEBUG(dbgs() << "  hazard: SU(" << SU->NodeNum << ") must "
2181                       << (isTop() ? "begin" : "end") << " group\n");
2182     return true;
2183   }
2184 
2185   if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
2186     const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2187     for (const MCWriteProcResEntry &PE :
2188           make_range(SchedModel->getWriteProcResBegin(SC),
2189                      SchedModel->getWriteProcResEnd(SC))) {
2190       unsigned ResIdx = PE.ProcResourceIdx;
2191       unsigned Cycles = PE.Cycles;
2192       unsigned NRCycle, InstanceIdx;
2193       std::tie(NRCycle, InstanceIdx) = getNextResourceCycle(SC, ResIdx, Cycles);
2194       if (NRCycle > CurrCycle) {
2195 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
2196         MaxObservedStall = std::max(Cycles, MaxObservedStall);
2197 #endif
2198         LLVM_DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") "
2199                           << SchedModel->getResourceName(ResIdx)
2200                           << '[' << InstanceIdx - ReservedCyclesIndex[ResIdx]  << ']'
2201                           << "=" << NRCycle << "c\n");
2202         return true;
2203       }
2204     }
2205   }
2206   return false;
2207 }
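// For example (illustrative numbers only): with an issue width of 4,
// CurrMOps == 3 and an SU that expands to 2 micro-ops, 3 + 2 > 4, so the SU is
// reported as a hazard and is held in the Pending queue until bumpCycle()
// starts a new issue group.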
2208 
2209 // Find the unscheduled node in ReadySUs with the highest latency.
2210 unsigned SchedBoundary::
2211 findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
2212   SUnit *LateSU = nullptr;
2213   unsigned RemLatency = 0;
2214   for (SUnit *SU : ReadySUs) {
2215     unsigned L = getUnscheduledLatency(SU);
2216     if (L > RemLatency) {
2217       RemLatency = L;
2218       LateSU = SU;
2219     }
2220   }
2221   if (LateSU) {
2222     LLVM_DEBUG(dbgs() << Available.getName() << " RemLatency SU("
2223                       << LateSU->NodeNum << ") " << RemLatency << "c\n");
2224   }
2225   return RemLatency;
2226 }
2227 
2228 // Count resources in this zone and the remaining unscheduled
2229 // instruction. Return the max count, scaled. Set OtherCritIdx to the critical
2230 // resource index, or zero if the zone is issue limited.
2231 unsigned SchedBoundary::
2232 getOtherResourceCount(unsigned &OtherCritIdx) {
2233   OtherCritIdx = 0;
2234   if (!SchedModel->hasInstrSchedModel())
2235     return 0;
2236 
2237   unsigned OtherCritCount = Rem->RemIssueCount
2238     + (RetiredMOps * SchedModel->getMicroOpFactor());
2239   LLVM_DEBUG(dbgs() << "  " << Available.getName() << " + Remain MOps: "
2240                     << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
2241   for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds();
2242        PIdx != PEnd; ++PIdx) {
2243     unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx];
2244     if (OtherCount > OtherCritCount) {
2245       OtherCritCount = OtherCount;
2246       OtherCritIdx = PIdx;
2247     }
2248   }
2249   if (OtherCritIdx) {
2250     LLVM_DEBUG(
2251         dbgs() << "  " << Available.getName() << " + Remain CritRes: "
2252                << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
2253                << " " << SchedModel->getResourceName(OtherCritIdx) << "\n");
2254   }
2255   return OtherCritCount;
2256 }
2257 
2258 void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle, bool InPQueue,
2259                                 unsigned Idx) {
2260   assert(SU->getInstr() && "Scheduled SUnit must have instr");
2261 
2262 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
2263   // ReadyCycle was bumped up to CurrCycle when this node was
2264   // scheduled, but CurrCycle may have been eagerly advanced immediately after
2265   // scheduling, so may now be greater than ReadyCycle.
2266   if (ReadyCycle > CurrCycle)
2267     MaxObservedStall = std::max(ReadyCycle - CurrCycle, MaxObservedStall);
2268 #endif
2269 
2270   if (ReadyCycle < MinReadyCycle)
2271     MinReadyCycle = ReadyCycle;
2272 
2273   // Check for interlocks first. For the purpose of other heuristics, an
2274   // instruction that cannot issue appears as if it's not in the ReadyQueue.
2275   bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
2276   bool HazardDetected = (!IsBuffered && ReadyCycle > CurrCycle) ||
2277                         checkHazard(SU) || (Available.size() >= ReadyListLimit);
2278 
2279   if (!HazardDetected) {
2280     Available.push(SU);
2281 
2282     if (InPQueue)
2283       Pending.remove(Pending.begin() + Idx);
2284     return;
2285   }
2286 
2287   if (!InPQueue)
2288     Pending.push(SU);
2289 }
2290 
2291 /// Move the boundary of scheduled code by one cycle.
2292 void SchedBoundary::bumpCycle(unsigned NextCycle) {
2293   if (SchedModel->getMicroOpBufferSize() == 0) {
2294     assert(MinReadyCycle < std::numeric_limits<unsigned>::max() &&
2295            "MinReadyCycle uninitialized");
2296     if (MinReadyCycle > NextCycle)
2297       NextCycle = MinReadyCycle;
2298   }
2299   // Update the current micro-ops, which will issue in the next cycle.
2300   unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle);
2301   CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps;
2302 
2303   // Decrement DependentLatency based on the next cycle.
2304   if ((NextCycle - CurrCycle) > DependentLatency)
2305     DependentLatency = 0;
2306   else
2307     DependentLatency -= (NextCycle - CurrCycle);
2308 
2309   if (!HazardRec->isEnabled()) {
2310     // Bypass HazardRec virtual calls.
2311     CurrCycle = NextCycle;
2312   } else {
2313     // Bypass getHazardType calls in case of long latency.
2314     for (; CurrCycle != NextCycle; ++CurrCycle) {
2315       if (isTop())
2316         HazardRec->AdvanceCycle();
2317       else
2318         HazardRec->RecedeCycle();
2319     }
2320   }
2321   CheckPending = true;
2322   IsResourceLimited =
2323       checkResourceLimit(SchedModel->getLatencyFactor(), getCriticalCount(),
2324                          getScheduledLatency(), true);
2325 
2326   LLVM_DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName()
2327                     << '\n');
2328 }
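// For example (illustrative numbers only): with an issue width of 4, advancing
// from cycle 10 to cycle 12 gives DecMOps == 8, so any CurrMOps value of 8 or
// less is cleared to 0, and DependentLatency shrinks by 2 (or to 0).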
2329 
2330 void SchedBoundary::incExecutedResources(unsigned PIdx, unsigned Count) {
2331   ExecutedResCounts[PIdx] += Count;
2332   if (ExecutedResCounts[PIdx] > MaxExecutedResCount)
2333     MaxExecutedResCount = ExecutedResCounts[PIdx];
2334 }
2335 
2336 /// Add the given processor resource to this scheduled zone.
2337 ///
2338 /// \param Cycles indicates the number of consecutive (non-pipelined) cycles
2339 /// during which this resource is consumed.
2340 ///
2341 /// \return the next cycle at which the instruction may execute without
2342 /// oversubscribing resources.
2343 unsigned SchedBoundary::countResource(const MCSchedClassDesc *SC, unsigned PIdx,
2344                                       unsigned Cycles, unsigned NextCycle) {
2345   unsigned Factor = SchedModel->getResourceFactor(PIdx);
2346   unsigned Count = Factor * Cycles;
2347   LLVM_DEBUG(dbgs() << "  " << SchedModel->getResourceName(PIdx) << " +"
2348                     << Cycles << "x" << Factor << "u\n");
2349 
2350   // Update Executed resources counts.
2351   incExecutedResources(PIdx, Count);
2352   assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
2353   Rem->RemainingCounts[PIdx] -= Count;
2354 
2355   // Check if this resource exceeds the current critical resource. If so, it
2356   // becomes the critical resource.
2357   if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) {
2358     ZoneCritResIdx = PIdx;
2359     LLVM_DEBUG(dbgs() << "  *** Critical resource "
2360                       << SchedModel->getResourceName(PIdx) << ": "
2361                       << getResourceCount(PIdx) / SchedModel->getLatencyFactor()
2362                       << "c\n");
2363   }
2364   // For reserved resources, record the highest cycle using the resource.
2365   unsigned NextAvailable, InstanceIdx;
2366   std::tie(NextAvailable, InstanceIdx) = getNextResourceCycle(SC, PIdx, Cycles);
2367   if (NextAvailable > CurrCycle) {
2368     LLVM_DEBUG(dbgs() << "  Resource conflict: "
2369                       << SchedModel->getResourceName(PIdx)
2370                       << '[' << InstanceIdx - ReservedCyclesIndex[PIdx]  << ']'
2371                       << " reserved until @" << NextAvailable << "\n");
2372   }
2373   return NextAvailable;
2374 }
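// For example (illustrative numbers only): if getResourceFactor(PIdx) is 2 and
// the instruction occupies the resource for 3 cycles, Count == 6 scaled units
// are added to ExecutedResCounts[PIdx] and subtracted from
// Rem->RemainingCounts[PIdx].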
2375 
2376 /// Move the boundary of scheduled code by one SUnit.
2377 void SchedBoundary::bumpNode(SUnit *SU) {
2378   // Update the reservation table.
2379   if (HazardRec->isEnabled()) {
2380     if (!isTop() && SU->isCall) {
2381       // Calls are scheduled with their preceding instructions. For bottom-up
2382       // scheduling, clear the pipeline state before emitting.
2383       HazardRec->Reset();
2384     }
2385     HazardRec->EmitInstruction(SU);
2386     // Scheduling an instruction may have made pending instructions available.
2387     CheckPending = true;
2388   }
2389   // checkHazard should prevent scheduling multiple instructions per cycle that
2390   // exceed the issue width.
2391   const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2392   unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr());
2393   assert(
2394       (CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth()) &&
2395       "Cannot schedule this instruction's MicroOps in the current cycle.");
2396 
2397   unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
2398   LLVM_DEBUG(dbgs() << "  Ready @" << ReadyCycle << "c\n");
2399 
2400   unsigned NextCycle = CurrCycle;
2401   switch (SchedModel->getMicroOpBufferSize()) {
2402   case 0:
2403     assert(ReadyCycle <= CurrCycle && "Broken PendingQueue");
2404     break;
2405   case 1:
2406     if (ReadyCycle > NextCycle) {
2407       NextCycle = ReadyCycle;
2408       LLVM_DEBUG(dbgs() << "  *** Stall until: " << ReadyCycle << "\n");
2409     }
2410     break;
2411   default:
2412     // We don't currently model the OOO reorder buffer, so consider all
2413     // scheduled MOps to be "retired". We do loosely model in-order resource
2414     // latency. If this instruction uses an in-order resource, account for any
2415     // likely stall cycles.
2416     if (SU->isUnbuffered && ReadyCycle > NextCycle)
2417       NextCycle = ReadyCycle;
2418     break;
2419   }
2420   RetiredMOps += IncMOps;
2421 
2422   // Update resource counts and critical resource.
2423   if (SchedModel->hasInstrSchedModel()) {
2424     unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor();
2425     assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted");
2426     Rem->RemIssueCount -= DecRemIssue;
2427     if (ZoneCritResIdx) {
2428       // Scale scheduled micro-ops for comparing with the critical resource.
2429       unsigned ScaledMOps =
2430         RetiredMOps * SchedModel->getMicroOpFactor();
2431 
2432       // If scaled micro-ops are now more than the previous critical resource by
2433       // a full cycle, then micro-ops issue becomes critical.
2434       if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx))
2435           >= (int)SchedModel->getLatencyFactor()) {
2436         ZoneCritResIdx = 0;
2437         LLVM_DEBUG(dbgs() << "  *** Critical resource NumMicroOps: "
2438                           << ScaledMOps / SchedModel->getLatencyFactor()
2439                           << "c\n");
2440       }
2441     }
2442     for (TargetSchedModel::ProcResIter
2443            PI = SchedModel->getWriteProcResBegin(SC),
2444            PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2445       unsigned RCycle =
2446         countResource(SC, PI->ProcResourceIdx, PI->Cycles, NextCycle);
2447       if (RCycle > NextCycle)
2448         NextCycle = RCycle;
2449     }
2450     if (SU->hasReservedResource) {
2451       // For reserved resources, record the highest cycle using the resource.
2452       // For top-down scheduling, this is the cycle in which we schedule this
2453       // instruction plus the number of cycles the operations reserves the
2454       // resource. For bottom-up is it simply the instruction's cycle.
2455       for (TargetSchedModel::ProcResIter
2456              PI = SchedModel->getWriteProcResBegin(SC),
2457              PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2458         unsigned PIdx = PI->ProcResourceIdx;
2459         if (SchedModel->getProcResource(PIdx)->BufferSize == 0) {
2460           unsigned ReservedUntil, InstanceIdx;
2461           std::tie(ReservedUntil, InstanceIdx) =
2462               getNextResourceCycle(SC, PIdx, 0);
2463           if (isTop()) {
2464             ReservedCycles[InstanceIdx] =
2465                 std::max(ReservedUntil, NextCycle + PI->Cycles);
2466           } else
2467             ReservedCycles[InstanceIdx] = NextCycle;
2468         }
2469       }
2470     }
2471   }
2472   // Update ExpectedLatency and DependentLatency.
2473   unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency;
2474   unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency;
2475   if (SU->getDepth() > TopLatency) {
2476     TopLatency = SU->getDepth();
2477     LLVM_DEBUG(dbgs() << "  " << Available.getName() << " TopLatency SU("
2478                       << SU->NodeNum << ") " << TopLatency << "c\n");
2479   }
2480   if (SU->getHeight() > BotLatency) {
2481     BotLatency = SU->getHeight();
2482     LLVM_DEBUG(dbgs() << "  " << Available.getName() << " BotLatency SU("
2483                       << SU->NodeNum << ") " << BotLatency << "c\n");
2484   }
2485   // If we stall for any reason, bump the cycle.
2486   if (NextCycle > CurrCycle)
2487     bumpCycle(NextCycle);
2488   else
2489     // After updating ZoneCritResIdx and ExpectedLatency, check if we're
2490     // resource limited. If a stall occurred, bumpCycle does this.
2491     IsResourceLimited =
2492         checkResourceLimit(SchedModel->getLatencyFactor(), getCriticalCount(),
2493                            getScheduledLatency(), true);
2494 
2495   // Update CurrMOps after calling bumpCycle to handle stalls, since bumpCycle
2496   // resets CurrMOps. Loop to handle instructions with more MOps than issue in
2497   // one cycle.  Since we commonly reach the max MOps here, opportunistically
2498   // bump the cycle to avoid uselessly checking everything in the readyQ.
2499   CurrMOps += IncMOps;
2500 
2501   // Bump the cycle count for issue group constraints.
2502   // This must be done after NextCycle has been adjusted for all other stalls.
2503   // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set
2504   // currCycle to X.
2505   if ((isTop() &&  SchedModel->mustEndGroup(SU->getInstr())) ||
2506       (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) {
2507     LLVM_DEBUG(dbgs() << "  Bump cycle to " << (isTop() ? "end" : "begin")
2508                       << " group\n");
2509     bumpCycle(++NextCycle);
2510   }
2511 
2512   while (CurrMOps >= SchedModel->getIssueWidth()) {
2513     LLVM_DEBUG(dbgs() << "  *** Max MOps " << CurrMOps << " at cycle "
2514                       << CurrCycle << '\n');
2515     bumpCycle(++NextCycle);
2516   }
2517   LLVM_DEBUG(dumpScheduledState());
2518 }
2519 
2520 /// Release pending ready nodes into the available queue. This makes them
2521 /// visible to heuristics.
2522 void SchedBoundary::releasePending() {
2523   // If the available queue is empty, it is safe to reset MinReadyCycle.
2524   if (Available.empty())
2525     MinReadyCycle = std::numeric_limits<unsigned>::max();
2526 
2527   // Check to see if any of the pending instructions are ready to issue.  If
2528   // so, add them to the available queue.
2529   for (unsigned I = 0, E = Pending.size(); I < E; ++I) {
2530     SUnit *SU = *(Pending.begin() + I);
2531     unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
2532 
2533     if (ReadyCycle < MinReadyCycle)
2534       MinReadyCycle = ReadyCycle;
2535 
2536     if (Available.size() >= ReadyListLimit)
2537       break;
2538 
2539     releaseNode(SU, ReadyCycle, true, I);
2540     if (E != Pending.size()) {
2541       --I;
2542       --E;
2543     }
2544   }
2545   CheckPending = false;
2546 }
2547 
2548 /// Remove SU from the ready set for this boundary.
2549 void SchedBoundary::removeReady(SUnit *SU) {
2550   if (Available.isInQueue(SU))
2551     Available.remove(Available.find(SU));
2552   else {
2553     assert(Pending.isInQueue(SU) && "bad ready count");
2554     Pending.remove(Pending.find(SU));
2555   }
2556 }
2557 
2558 /// If this queue only has one ready candidate, return it. As a side effect,
2559 /// defer any nodes that now hit a hazard, and advance the cycle until at least
2560 /// one node is ready. If multiple instructions are ready, return NULL.
2561 SUnit *SchedBoundary::pickOnlyChoice() {
2562   if (CheckPending)
2563     releasePending();
2564 
2565   // Defer any ready instrs that now have a hazard.
2566   for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
2567     if (checkHazard(*I)) {
2568       Pending.push(*I);
2569       I = Available.remove(I);
2570       continue;
2571     }
2572     ++I;
2573   }
2574   for (unsigned i = 0; Available.empty(); ++i) {
2575 //  FIXME: Re-enable assert once PR20057 is resolved.
2576 //    assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedStall) &&
2577 //           "permanent hazard");
2578     (void)i;
2579     bumpCycle(CurrCycle + 1);
2580     releasePending();
2581   }
2582 
2583   LLVM_DEBUG(Pending.dump());
2584   LLVM_DEBUG(Available.dump());
2585 
2586   if (Available.size() == 1)
2587     return *Available.begin();
2588   return nullptr;
2589 }
2590 
2591 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2592 // This is useful information to dump after bumpNode.
2593 // Note that the Queue contents are more useful before pickNodeFromQueue.
2594 LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() const {
2595   unsigned ResFactor;
2596   unsigned ResCount;
2597   if (ZoneCritResIdx) {
2598     ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx);
2599     ResCount = getResourceCount(ZoneCritResIdx);
2600   } else {
2601     ResFactor = SchedModel->getMicroOpFactor();
2602     ResCount = RetiredMOps * ResFactor;
2603   }
2604   unsigned LFactor = SchedModel->getLatencyFactor();
2605   dbgs() << Available.getName() << " @" << CurrCycle << "c\n"
2606          << "  Retired: " << RetiredMOps;
2607   dbgs() << "\n  Executed: " << getExecutedCount() / LFactor << "c";
2608   dbgs() << "\n  Critical: " << ResCount / LFactor << "c, "
2609          << ResCount / ResFactor << " "
2610          << SchedModel->getResourceName(ZoneCritResIdx)
2611          << "\n  ExpectedLatency: " << ExpectedLatency << "c\n"
2612          << (IsResourceLimited ? "  - Resource" : "  - Latency")
2613          << " limited.\n";
2614 }
2615 #endif
2616 
2617 //===----------------------------------------------------------------------===//
2618 // GenericScheduler - Generic implementation of MachineSchedStrategy.
2619 //===----------------------------------------------------------------------===//
2620 
2621 void GenericSchedulerBase::SchedCandidate::
2622 initResourceDelta(const ScheduleDAGMI *DAG,
2623                   const TargetSchedModel *SchedModel) {
2624   if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
2625     return;
2626 
2627   const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2628   for (TargetSchedModel::ProcResIter
2629          PI = SchedModel->getWriteProcResBegin(SC),
2630          PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2631     if (PI->ProcResourceIdx == Policy.ReduceResIdx)
2632       ResDelta.CritResources += PI->Cycles;
2633     if (PI->ProcResourceIdx == Policy.DemandResIdx)
2634       ResDelta.DemandedResources += PI->Cycles;
2635   }
2636 }
2637 
2638 /// Compute remaining latency. We need this both to determine whether the
2639 /// overall schedule has become latency-limited and whether the instructions
2640 /// outside this zone are resource or latency limited.
2641 ///
2642 /// The "dependent" latency is updated incrementally during scheduling as the
2643 /// max height/depth of scheduled nodes minus the cycles since it was
2644 /// scheduled:
2645 ///   DLat = max(N.depth - (CurrCycle - N.ReadyCycle)) for N in Zone
2646 ///
2647 /// The "independent" latency is the max ready queue depth:
2648 ///   ILat = max N.depth for N in Available|Pending
2649 ///
2650 /// RemainingLatency is the greater of independent and dependent latency.
2651 ///
2652 /// These computations are expensive, especially in DAGs with many edges, so
2653 /// only do them if necessary.
2654 static unsigned computeRemLatency(SchedBoundary &CurrZone) {
2655   unsigned RemLatency = CurrZone.getDependentLatency();
2656   RemLatency = std::max(RemLatency,
2657                         CurrZone.findMaxLatency(CurrZone.Available.elements()));
2658   RemLatency = std::max(RemLatency,
2659                         CurrZone.findMaxLatency(CurrZone.Pending.elements()));
2660   return RemLatency;
2661 }
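// For example (illustrative numbers only): if the zone's dependent latency is
// currently 5 and the deepest node still sitting in Available or Pending has
// 7 cycles of unscheduled latency, RemainingLatency == max(5, 7) == 7.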
2662 
2663 /// Returns true if the current cycle plus remaining latency is greater than
2664 /// the critical path in the scheduling region.
2665 bool GenericSchedulerBase::shouldReduceLatency(const CandPolicy &Policy,
2666                                                SchedBoundary &CurrZone,
2667                                                bool ComputeRemLatency,
2668                                                unsigned &RemLatency) const {
2669   // The current cycle is already greater than the critical path, so we are
2670   // already latency limited and don't need to compute the remaining latency.
2671   if (CurrZone.getCurrCycle() > Rem.CriticalPath)
2672     return true;
2673 
2674   // If we haven't scheduled anything yet, then we aren't latency limited.
2675   if (CurrZone.getCurrCycle() == 0)
2676     return false;
2677 
2678   if (ComputeRemLatency)
2679     RemLatency = computeRemLatency(CurrZone);
2680 
2681   return RemLatency + CurrZone.getCurrCycle() > Rem.CriticalPath;
2682 }
2683 
2684 /// Set the CandPolicy for a scheduling zone given the current resources and
2685 /// latencies inside and outside the zone.
2686 void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA,
2687                                      SchedBoundary &CurrZone,
2688                                      SchedBoundary *OtherZone) {
2689   // Apply preemptive heuristics based on the total latency and resources
2690   // inside and outside this zone. Potential stalls should be considered before
2691   // following this policy.
2692 
2693   // Compute the critical resource outside the zone.
2694   unsigned OtherCritIdx = 0;
2695   unsigned OtherCount =
2696     OtherZone ? OtherZone->getOtherResourceCount(OtherCritIdx) : 0;
2697 
2698   bool OtherResLimited = false;
2699   unsigned RemLatency = 0;
2700   bool RemLatencyComputed = false;
2701   if (SchedModel->hasInstrSchedModel() && OtherCount != 0) {
2702     RemLatency = computeRemLatency(CurrZone);
2703     RemLatencyComputed = true;
2704     OtherResLimited = checkResourceLimit(SchedModel->getLatencyFactor(),
2705                                          OtherCount, RemLatency, false);
2706   }
2707 
2708   // Schedule aggressively for latency in PostRA mode. We don't check for
2709   // acyclic latency during PostRA, and highly out-of-order processors will
2710   // skip PostRA scheduling.
2711   if (!OtherResLimited &&
2712       (IsPostRA || shouldReduceLatency(Policy, CurrZone, !RemLatencyComputed,
2713                                        RemLatency))) {
2714     Policy.ReduceLatency |= true;
2715     LLVM_DEBUG(dbgs() << "  " << CurrZone.Available.getName()
2716                       << " RemainingLatency " << RemLatency << " + "
2717                       << CurrZone.getCurrCycle() << "c > CritPath "
2718                       << Rem.CriticalPath << "\n");
2719   }
2720   // If the same resource is limiting inside and outside the zone, do nothing.
2721   if (CurrZone.getZoneCritResIdx() == OtherCritIdx)
2722     return;
2723 
2724   LLVM_DEBUG(if (CurrZone.isResourceLimited()) {
2725     dbgs() << "  " << CurrZone.Available.getName() << " ResourceLimited: "
2726            << SchedModel->getResourceName(CurrZone.getZoneCritResIdx()) << "\n";
2727   } if (OtherResLimited) dbgs()
2728                  << "  RemainingLimit: "
2729                  << SchedModel->getResourceName(OtherCritIdx) << "\n";
2730              if (!CurrZone.isResourceLimited() && !OtherResLimited) dbgs()
2731              << "  Latency limited both directions.\n");
2732 
2733   if (CurrZone.isResourceLimited() && !Policy.ReduceResIdx)
2734     Policy.ReduceResIdx = CurrZone.getZoneCritResIdx();
2735 
2736   if (OtherResLimited)
2737     Policy.DemandResIdx = OtherCritIdx;
2738 }
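// One possible outcome, sketched with hypothetical resources: if the bottom
// zone is limited by an ALU-like resource while the unscheduled remainder is
// limited by a load/store-like unit, the policy ends up with ReduceResIdx
// naming the ALU resource and DemandResIdx naming the load/store unit, which
// later biases candidate comparisons toward rebalancing the two.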
2739 
2740 #ifndef NDEBUG
2741 const char *GenericSchedulerBase::getReasonStr(
2742   GenericSchedulerBase::CandReason Reason) {
2743   switch (Reason) {
2744   case NoCand:         return "NOCAND    ";
2745   case Only1:          return "ONLY1     ";
2746   case PhysReg:        return "PHYS-REG  ";
2747   case RegExcess:      return "REG-EXCESS";
2748   case RegCritical:    return "REG-CRIT  ";
2749   case Stall:          return "STALL     ";
2750   case Cluster:        return "CLUSTER   ";
2751   case Weak:           return "WEAK      ";
2752   case RegMax:         return "REG-MAX   ";
2753   case ResourceReduce: return "RES-REDUCE";
2754   case ResourceDemand: return "RES-DEMAND";
2755   case TopDepthReduce: return "TOP-DEPTH ";
2756   case TopPathReduce:  return "TOP-PATH  ";
2757   case BotHeightReduce:return "BOT-HEIGHT";
2758   case BotPathReduce:  return "BOT-PATH  ";
2759   case NextDefUse:     return "DEF-USE   ";
2760   case NodeOrder:      return "ORDER     ";
2761   }
2762   llvm_unreachable("Unknown reason!");
2763 }
2764 
2765 void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
2766   PressureChange P;
2767   unsigned ResIdx = 0;
2768   unsigned Latency = 0;
2769   switch (Cand.Reason) {
2770   default:
2771     break;
2772   case RegExcess:
2773     P = Cand.RPDelta.Excess;
2774     break;
2775   case RegCritical:
2776     P = Cand.RPDelta.CriticalMax;
2777     break;
2778   case RegMax:
2779     P = Cand.RPDelta.CurrentMax;
2780     break;
2781   case ResourceReduce:
2782     ResIdx = Cand.Policy.ReduceResIdx;
2783     break;
2784   case ResourceDemand:
2785     ResIdx = Cand.Policy.DemandResIdx;
2786     break;
2787   case TopDepthReduce:
2788     Latency = Cand.SU->getDepth();
2789     break;
2790   case TopPathReduce:
2791     Latency = Cand.SU->getHeight();
2792     break;
2793   case BotHeightReduce:
2794     Latency = Cand.SU->getHeight();
2795     break;
2796   case BotPathReduce:
2797     Latency = Cand.SU->getDepth();
2798     break;
2799   }
2800   dbgs() << "  Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
2801   if (P.isValid())
2802     dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
2803            << ":" << P.getUnitInc() << " ";
2804   else
2805     dbgs() << "      ";
2806   if (ResIdx)
2807     dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " ";
2808   else
2809     dbgs() << "         ";
2810   if (Latency)
2811     dbgs() << " " << Latency << " cycles ";
2812   else
2813     dbgs() << "          ";
2814   dbgs() << '\n';
2815 }
2816 #endif
2817 
2818 namespace llvm {
2819 /// Return true if this heuristic determines order.
2820 /// TODO: Consider refactoring the return type of these functions to an integer
2821 /// or enum, as we may need to differentiate whether TryCand is better than Cand.
2822 bool tryLess(int TryVal, int CandVal,
2823              GenericSchedulerBase::SchedCandidate &TryCand,
2824              GenericSchedulerBase::SchedCandidate &Cand,
2825              GenericSchedulerBase::CandReason Reason) {
2826   if (TryVal < CandVal) {
2827     TryCand.Reason = Reason;
2828     return true;
2829   }
2830   if (TryVal > CandVal) {
2831     if (Cand.Reason > Reason)
2832       Cand.Reason = Reason;
2833     return true;
2834   }
2835   return false;
2836 }
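// Semantics sketch (illustrative values): tryLess(2, 3, TryCand, Cand, Stall)
// sets TryCand.Reason = Stall and returns true, so TryCand wins;
// tryLess(3, 2, ...) also returns true but leaves TryCand.Reason as NoCand
// (possibly downgrading Cand.Reason to Stall), so the caller keeps Cand;
// tryLess(3, 3, ...) returns false and defers to the next heuristic.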
2837 
2838 bool tryGreater(int TryVal, int CandVal,
2839                 GenericSchedulerBase::SchedCandidate &TryCand,
2840                 GenericSchedulerBase::SchedCandidate &Cand,
2841                 GenericSchedulerBase::CandReason Reason) {
2842   if (TryVal > CandVal) {
2843     TryCand.Reason = Reason;
2844     return true;
2845   }
2846   if (TryVal < CandVal) {
2847     if (Cand.Reason > Reason)
2848       Cand.Reason = Reason;
2849     return true;
2850   }
2851   return false;
2852 }
2853 
2854 bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
2855                 GenericSchedulerBase::SchedCandidate &Cand,
2856                 SchedBoundary &Zone) {
2857   if (Zone.isTop()) {
2858     // Prefer the candidate with the lesser depth, but only if one of them has
2859     // depth greater than the total latency scheduled so far; otherwise either
2860     // of them could be scheduled now with no stall.
2861     if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) >
2862         Zone.getScheduledLatency()) {
2863       if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2864                   TryCand, Cand, GenericSchedulerBase::TopDepthReduce))
2865         return true;
2866     }
2867     if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2868                    TryCand, Cand, GenericSchedulerBase::TopPathReduce))
2869       return true;
2870   } else {
2871     // Prefer the candidate with the lesser height, but only if one of them has
2872     // height greater than the total latency scheduled so far; otherwise either
2873     // of them could be scheduled now with no stall.
2874     if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) >
2875         Zone.getScheduledLatency()) {
2876       if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2877                   TryCand, Cand, GenericSchedulerBase::BotHeightReduce))
2878         return true;
2879     }
2880     if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2881                    TryCand, Cand, GenericSchedulerBase::BotPathReduce))
2882       return true;
2883   }
2884   return false;
2885 }
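// Illustrative top-zone comparison (assumed latencies): with 12 cycles already
// scheduled, a TryCand of depth 15 against a Cand of depth 9 trips the depth
// check (15 > 12) and the lesser depth wins via TopDepthReduce; if both depths
// were at or below 12, the candidate with the greater height (longer remaining
// path to ExitSU) would win via TopPathReduce instead.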
2886 } // end namespace llvm
2887 
2888 static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) {
2889   LLVM_DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
2890                     << GenericSchedulerBase::getReasonStr(Reason) << '\n');
2891 }
2892 
2893 static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) {
2894   tracePick(Cand.Reason, Cand.AtTop);
2895 }
2896 
2897 void GenericScheduler::initialize(ScheduleDAGMI *dag) {
2898   assert(dag->hasVRegLiveness() &&
2899          "(PreRA)GenericScheduler needs vreg liveness");
2900   DAG = static_cast<ScheduleDAGMILive*>(dag);
2901   SchedModel = DAG->getSchedModel();
2902   TRI = DAG->TRI;
2903 
2904   if (RegionPolicy.ComputeDFSResult)
2905     DAG->computeDFSResult();
2906 
2907   Rem.init(DAG, SchedModel);
2908   Top.init(DAG, SchedModel, &Rem);
2909   Bot.init(DAG, SchedModel, &Rem);
2910 
2911   // Initialize resource counts.
2912 
2913   // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
2914   // are disabled, then these HazardRecs will be disabled.
2915   const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
2916   if (!Top.HazardRec) {
2917     Top.HazardRec =
2918         DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
2919             Itin, DAG);
2920   }
2921   if (!Bot.HazardRec) {
2922     Bot.HazardRec =
2923         DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
2924             Itin, DAG);
2925   }
2926   TopCand.SU = nullptr;
2927   BotCand.SU = nullptr;
2928 }
2929 
2930 /// Initialize the per-region scheduling policy.
2931 void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
2932                                   MachineBasicBlock::iterator End,
2933                                   unsigned NumRegionInstrs) {
2934   const MachineFunction &MF = *Begin->getMF();
2935   const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
2936 
2937   // Avoid setting up the register pressure tracker for small regions to save
2938   // compile time. As a rough heuristic, only track pressure when the number of
2939   // schedulable instructions exceeds half the integer register file.
2940   RegionPolicy.ShouldTrackPressure = true;
2941   for (unsigned VT = MVT::i32; VT > (unsigned)MVT::i1; --VT) {
2942     MVT::SimpleValueType LegalIntVT = (MVT::SimpleValueType)VT;
2943     if (TLI->isTypeLegal(LegalIntVT)) {
2944       unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs(
2945         TLI->getRegClassFor(LegalIntVT));
2946       RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2);
2947     }
2948   }
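  // Rough numeric sketch (hypothetical target): if the register class used for
  // the legal integer type has 32 allocatable registers, pressure tracking is
  // enabled only for regions with more than 32 / 2 = 16 instructions.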
2949 
2950   // For generic targets, default to bottom-up scheduling: it is simpler, and more
2951   // of the compile-time optimizations have been implemented for that direction.
2952   RegionPolicy.OnlyBottomUp = true;
2953 
2954   // Allow the subtarget to override default policy.
2955   MF.getSubtarget().overrideSchedPolicy(RegionPolicy, NumRegionInstrs);
2956 
2957   // After subtarget overrides, apply command line options.
2958   if (!EnableRegPressure) {
2959     RegionPolicy.ShouldTrackPressure = false;
2960     RegionPolicy.ShouldTrackLaneMasks = false;
2961   }
2962 
2963   // Check whether -misched-topdown/bottomup can force or unforce the scheduling
2964   // direction, e.g. -misched-bottomup=false allows scheduling in both directions.
2965   assert((!ForceTopDown || !ForceBottomUp) &&
2966          "-misched-topdown incompatible with -misched-bottomup");
2967   if (ForceBottomUp.getNumOccurrences() > 0) {
2968     RegionPolicy.OnlyBottomUp = ForceBottomUp;
2969     if (RegionPolicy.OnlyBottomUp)
2970       RegionPolicy.OnlyTopDown = false;
2971   }
2972   if (ForceTopDown.getNumOccurrences() > 0) {
2973     RegionPolicy.OnlyTopDown = ForceTopDown;
2974     if (RegionPolicy.OnlyTopDown)
2975       RegionPolicy.OnlyBottomUp = false;
2976   }
2977 }
2978 
2979 void GenericScheduler::dumpPolicy() const {
2980   // Cannot completely remove virtual function even in release mode.
2981 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2982   dbgs() << "GenericScheduler RegionPolicy: "
2983          << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure
2984          << " OnlyTopDown=" << RegionPolicy.OnlyTopDown
2985          << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp
2986          << "\n";
2987 #endif
2988 }
2989 
2990 /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
2991 /// critical path by more cycles than it takes to drain the instruction buffer.
2992 /// We estimate an upper bound on in-flight instructions as:
2993 ///
2994 /// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
2995 /// InFlightIterations = AcyclicPath / CyclesPerIteration
2996 /// InFlightResources = InFlightIterations * LoopResources
2997 ///
2998 /// TODO: Check execution resources in addition to IssueCount.
2999 void GenericScheduler::checkAcyclicLatency() {
3000   if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
3001     return;
3002 
3003   // Scaled number of cycles per loop iteration.
3004   unsigned IterCount =
3005     std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
3006              Rem.RemIssueCount);
3007   // Scaled acyclic critical path.
3008   unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
3009   // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
3010   unsigned InFlightCount =
3011     (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
3012   unsigned BufferLimit =
3013     SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
3014 
3015   Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
3016 
3017   LLVM_DEBUG(
3018       dbgs() << "IssueCycles="
3019              << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
3020              << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
3021              << "c NumIters=" << (AcyclicCount + IterCount - 1) / IterCount
3022              << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
3023              << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
3024       if (Rem.IsAcyclicLatencyLimited) dbgs() << "  ACYCLIC LATENCY LIMIT\n");
3025 }
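// Worked example with invented counts: if the scaled cycles per iteration
// (IterCount) are 40, the scaled acyclic path (AcyclicCount) is 120, and
// Rem.RemIssueCount is 20, then InFlightCount = ceil(120 * 20 / 40) = 60;
// with a scaled micro-op buffer limit of only 48, the loop would be flagged
// as acyclic latency limited.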
3026 
3027 void GenericScheduler::registerRoots() {
3028   Rem.CriticalPath = DAG->ExitSU.getDepth();
3029 
3030   // Some roots may not feed into ExitSU. Check all of them just in case.
3031   for (const SUnit *SU : Bot.Available) {
3032     if (SU->getDepth() > Rem.CriticalPath)
3033       Rem.CriticalPath = SU->getDepth();
3034   }
3035   LLVM_DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n');
3036   if (DumpCriticalPathLength) {
3037     errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n";
3038   }
3039 
3040   if (EnableCyclicPath && SchedModel->getMicroOpBufferSize() > 0) {
3041     Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
3042     checkAcyclicLatency();
3043   }
3044 }
3045 
3046 namespace llvm {
3047 bool tryPressure(const PressureChange &TryP,
3048                  const PressureChange &CandP,
3049                  GenericSchedulerBase::SchedCandidate &TryCand,
3050                  GenericSchedulerBase::SchedCandidate &Cand,
3051                  GenericSchedulerBase::CandReason Reason,
3052                  const TargetRegisterInfo *TRI,
3053                  const MachineFunction &MF) {
3054   // If one candidate decreases and the other increases, go with it.
3055   // Invalid candidates have UnitInc==0.
3056   if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
3057                  Reason)) {
3058     return true;
3059   }
3060   // Do not compare the magnitude of pressure changes between top and bottom
3061   // boundary.
3062   if (Cand.AtTop != TryCand.AtTop)
3063     return false;
3064 
3065   // If both candidates affect the same set in the same boundary, go with the
3066   // smallest increase.
3067   unsigned TryPSet = TryP.getPSetOrMax();
3068   unsigned CandPSet = CandP.getPSetOrMax();
3069   if (TryPSet == CandPSet) {
3070     return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
3071                    Reason);
3072   }
3073 
3074   int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) :
3075                                  std::numeric_limits<int>::max();
3076 
3077   int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) :
3078                                    std::numeric_limits<int>::max();
3079 
3080   // If the candidates are decreasing pressure, reverse priority.
3081   if (TryP.getUnitInc() < 0)
3082     std::swap(TryRank, CandRank);
3083   return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
3084 }
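// Decision sketch with assumed pressure deltas: a TryCand that decreases some
// set by 1 unit beats a Cand that increases one by 2 on the first check; if
// both increase the same set, by 2 and 3 units, the smaller increase (2)
// wins; only when the candidates touch different pressure sets does the
// target's pressure-set score break the tie.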
3085 
3086 unsigned getWeakLeft(const SUnit *SU, bool isTop) {
3087   return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
3088 }
3089 
3090 /// Minimize physical register live ranges. Regalloc wants them adjacent to
3091 /// their physreg def/use.
3092 ///
3093 /// FIXME: This is an unnecessary check on the critical path. Most are root/leaf
3094 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
3095 /// with the operation that produces or consumes the physreg. We'll do this when
3096 /// regalloc has support for parallel copies.
3097 int biasPhysReg(const SUnit *SU, bool isTop) {
3098   const MachineInstr *MI = SU->getInstr();
3099 
3100   if (MI->isCopy()) {
3101     unsigned ScheduledOper = isTop ? 1 : 0;
3102     unsigned UnscheduledOper = isTop ? 0 : 1;
3103     // If we have already scheduled the physreg producer/consumer, immediately
3104     // schedule the copy.
3105     if (Register::isPhysicalRegister(MI->getOperand(ScheduledOper).getReg()))
3106       return 1;
3107     // If the physreg is at the boundary, defer it. Otherwise schedule it
3108     // immediately to free the dependent. We can hoist the copy later.
3109     bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
3110     if (Register::isPhysicalRegister(MI->getOperand(UnscheduledOper).getReg()))
3111       return AtBoundary ? -1 : 1;
3112   }
3113 
3114   if (MI->isMoveImmediate()) {
3115     // If we have a move immediate and all successors have been assigned, bias
3116     // towards scheduling this later. Make sure all register defs are to
3117     // physical registers.
3118     bool DoBias = true;
3119     for (const MachineOperand &Op : MI->defs()) {
3120       if (Op.isReg() && !Register::isPhysicalRegister(Op.getReg())) {
3121         DoBias = false;
3122         break;
3123       }
3124     }
3125 
3126     if (DoBias)
3127       return isTop ? -1 : 1;
3128   }
3129 
3130   return 0;
3131 }
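// Example reading of the bias for bottom-up scheduling (isTop == false,
// hypothetical COPY): a COPY whose def operand is a physreg returns 1, since
// its physreg consumer is already scheduled below and the copy should follow
// immediately; a COPY whose source operand is a physreg returns -1 when it
// sits at the region boundary (no predecessors left), deferring it so the
// physreg live range stays short, and 1 otherwise.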
3132 } // end namespace llvm
3133 
3134 void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
3135                                      bool AtTop,
3136                                      const RegPressureTracker &RPTracker,
3137                                      RegPressureTracker &TempTracker) {
3138   Cand.SU = SU;
3139   Cand.AtTop = AtTop;
3140   if (DAG->isTrackingPressure()) {
3141     if (AtTop) {
3142       TempTracker.getMaxDownwardPressureDelta(
3143         Cand.SU->getInstr(),
3144         Cand.RPDelta,
3145         DAG->getRegionCriticalPSets(),
3146         DAG->getRegPressure().MaxSetPressure);
3147     } else {
3148       if (VerifyScheduling) {
3149         TempTracker.getMaxUpwardPressureDelta(
3150           Cand.SU->getInstr(),
3151           &DAG->getPressureDiff(Cand.SU),
3152           Cand.RPDelta,
3153           DAG->getRegionCriticalPSets(),
3154           DAG->getRegPressure().MaxSetPressure);
3155       } else {
3156         RPTracker.getUpwardPressureDelta(
3157           Cand.SU->getInstr(),
3158           DAG->getPressureDiff(Cand.SU),
3159           Cand.RPDelta,
3160           DAG->getRegionCriticalPSets(),
3161           DAG->getRegPressure().MaxSetPressure);
3162       }
3163     }
3164   }
3165   LLVM_DEBUG(if (Cand.RPDelta.Excess.isValid()) dbgs()
3166              << "  Try  SU(" << Cand.SU->NodeNum << ") "
3167              << TRI->getRegPressureSetName(Cand.RPDelta.Excess.getPSet()) << ":"
3168              << Cand.RPDelta.Excess.getUnitInc() << "\n");
3169 }
3170 
3171 /// Apply a set of heuristics to a new candidate. Heuristics are currently
3172 /// hierarchical. This may be more efficient than a graduated cost model because
3173 /// we don't need to evaluate all aspects of the model for each node in the
3174 /// queue. But it's really done to make the heuristics easier to debug and
3175 /// statistically analyze.
3176 ///
3177 /// \param Cand provides the policy and current best candidate.
3178 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
3179 /// \param Zone describes the scheduled zone that we are extending, or nullptr
3180 ///             if Cand is from a different zone than TryCand.
3181 /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
3182 bool GenericScheduler::tryCandidate(SchedCandidate &Cand,
3183                                     SchedCandidate &TryCand,
3184                                     SchedBoundary *Zone) const {
3185   // Initialize the candidate if needed.
3186   if (!Cand.isValid()) {
3187     TryCand.Reason = NodeOrder;
3188     return true;
3189   }
3190 
3191   // Bias physreg defs and copies to their uses and defs, respectively.
3192   if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
3193                  biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
3194     return TryCand.Reason != NoCand;
3195 
3196   // Avoid exceeding the target's limit.
3197   if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
3198                                                Cand.RPDelta.Excess,
3199                                                TryCand, Cand, RegExcess, TRI,
3200                                                DAG->MF))
3201     return TryCand.Reason != NoCand;
3202 
3203   // Avoid increasing the max critical pressure in the scheduled region.
3204   if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
3205                                                Cand.RPDelta.CriticalMax,
3206                                                TryCand, Cand, RegCritical, TRI,
3207                                                DAG->MF))
3208     return TryCand.Reason != NoCand;
3209 
3210   // We only compare a subset of features when comparing nodes between
3211   // the Top and Bottom boundaries. Some properties are simply incomparable; in
3212   // many other instances we should only override the other boundary if something
3213   // is a clearly good pick on one boundary. Skip heuristics that are more
3214   // "tie-breaking" in nature.
3215   bool SameBoundary = Zone != nullptr;
3216   if (SameBoundary) {
3217     // For loops that are acyclic path limited, aggressively schedule for
3218     // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
3219     // heuristics to take precedence.
3220     if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
3221         tryLatency(TryCand, Cand, *Zone))
3222       return TryCand.Reason != NoCand;
3223 
3224     // Prioritize instructions that read unbuffered resources by stall cycles.
3225     if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
3226                 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
3227       return TryCand.Reason != NoCand;
3228   }
3229 
3230   // Keep clustered nodes together to encourage downstream peephole
3231   // optimizations which may reduce resource requirements.
3232   //
3233   // This is a best effort to set things up for a post-RA pass. Optimizations
3234   // like generating loads of multiple registers should ideally be done within
3235   // the scheduler pass by combining the loads during DAG postprocessing.
3236   const SUnit *CandNextClusterSU =
3237     Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
3238   const SUnit *TryCandNextClusterSU =
3239     TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
3240   if (tryGreater(TryCand.SU == TryCandNextClusterSU,
3241                  Cand.SU == CandNextClusterSU,
3242                  TryCand, Cand, Cluster))
3243     return TryCand.Reason != NoCand;
3244 
3245   if (SameBoundary) {
3246     // Weak edges are for clustering and other constraints.
3247     if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
3248                 getWeakLeft(Cand.SU, Cand.AtTop),
3249                 TryCand, Cand, Weak))
3250       return TryCand.Reason != NoCand;
3251   }
3252 
3253   // Avoid increasing the max pressure of the entire region.
3254   if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
3255                                                Cand.RPDelta.CurrentMax,
3256                                                TryCand, Cand, RegMax, TRI,
3257                                                DAG->MF))
3258     return TryCand.Reason != NoCand;
3259 
3260   if (SameBoundary) {
3261     // Avoid critical resource consumption and balance the schedule.
3262     TryCand.initResourceDelta(DAG, SchedModel);
3263     if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
3264                 TryCand, Cand, ResourceReduce))
3265       return TryCand.Reason != NoCand;
3266     if (tryGreater(TryCand.ResDelta.DemandedResources,
3267                    Cand.ResDelta.DemandedResources,
3268                    TryCand, Cand, ResourceDemand))
3269       return TryCand.Reason != NoCand;
3270 
3271     // Avoid serializing long latency dependence chains.
3272     // For acyclic path limited loops, latency was already checked above.
3273     if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
3274         !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
3275       return TryCand.Reason != NoCand;
3276 
3277     // Fall through to original instruction order.
3278     if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
3279         || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
3280       TryCand.Reason = NodeOrder;
3281       return true;
3282     }
3283   }
3284 
3285   return false;
3286 }
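// In rough priority order, the hierarchy above is: physreg bias, register
// excess, critical-set pressure, then (same-boundary only) acyclic-latency
// and stall checks, clustering, weak edges, overall max pressure, resource
// reduce/demand, the latency heuristic, and finally original source order as
// the tie-breaker.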
3287 
3288 /// Pick the best candidate from the queue.
3289 ///
3290 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
3291 /// DAG building. To adjust for the current scheduling location we need to
3292 /// maintain the number of vreg uses remaining to be top-scheduled.
3293 void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
3294                                          const CandPolicy &ZonePolicy,
3295                                          const RegPressureTracker &RPTracker,
3296                                          SchedCandidate &Cand) {
3297   // getMaxPressureDelta temporarily modifies the tracker.
3298   RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
3299 
3300   ReadyQueue &Q = Zone.Available;
3301   for (SUnit *SU : Q) {
3302 
3303     SchedCandidate TryCand(ZonePolicy);
3304     initCandidate(TryCand, SU, Zone.isTop(), RPTracker, TempTracker);
3305     // Pass SchedBoundary only when comparing nodes from the same boundary.
3306     SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
3307     if (tryCandidate(Cand, TryCand, ZoneArg)) {
3308       // Initialize resource delta if needed in case future heuristics query it.
3309       if (TryCand.ResDelta == SchedResourceDelta())
3310         TryCand.initResourceDelta(DAG, SchedModel);
3311       Cand.setBest(TryCand);
3312       LLVM_DEBUG(traceCandidate(Cand));
3313     }
3314   }
3315 }
3316 
3317 /// Pick the best candidate node from either the top or bottom queue.
3318 SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
3319   // Schedule as far as possible in the direction of no choice. This is most
3320   // efficient, but also provides the best heuristics for CriticalPSets.
3321   if (SUnit *SU = Bot.pickOnlyChoice()) {
3322     IsTopNode = false;
3323     tracePick(Only1, false);
3324     return SU;
3325   }
3326   if (SUnit *SU = Top.pickOnlyChoice()) {
3327     IsTopNode = true;
3328     tracePick(Only1, true);
3329     return SU;
3330   }
3331   // Set the bottom-up policy based on the state of the current bottom zone and
3332   // the instructions outside the zone, including the top zone.
3333   CandPolicy BotPolicy;
3334   setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
3335   // Set the top-down policy based on the state of the current top zone and
3336   // the instructions outside the zone, including the bottom zone.
3337   CandPolicy TopPolicy;
3338   setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
3339 
3340   // See if BotCand is still valid (because we previously scheduled from Top).
3341   LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
3342   if (!BotCand.isValid() || BotCand.SU->isScheduled ||
3343       BotCand.Policy != BotPolicy) {
3344     BotCand.reset(CandPolicy());
3345     pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
3346     assert(BotCand.Reason != NoCand && "failed to find the first candidate");
3347   } else {
3348     LLVM_DEBUG(traceCandidate(BotCand));
3349 #ifndef NDEBUG
3350     if (VerifyScheduling) {
3351       SchedCandidate TCand;
3352       TCand.reset(CandPolicy());
3353       pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
3354       assert(TCand.SU == BotCand.SU &&
3355              "Last pick result should correspond to re-picking right now");
3356     }
3357 #endif
3358   }
3359 
3360   // Check if the top Q has a better candidate.
3361   LLVM_DEBUG(dbgs() << "Picking from Top:\n");
3362   if (!TopCand.isValid() || TopCand.SU->isScheduled ||
3363       TopCand.Policy != TopPolicy) {
3364     TopCand.reset(CandPolicy());
3365     pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
3366     assert(TopCand.Reason != NoCand && "failed to find the first candidate");
3367   } else {
3368     LLVM_DEBUG(traceCandidate(TopCand));
3369 #ifndef NDEBUG
3370     if (VerifyScheduling) {
3371       SchedCandidate TCand;
3372       TCand.reset(CandPolicy());
3373       pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
3374       assert(TCand.SU == TopCand.SU &&
3375            "Last pick result should correspond to re-picking right now");
3376     }
3377 #endif
3378   }
3379 
3380   // Pick best from BotCand and TopCand.
3381   assert(BotCand.isValid());
3382   assert(TopCand.isValid());
3383   SchedCandidate Cand = BotCand;
3384   TopCand.Reason = NoCand;
3385   if (tryCandidate(Cand, TopCand, nullptr)) {
3386     Cand.setBest(TopCand);
3387     LLVM_DEBUG(traceCandidate(Cand));
3388   }
3389 
3390   IsTopNode = Cand.AtTop;
3391   tracePick(Cand);
3392   return Cand.SU;
3393 }
3394 
3395 /// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
3396 SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
3397   if (DAG->top() == DAG->bottom()) {
3398     assert(Top.Available.empty() && Top.Pending.empty() &&
3399            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
3400     return nullptr;
3401   }
3402   SUnit *SU;
3403   do {
3404     if (RegionPolicy.OnlyTopDown) {
3405       SU = Top.pickOnlyChoice();
3406       if (!SU) {
3407         CandPolicy NoPolicy;
3408         TopCand.reset(NoPolicy);
3409         pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
3410         assert(TopCand.Reason != NoCand && "failed to find a candidate");
3411         tracePick(TopCand);
3412         SU = TopCand.SU;
3413       }
3414       IsTopNode = true;
3415     } else if (RegionPolicy.OnlyBottomUp) {
3416       SU = Bot.pickOnlyChoice();
3417       if (!SU) {
3418         CandPolicy NoPolicy;
3419         BotCand.reset(NoPolicy);
3420         pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
3421         assert(BotCand.Reason != NoCand && "failed to find a candidate");
3422         tracePick(BotCand);
3423         SU = BotCand.SU;
3424       }
3425       IsTopNode = false;
3426     } else {
3427       SU = pickNodeBidirectional(IsTopNode);
3428     }
3429   } while (SU->isScheduled);
3430 
3431   if (SU->isTopReady())
3432     Top.removeReady(SU);
3433   if (SU->isBottomReady())
3434     Bot.removeReady(SU);
3435 
3436   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
3437                     << *SU->getInstr());
3438   return SU;
3439 }
3440 
3441 void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) {
3442   MachineBasicBlock::iterator InsertPos = SU->getInstr();
3443   if (!isTop)
3444     ++InsertPos;
3445   SmallVectorImpl<SDep> &Deps = isTop ? SU->Preds : SU->Succs;
3446 
3447   // Find already scheduled copies with a single physreg dependence and move
3448   // them just above the scheduled instruction.
3449   for (SDep &Dep : Deps) {
3450     if (Dep.getKind() != SDep::Data ||
3451         !Register::isPhysicalRegister(Dep.getReg()))
3452       continue;
3453     SUnit *DepSU = Dep.getSUnit();
3454     if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1)
3455       continue;
3456     MachineInstr *Copy = DepSU->getInstr();
3457     if (!Copy->isCopy() && !Copy->isMoveImmediate())
3458       continue;
3459     LLVM_DEBUG(dbgs() << "  Rescheduling physreg copy ";
3460                DAG->dumpNode(*Dep.getSUnit()));
3461     DAG->moveInstruction(Copy, InsertPos);
3462   }
3463 }
3464 
3465 /// Update the scheduler's state after scheduling a node. This is the same node
3466 /// that was just returned by pickNode(). However, ScheduleDAGMILive needs to
3467 /// update its state based on the current cycle before MachineSchedStrategy
3468 /// does.
3469 ///
3470 /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
3471 /// them here. See comments in biasPhysReg.
3472 void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
3473   if (IsTopNode) {
3474     SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
3475     Top.bumpNode(SU);
3476     if (SU->hasPhysRegUses)
3477       reschedulePhysReg(SU, true);
3478   } else {
3479     SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
3480     Bot.bumpNode(SU);
3481     if (SU->hasPhysRegDefs)
3482       reschedulePhysReg(SU, false);
3483   }
3484 }
3485 
3486 /// Create the standard converging machine scheduler. This will be used as the
3487 /// default scheduler if the target does not set a default.
3488 ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) {
3489   ScheduleDAGMILive *DAG =
3490       new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C));
3491   // Register DAG post-processors.
3492   //
3493   // FIXME: extend the mutation API to allow earlier mutations to instantiate
3494   // data and pass it to later mutations. Have a single mutation that gathers
3495   // the interesting nodes in one pass.
3496   DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
3497   return DAG;
3498 }
3499 
3500 static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
3501   return createGenericSchedLive(C);
3502 }
3503 
3504 static MachineSchedRegistry
3505 GenericSchedRegistry("converge", "Standard converging scheduler.",
3506                      createConvergingSched);
3507 
3508 //===----------------------------------------------------------------------===//
3509 // PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy.
3510 //===----------------------------------------------------------------------===//
3511 
3512 void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
3513   DAG = Dag;
3514   SchedModel = DAG->getSchedModel();
3515   TRI = DAG->TRI;
3516 
3517   Rem.init(DAG, SchedModel);
3518   Top.init(DAG, SchedModel, &Rem);
3519   BotRoots.clear();
3520 
3521   // Initialize the HazardRecognizers. If itineraries don't exist, are empty,
3522   // or are disabled, then these HazardRecs will be disabled.
3523   const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
3524   if (!Top.HazardRec) {
3525     Top.HazardRec =
3526         DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
3527             Itin, DAG);
3528   }
3529 }
3530 
3531 void PostGenericScheduler::registerRoots() {
3532   Rem.CriticalPath = DAG->ExitSU.getDepth();
3533 
3534   // Some roots may not feed into ExitSU. Check all of them just in case.
3535   for (const SUnit *SU : BotRoots) {
3536     if (SU->getDepth() > Rem.CriticalPath)
3537       Rem.CriticalPath = SU->getDepth();
3538   }
3539   LLVM_DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n');
3540   if (DumpCriticalPathLength) {
3541     errs() << "Critical Path(PGS-RR ): " << Rem.CriticalPath << " \n";
3542   }
3543 }
3544 
3545 /// Apply a set of heuristics to a new candidate for PostRA scheduling.
3546 ///
3547 /// \param Cand provides the policy and current best candidate.
3548 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
3549 /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
3550 bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
3551                                         SchedCandidate &TryCand) {
3552   // Initialize the candidate if needed.
3553   if (!Cand.isValid()) {
3554     TryCand.Reason = NodeOrder;
3555     return true;
3556   }
3557 
3558   // Prioritize instructions that read unbuffered resources by stall cycles.
3559   if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
3560               Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
3561     return TryCand.Reason != NoCand;
3562 
3563   // Keep clustered nodes together.
3564   if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(),
3565                  Cand.SU == DAG->getNextClusterSucc(),
3566                  TryCand, Cand, Cluster))
3567     return TryCand.Reason != NoCand;
3568 
3569   // Avoid critical resource consumption and balance the schedule.
3570   if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
3571               TryCand, Cand, ResourceReduce))
3572     return TryCand.Reason != NoCand;
3573   if (tryGreater(TryCand.ResDelta.DemandedResources,
3574                  Cand.ResDelta.DemandedResources,
3575                  TryCand, Cand, ResourceDemand))
3576     return TryCand.Reason != NoCand;
3577 
3578   // Avoid serializing long latency dependence chains.
3579   if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) {
3580     return TryCand.Reason != NoCand;
3581   }
3582 
3583   // Fall through to original instruction order.
3584   if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
3585     TryCand.Reason = NodeOrder;
3586     return true;
3587   }
3588 
3589   return false;
3590 }
3591 
3592 void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
3593   ReadyQueue &Q = Top.Available;
3594   for (SUnit *SU : Q) {
3595     SchedCandidate TryCand(Cand.Policy);
3596     TryCand.SU = SU;
3597     TryCand.AtTop = true;
3598     TryCand.initResourceDelta(DAG, SchedModel);
3599     if (tryCandidate(Cand, TryCand)) {
3600       Cand.setBest(TryCand);
3601       LLVM_DEBUG(traceCandidate(Cand));
3602     }
3603   }
3604 }
3605 
3606 /// Pick the next node to schedule.
3607 SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
3608   if (DAG->top() == DAG->bottom()) {
3609     assert(Top.Available.empty() && Top.Pending.empty() && "ReadyQ garbage");
3610     return nullptr;
3611   }
3612   SUnit *SU;
3613   do {
3614     SU = Top.pickOnlyChoice();
3615     if (SU) {
3616       tracePick(Only1, true);
3617     } else {
3618       CandPolicy NoPolicy;
3619       SchedCandidate TopCand(NoPolicy);
3620       // Set the top-down policy based on the state of the current top zone and
3621       // the instructions outside the zone, including the bottom zone.
3622       setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr);
3623       pickNodeFromQueue(TopCand);
3624       assert(TopCand.Reason != NoCand && "failed to find a candidate");
3625       tracePick(TopCand);
3626       SU = TopCand.SU;
3627     }
3628   } while (SU->isScheduled);
3629 
3630   IsTopNode = true;
3631   Top.removeReady(SU);
3632 
3633   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
3634                     << *SU->getInstr());
3635   return SU;
3636 }
3637 
3638 /// Called after ScheduleDAGMI has scheduled an instruction and updated
3639 /// scheduled/remaining flags in the DAG nodes.
3640 void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
3641   SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
3642   Top.bumpNode(SU);
3643 }
3644 
3645 ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
3646   return new ScheduleDAGMI(C, std::make_unique<PostGenericScheduler>(C),
3647                            /*RemoveKillFlags=*/true);
3648 }
3649 
3650 //===----------------------------------------------------------------------===//
3651 // ILP Scheduler. Currently for experimental analysis of heuristics.
3652 //===----------------------------------------------------------------------===//
3653 
3654 namespace {
3655 
3656 /// Order nodes by the ILP metric.
3657 struct ILPOrder {
3658   const SchedDFSResult *DFSResult = nullptr;
3659   const BitVector *ScheduledTrees = nullptr;
3660   bool MaximizeILP;
3661 
3662   ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {}
3663 
3664   /// Apply a less-than relation on node priority.
3665   ///
3666   /// (Return true if A comes after B in the Q.)
3667   bool operator()(const SUnit *A, const SUnit *B) const {
3668     unsigned SchedTreeA = DFSResult->getSubtreeID(A);
3669     unsigned SchedTreeB = DFSResult->getSubtreeID(B);
3670     if (SchedTreeA != SchedTreeB) {
3671       // Unscheduled trees have lower priority.
3672       if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB))
3673         return ScheduledTrees->test(SchedTreeB);
3674 
3675       // Trees with shallower connections have lower priority.
3676       if (DFSResult->getSubtreeLevel(SchedTreeA)
3677           != DFSResult->getSubtreeLevel(SchedTreeB)) {
3678         return DFSResult->getSubtreeLevel(SchedTreeA)
3679           < DFSResult->getSubtreeLevel(SchedTreeB);
3680       }
3681     }
3682     if (MaximizeILP)
3683       return DFSResult->getILP(A) < DFSResult->getILP(B);
3684     else
3685       return DFSResult->getILP(A) > DFSResult->getILP(B);
3686   }
3687 };
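// Ordering sketch (invented DFS data): nodes from different subtrees first
// prefer the already-scheduled subtree, then the subtree with the deeper
// connection level, regardless of ILP; two nodes from the same subtree
// compare purely on their ILP values, with MaximizeILP popping the higher
// ILP first and !MaximizeILP the lower.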
3688 
3689 /// Schedule based on the ILP metric.
3690 class ILPScheduler : public MachineSchedStrategy {
3691   ScheduleDAGMILive *DAG = nullptr;
3692   ILPOrder Cmp;
3693 
3694   std::vector<SUnit*> ReadyQ;
3695 
3696 public:
3697   ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {}
3698 
3699   void initialize(ScheduleDAGMI *dag) override {
3700     assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness");
3701     DAG = static_cast<ScheduleDAGMILive*>(dag);
3702     DAG->computeDFSResult();
3703     Cmp.DFSResult = DAG->getDFSResult();
3704     Cmp.ScheduledTrees = &DAG->getScheduledTrees();
3705     ReadyQ.clear();
3706   }
3707 
3708   void registerRoots() override {
3709     // Restore the heap in ReadyQ with the updated DFS results.
3710     std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3711   }
3712 
3713   /// Implement MachineSchedStrategy interface.
3714   /// -----------------------------------------
3715 
3716   /// Callback to select the highest priority node from the ready Q.
3717   SUnit *pickNode(bool &IsTopNode) override {
3718     if (ReadyQ.empty()) return nullptr;
3719     std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3720     SUnit *SU = ReadyQ.back();
3721     ReadyQ.pop_back();
3722     IsTopNode = false;
3723     LLVM_DEBUG(dbgs() << "Pick node "
3724                       << "SU(" << SU->NodeNum << ") "
3725                       << " ILP: " << DAG->getDFSResult()->getILP(SU)
3726                       << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU)
3727                       << " @"
3728                       << DAG->getDFSResult()->getSubtreeLevel(
3729                              DAG->getDFSResult()->getSubtreeID(SU))
3730                       << '\n'
3731                       << "Scheduling " << *SU->getInstr());
3732     return SU;
3733   }
3734 
3735   /// Scheduler callback to notify that a new subtree is scheduled.
3736   void scheduleTree(unsigned SubtreeID) override {
3737     std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3738   }
3739 
3740   /// Callback after a node is scheduled. Mark a newly scheduled tree, notify
3741   /// DFSResults, and resort the priority Q.
3742   void schedNode(SUnit *SU, bool IsTopNode) override {
3743     assert(!IsTopNode && "SchedDFSResult needs bottom-up");
3744   }
3745 
3746   void releaseTopNode(SUnit *) override { /*only called for top roots*/ }
3747 
3748   void releaseBottomNode(SUnit *SU) override {
3749     ReadyQ.push_back(SU);
3750     std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
3751   }
3752 };
3753 
3754 } // end anonymous namespace
3755 
3756 static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
3757   return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(true));
3758 }
3759 static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
3760   return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(false));
3761 }
3762 
3763 static MachineSchedRegistry ILPMaxRegistry(
3764   "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
3765 static MachineSchedRegistry ILPMinRegistry(
3766   "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler);
3767 
3768 //===----------------------------------------------------------------------===//
3769 // Machine Instruction Shuffler for Correctness Testing
3770 //===----------------------------------------------------------------------===//
3771 
3772 #ifndef NDEBUG
3773 namespace {
3774 
3775 /// Apply a less-than relation on the node order, which corresponds to the
3776 /// instruction order prior to scheduling. IsReverse implements greater-than.
3777 template<bool IsReverse>
3778 struct SUnitOrder {
3779   bool operator()(SUnit *A, SUnit *B) const {
3780     if (IsReverse)
3781       return A->NodeNum > B->NodeNum;
3782     else
3783       return A->NodeNum < B->NodeNum;
3784   }
3785 };
3786 
3787 /// Reorder instructions as much as possible.
3788 class InstructionShuffler : public MachineSchedStrategy {
3789   bool IsAlternating;
3790   bool IsTopDown;
3791 
3792   // Using a less-than relation (SUnitOrder<false>) for the TopQ priority
3793   // gives nodes with a higher number higher priority, causing the latest
3794   // instructions to be scheduled first.
3795   PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>>
3796     TopQ;
3797 
3798   // When scheduling bottom-up, use greater-than as the queue priority.
3799   PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>>
3800     BottomQ;
3801 
3802 public:
3803   InstructionShuffler(bool alternate, bool topdown)
3804     : IsAlternating(alternate), IsTopDown(topdown) {}
3805 
3806   void initialize(ScheduleDAGMI*) override {
3807     TopQ.clear();
3808     BottomQ.clear();
3809   }
3810 
3811   /// Implement MachineSchedStrategy interface.
3812   /// -----------------------------------------
3813 
3814   SUnit *pickNode(bool &IsTopNode) override {
3815     SUnit *SU;
3816     if (IsTopDown) {
3817       do {
3818         if (TopQ.empty()) return nullptr;
3819         SU = TopQ.top();
3820         TopQ.pop();
3821       } while (SU->isScheduled);
3822       IsTopNode = true;
3823     } else {
3824       do {
3825         if (BottomQ.empty()) return nullptr;
3826         SU = BottomQ.top();
3827         BottomQ.pop();
3828       } while (SU->isScheduled);
3829       IsTopNode = false;
3830     }
3831     if (IsAlternating)
3832       IsTopDown = !IsTopDown;
3833     return SU;
3834   }
3835 
3836   void schedNode(SUnit *SU, bool IsTopNode) override {}
3837 
3838   void releaseTopNode(SUnit *SU) override {
3839     TopQ.push(SU);
3840   }
3841   void releaseBottomNode(SUnit *SU) override {
3842     BottomQ.push(SU);
3843   }
3844 };
3845 
3846 } // end anonymous namespace
3847 
3848 static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {
3849   bool Alternate = !ForceTopDown && !ForceBottomUp;
3850   bool TopDown = !ForceBottomUp;
3851   assert((TopDown || !ForceTopDown) &&
3852          "-misched-topdown incompatible with -misched-bottomup");
3853   return new ScheduleDAGMILive(
3854       C, std::make_unique<InstructionShuffler>(Alternate, TopDown));
3855 }
3856 
3857 static MachineSchedRegistry ShufflerRegistry(
3858   "shuffle", "Shuffle machine instructions alternating directions",
3859   createInstructionShuffler);
3860 #endif // !NDEBUG
3861 
3862 //===----------------------------------------------------------------------===//
3863 // GraphWriter support for ScheduleDAGMILive.
3864 //===----------------------------------------------------------------------===//
3865 
3866 #ifndef NDEBUG
3867 namespace llvm {
3868 
3869 template<> struct GraphTraits<
3870   ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
3871 
3872 template<>
3873 struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
3874   DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
3875 
3876   static std::string getGraphName(const ScheduleDAG *G) {
3877     return std::string(G->MF.getName());
3878   }
3879 
3880   static bool renderGraphFromBottomUp() {
3881     return true;
3882   }
3883 
3884   static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
3885     if (ViewMISchedCutoff == 0)
3886       return false;
3887     return (Node->Preds.size() > ViewMISchedCutoff
3888          || Node->Succs.size() > ViewMISchedCutoff);
3889   }
3890 
3891   /// If you want to override the dot attributes printed for a particular
3892   /// edge, override this method.
3893   static std::string getEdgeAttributes(const SUnit *Node,
3894                                        SUnitIterator EI,
3895                                        const ScheduleDAG *Graph) {
3896     if (EI.isArtificialDep())
3897       return "color=cyan,style=dashed";
3898     if (EI.isCtrlDep())
3899       return "color=blue,style=dashed";
3900     return "";
3901   }
3902 
3903   static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
3904     std::string Str;
3905     raw_string_ostream SS(Str);
3906     const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
3907     const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
3908       static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
3909     SS << "SU:" << SU->NodeNum;
3910     if (DFS)
3911       SS << " I:" << DFS->getNumInstrs(SU);
3912     return SS.str();
3913   }
3914 
3915   static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
3916     return G->getGraphNodeLabel(SU);
3917   }
3918 
3919   static std::string getNodeAttributes(const SUnit *N, const ScheduleDAG *G) {
3920     std::string Str("shape=Mrecord");
3921     const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
3922     const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
3923       static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
3924     if (DFS) {
3925       Str += ",style=filled,fillcolor=\"#";
3926       Str += DOT::getColorString(DFS->getSubtreeID(N));
3927       Str += '"';
3928     }
3929     return Str;
3930   }
3931 };
3932 
3933 } // end namespace llvm
3934 #endif // NDEBUG
3935 
3936 /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
3937 /// rendered using 'dot'.
3938 void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) {
3939 #ifndef NDEBUG
3940   ViewGraph(this, Name, false, Title);
3941 #else
3942   errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on "
3943          << "systems with Graphviz or gv!\n";
3944 #endif  // NDEBUG
3945 }
3946 
3947 /// Out-of-line implementation with no arguments is handy for gdb.
3948 void ScheduleDAGMI::viewGraph() {
3949   viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName());
3950 }
3951