xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (revision e1c4c8dd8d2d10b6104f06856a77bd5b4813a801)
1 //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This contains a MachineSchedStrategy implementation for maximizing wave
11 /// occupancy on GCN hardware.
12 ///
13 /// This pass will apply multiple scheduling stages to the same function.
14 /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15 /// entry point for the scheduling of those regions is
16 /// GCNScheduleDAGMILive::runSchedStages.
17 
18 /// Generally, the reason for having multiple scheduling stages is to account
19 /// for the kernel-wide effect of register usage on occupancy.  Usually, only a
20 /// few scheduling regions will have register pressure high enough to limit
21 /// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22 /// other regions.
23 ///
24 //===----------------------------------------------------------------------===//
25 
26 #include "GCNSchedStrategy.h"
27 #include "AMDGPUIGroupLP.h"
28 #include "SIMachineFunctionInfo.h"
29 #include "llvm/CodeGen/RegisterClassInfo.h"
30 
31 #define DEBUG_TYPE "machine-scheduler"
32 
33 using namespace llvm;
34 
35 static cl::opt<bool> DisableUnclusterHighRP(
36     "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
37     cl::desc("Disable unclustered high register pressure "
38              "reduction scheduling stage."),
39     cl::init(false));
40 
41 static cl::opt<bool> DisableClusteredLowOccupancy(
42     "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
43     cl::desc("Disable clustered low occupancy "
44              "rescheduling for ILP scheduling stage."),
45     cl::init(false));
46 
47 static cl::opt<unsigned> ScheduleMetricBias(
48     "amdgpu-schedule-metric-bias", cl::Hidden,
49     cl::desc(
50         "Sets the bias which adds weight to occupancy vs latency. Set it to "
51         "100 to chase the occupancy only."),
52     cl::init(10));
53 
54 static cl::opt<bool>
55     RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
56                cl::desc("Relax occupancy targets for kernels which are memory "
57                         "bound (amdgpu-membound-threshold), or "
58                         "Wave Limited (amdgpu-limit-wave-threshold)."),
59                cl::init(false));
60 
61 const unsigned ScheduleMetrics::ScaleFactor = 100;
62 
63 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
64     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
65       HasHighPressure(false) {}
66 
67 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
68   GenericScheduler::initialize(DAG);
69 
70   MF = &DAG->MF;
71 
72   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
73 
74   SGPRExcessLimit =
75       Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
76   VGPRExcessLimit =
77       Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
78 
79   SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
80   // Set the initial TargetOccupnacy to the maximum occupancy that we can
81   // achieve for this function. This effectively sets a lower bound on the
82   // 'Critical' register limits in the scheduler.
83   // Allow for lower occupancy targets if kernel is wave limited or memory
84   // bound, and using the relaxed occupancy feature.
85   TargetOccupancy =
86       RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
87   SGPRCriticalLimit =
88       std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
89 
90   if (!KnownExcessRP) {
91     VGPRCriticalLimit =
92         std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
93   } else {
94     // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
95     // returns a reasonably small number for targets with lots of VGPRs, such
96     // as GFX10 and GFX11.
97     LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
98                          "VGPRCriticalLimit calculation method.\n");
99 
100     unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
101     unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
102     unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
103     VGPRBudget = std::max(VGPRBudget, Granule);
104     VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
105   }
106 
107   // Subtract error margin and bias from register limits and avoid overflow.
108   SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
109   VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
110   SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
111   VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);
112 
113   LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
114                     << ", VGPRExcessLimit = " << VGPRExcessLimit
115                     << ", SGPRCriticalLimit = " << SGPRCriticalLimit
116                     << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
117 }
118 
119 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
120                                      bool AtTop,
121                                      const RegPressureTracker &RPTracker,
122                                      const SIRegisterInfo *SRI,
123                                      unsigned SGPRPressure,
124                                      unsigned VGPRPressure) {
125   Cand.SU = SU;
126   Cand.AtTop = AtTop;
127 
128   if (!DAG->isTrackingPressure())
129     return;
130 
131   // getDownwardPressure() and getUpwardPressure() make temporary changes to
132   // the tracker, so we need to pass those function a non-const copy.
133   RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
134 
135   Pressure.clear();
136   MaxPressure.clear();
137 
138   if (AtTop)
139     TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
140   else {
141     // FIXME: I think for bottom up scheduling, the register pressure is cached
142     // and can be retrieved by DAG->getPressureDif(SU).
143     TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
144   }
145 
146   unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
147   unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
148 
149   // If two instructions increase the pressure of different register sets
150   // by the same amount, the generic scheduler will prefer to schedule the
151   // instruction that increases the set with the least amount of registers,
152   // which in our case would be SGPRs.  This is rarely what we want, so
153   // when we report excess/critical register pressure, we do it either
154   // only for VGPRs or only for SGPRs.
155 
156   // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
157   const unsigned MaxVGPRPressureInc = 16;
158   bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
159   bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
160 
161 
162   // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
163   // to increase the likelihood we don't go over the limits.  We should improve
164   // the analysis to look through dependencies to find the path with the least
165   // register pressure.
166 
167   // We only need to update the RPDelta for instructions that increase register
168   // pressure. Instructions that decrease or keep reg pressure the same will be
169   // marked as RegExcess in tryCandidate() when they are compared with
170   // instructions that increase the register pressure.
171   if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
172     HasHighPressure = true;
173     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
174     Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
175   }
176 
177   if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
178     HasHighPressure = true;
179     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
180     Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
181   }
182 
183   // Register pressure is considered 'CRITICAL' if it is approaching a value
184   // that would reduce the wave occupancy for the execution unit.  When
185   // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
186   // has the same cost, so we don't need to prefer one over the other.
187 
188   int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
189   int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
190 
191   if (SGPRDelta >= 0 || VGPRDelta >= 0) {
192     HasHighPressure = true;
193     if (SGPRDelta > VGPRDelta) {
194       Cand.RPDelta.CriticalMax =
195         PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
196       Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
197     } else {
198       Cand.RPDelta.CriticalMax =
199         PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
200       Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
201     }
202   }
203 }
204 
205 // This function is mostly cut and pasted from
206 // GenericScheduler::pickNodeFromQueue()
207 void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
208                                          const CandPolicy &ZonePolicy,
209                                          const RegPressureTracker &RPTracker,
210                                          SchedCandidate &Cand) {
211   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
212   ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
213   unsigned SGPRPressure = 0;
214   unsigned VGPRPressure = 0;
215   if (DAG->isTrackingPressure()) {
216     SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
217     VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
218   }
219   ReadyQueue &Q = Zone.Available;
220   for (SUnit *SU : Q) {
221 
222     SchedCandidate TryCand(ZonePolicy);
223     initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
224                   SGPRPressure, VGPRPressure);
225     // Pass SchedBoundary only when comparing nodes from the same boundary.
226     SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
227     tryCandidate(Cand, TryCand, ZoneArg);
228     if (TryCand.Reason != NoCand) {
229       // Initialize resource delta if needed in case future heuristics query it.
230       if (TryCand.ResDelta == SchedResourceDelta())
231         TryCand.initResourceDelta(Zone.DAG, SchedModel);
232       Cand.setBest(TryCand);
233       LLVM_DEBUG(traceCandidate(Cand));
234     }
235   }
236 }
237 
238 // This function is mostly cut and pasted from
239 // GenericScheduler::pickNodeBidirectional()
240 SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
241   // Schedule as far as possible in the direction of no choice. This is most
242   // efficient, but also provides the best heuristics for CriticalPSets.
243   if (SUnit *SU = Bot.pickOnlyChoice()) {
244     IsTopNode = false;
245     return SU;
246   }
247   if (SUnit *SU = Top.pickOnlyChoice()) {
248     IsTopNode = true;
249     return SU;
250   }
251   // Set the bottom-up policy based on the state of the current bottom zone and
252   // the instructions outside the zone, including the top zone.
253   CandPolicy BotPolicy;
254   setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
255   // Set the top-down policy based on the state of the current top zone and
256   // the instructions outside the zone, including the bottom zone.
257   CandPolicy TopPolicy;
258   setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
259 
260   // See if BotCand is still valid (because we previously scheduled from Top).
261   LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
262   if (!BotCand.isValid() || BotCand.SU->isScheduled ||
263       BotCand.Policy != BotPolicy) {
264     BotCand.reset(CandPolicy());
265     pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
266     assert(BotCand.Reason != NoCand && "failed to find the first candidate");
267   } else {
268     LLVM_DEBUG(traceCandidate(BotCand));
269 #ifndef NDEBUG
270     if (VerifyScheduling) {
271       SchedCandidate TCand;
272       TCand.reset(CandPolicy());
273       pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
274       assert(TCand.SU == BotCand.SU &&
275              "Last pick result should correspond to re-picking right now");
276     }
277 #endif
278   }
279 
280   // Check if the top Q has a better candidate.
281   LLVM_DEBUG(dbgs() << "Picking from Top:\n");
282   if (!TopCand.isValid() || TopCand.SU->isScheduled ||
283       TopCand.Policy != TopPolicy) {
284     TopCand.reset(CandPolicy());
285     pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
286     assert(TopCand.Reason != NoCand && "failed to find the first candidate");
287   } else {
288     LLVM_DEBUG(traceCandidate(TopCand));
289 #ifndef NDEBUG
290     if (VerifyScheduling) {
291       SchedCandidate TCand;
292       TCand.reset(CandPolicy());
293       pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
294       assert(TCand.SU == TopCand.SU &&
295            "Last pick result should correspond to re-picking right now");
296     }
297 #endif
298   }
299 
300   // Pick best from BotCand and TopCand.
301   LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
302              dbgs() << "Bot Cand: "; traceCandidate(BotCand););
303   SchedCandidate Cand = BotCand;
304   TopCand.Reason = NoCand;
305   tryCandidate(Cand, TopCand, nullptr);
306   if (TopCand.Reason != NoCand) {
307     Cand.setBest(TopCand);
308   }
309   LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
310 
311   IsTopNode = Cand.AtTop;
312   return Cand.SU;
313 }
314 
315 // This function is mostly cut and pasted from
316 // GenericScheduler::pickNode()
317 SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
318   if (DAG->top() == DAG->bottom()) {
319     assert(Top.Available.empty() && Top.Pending.empty() &&
320            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
321     return nullptr;
322   }
323   SUnit *SU;
324   do {
325     if (RegionPolicy.OnlyTopDown) {
326       SU = Top.pickOnlyChoice();
327       if (!SU) {
328         CandPolicy NoPolicy;
329         TopCand.reset(NoPolicy);
330         pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
331         assert(TopCand.Reason != NoCand && "failed to find a candidate");
332         SU = TopCand.SU;
333       }
334       IsTopNode = true;
335     } else if (RegionPolicy.OnlyBottomUp) {
336       SU = Bot.pickOnlyChoice();
337       if (!SU) {
338         CandPolicy NoPolicy;
339         BotCand.reset(NoPolicy);
340         pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
341         assert(BotCand.Reason != NoCand && "failed to find a candidate");
342         SU = BotCand.SU;
343       }
344       IsTopNode = false;
345     } else {
346       SU = pickNodeBidirectional(IsTopNode);
347     }
348   } while (SU->isScheduled);
349 
350   if (SU->isTopReady())
351     Top.removeReady(SU);
352   if (SU->isBottomReady())
353     Bot.removeReady(SU);
354 
355   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
356                     << *SU->getInstr());
357   return SU;
358 }
359 
360 GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
361   assert(CurrentStage && CurrentStage != SchedStages.end());
362   return *CurrentStage;
363 }
364 
365 bool GCNSchedStrategy::advanceStage() {
366   assert(CurrentStage != SchedStages.end());
367   if (!CurrentStage)
368     CurrentStage = SchedStages.begin();
369   else
370     CurrentStage++;
371 
372   return CurrentStage != SchedStages.end();
373 }
374 
375 bool GCNSchedStrategy::hasNextStage() const {
376   assert(CurrentStage);
377   return std::next(CurrentStage) != SchedStages.end();
378 }
379 
380 GCNSchedStageID GCNSchedStrategy::getNextStage() const {
381   assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
382   return *std::next(CurrentStage);
383 }
384 
385 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
386     const MachineSchedContext *C)
387     : GCNSchedStrategy(C) {
388   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
389   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
390   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
391   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
392 }
393 
394 GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
395     : GCNSchedStrategy(C) {
396   SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
397 }
398 
399 bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
400                                           SchedCandidate &TryCand,
401                                           SchedBoundary *Zone) const {
402   // Initialize the candidate if needed.
403   if (!Cand.isValid()) {
404     TryCand.Reason = NodeOrder;
405     return true;
406   }
407 
408   // Avoid spilling by exceeding the register limit.
409   if (DAG->isTrackingPressure() &&
410       tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
411                   RegExcess, TRI, DAG->MF))
412     return TryCand.Reason != NoCand;
413 
414   // Bias PhysReg Defs and copies to their uses and defined respectively.
415   if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
416                  biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
417     return TryCand.Reason != NoCand;
418 
419   bool SameBoundary = Zone != nullptr;
420   if (SameBoundary) {
421     // Prioritize instructions that read unbuffered resources by stall cycles.
422     if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
423                 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
424       return TryCand.Reason != NoCand;
425 
426     // Avoid critical resource consumption and balance the schedule.
427     TryCand.initResourceDelta(DAG, SchedModel);
428     if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
429                 TryCand, Cand, ResourceReduce))
430       return TryCand.Reason != NoCand;
431     if (tryGreater(TryCand.ResDelta.DemandedResources,
432                    Cand.ResDelta.DemandedResources, TryCand, Cand,
433                    ResourceDemand))
434       return TryCand.Reason != NoCand;
435 
436     // Unconditionally try to reduce latency.
437     if (tryLatency(TryCand, Cand, *Zone))
438       return TryCand.Reason != NoCand;
439 
440     // Weak edges are for clustering and other constraints.
441     if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
442                 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
443       return TryCand.Reason != NoCand;
444   }
445 
446   // Keep clustered nodes together to encourage downstream peephole
447   // optimizations which may reduce resource requirements.
448   //
449   // This is a best effort to set things up for a post-RA pass. Optimizations
450   // like generating loads of multiple registers should ideally be done within
451   // the scheduler pass by combining the loads during DAG postprocessing.
452   const SUnit *CandNextClusterSU =
453       Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
454   const SUnit *TryCandNextClusterSU =
455       TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
456   if (tryGreater(TryCand.SU == TryCandNextClusterSU,
457                  Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
458     return TryCand.Reason != NoCand;
459 
460   // Avoid increasing the max critical pressure in the scheduled region.
461   if (DAG->isTrackingPressure() &&
462       tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
463                   TryCand, Cand, RegCritical, TRI, DAG->MF))
464     return TryCand.Reason != NoCand;
465 
466   // Avoid increasing the max pressure of the entire region.
467   if (DAG->isTrackingPressure() &&
468       tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
469                   Cand, RegMax, TRI, DAG->MF))
470     return TryCand.Reason != NoCand;
471 
472   if (SameBoundary) {
473     // Fall through to original instruction order.
474     if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
475         (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
476       TryCand.Reason = NodeOrder;
477       return true;
478     }
479   }
480   return false;
481 }
482 
483 GCNScheduleDAGMILive::GCNScheduleDAGMILive(
484     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
485     : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
486       MFI(*MF.getInfo<SIMachineFunctionInfo>()),
487       StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {
488 
489   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
490   if (RelaxedOcc) {
491     MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
492     if (MinOccupancy != StartingOccupancy)
493       LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
494                         << ".\n");
495   }
496 }
497 
498 std::unique_ptr<GCNSchedStage>
499 GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
500   switch (SchedStageID) {
501   case GCNSchedStageID::OccInitialSchedule:
502     return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
503   case GCNSchedStageID::UnclusteredHighRPReschedule:
504     return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
505   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
506     return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
507   case GCNSchedStageID::PreRARematerialize:
508     return std::make_unique<PreRARematStage>(SchedStageID, *this);
509   case GCNSchedStageID::ILPInitialSchedule:
510     return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
511   }
512 
513   llvm_unreachable("Unknown SchedStageID.");
514 }
515 
516 void GCNScheduleDAGMILive::schedule() {
517   // Collect all scheduling regions. The actual scheduling is performed in
518   // GCNScheduleDAGMILive::finalizeSchedule.
519   Regions.push_back(std::pair(RegionBegin, RegionEnd));
520 }
521 
522 GCNRegPressure
523 GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
524   GCNDownwardRPTracker RPTracker(*LIS);
525   RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
526   return RPTracker.moveMaxPressure();
527 }
528 
529 void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
530                                                 const MachineBasicBlock *MBB) {
531   GCNDownwardRPTracker RPTracker(*LIS);
532 
533   // If the block has the only successor then live-ins of that successor are
534   // live-outs of the current block. We can reuse calculated live set if the
535   // successor will be sent to scheduling past current block.
536 
537   // However, due to the bug in LiveInterval analysis it may happen that two
538   // predecessors of the same successor block have different lane bitmasks for
539   // a live-out register. Workaround that by sticking to one-to-one relationship
540   // i.e. one predecessor with one successor block.
541   const MachineBasicBlock *OnlySucc = nullptr;
542   if (MBB->succ_size() == 1) {
543     auto *Candidate = *MBB->succ_begin();
544     if (!Candidate->empty() && Candidate->pred_size() == 1) {
545       SlotIndexes *Ind = LIS->getSlotIndexes();
546       if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
547         OnlySucc = Candidate;
548     }
549   }
550 
551   // Scheduler sends regions from the end of the block upwards.
552   size_t CurRegion = RegionIdx;
553   for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
554     if (Regions[CurRegion].first->getParent() != MBB)
555       break;
556   --CurRegion;
557 
558   auto I = MBB->begin();
559   auto LiveInIt = MBBLiveIns.find(MBB);
560   auto &Rgn = Regions[CurRegion];
561   auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
562   if (LiveInIt != MBBLiveIns.end()) {
563     auto LiveIn = std::move(LiveInIt->second);
564     RPTracker.reset(*MBB->begin(), &LiveIn);
565     MBBLiveIns.erase(LiveInIt);
566   } else {
567     I = Rgn.first;
568     auto LRS = BBLiveInMap.lookup(NonDbgMI);
569 #ifdef EXPENSIVE_CHECKS
570     assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
571 #endif
572     RPTracker.reset(*I, &LRS);
573   }
574 
575   for (;;) {
576     I = RPTracker.getNext();
577 
578     if (Regions[CurRegion].first == I || NonDbgMI == I) {
579       LiveIns[CurRegion] = RPTracker.getLiveRegs();
580       RPTracker.clearMaxPressure();
581     }
582 
583     if (Regions[CurRegion].second == I) {
584       Pressure[CurRegion] = RPTracker.moveMaxPressure();
585       if (CurRegion-- == RegionIdx)
586         break;
587     }
588     RPTracker.advanceToNext();
589     RPTracker.advanceBeforeNext();
590   }
591 
592   if (OnlySucc) {
593     if (I != MBB->end()) {
594       RPTracker.advanceToNext();
595       RPTracker.advance(MBB->end());
596     }
597     RPTracker.advanceBeforeNext();
598     MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
599   }
600 }
601 
602 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
603 GCNScheduleDAGMILive::getBBLiveInMap() const {
604   assert(!Regions.empty());
605   std::vector<MachineInstr *> BBStarters;
606   BBStarters.reserve(Regions.size());
607   auto I = Regions.rbegin(), E = Regions.rend();
608   auto *BB = I->first->getParent();
609   do {
610     auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
611     BBStarters.push_back(MI);
612     do {
613       ++I;
614     } while (I != E && I->first->getParent() == BB);
615   } while (I != E);
616   return getLiveRegMap(BBStarters, false /*After*/, *LIS);
617 }
618 
619 void GCNScheduleDAGMILive::finalizeSchedule() {
620   // Start actual scheduling here. This function is called by the base
621   // MachineScheduler after all regions have been recorded by
622   // GCNScheduleDAGMILive::schedule().
623   LiveIns.resize(Regions.size());
624   Pressure.resize(Regions.size());
625   RescheduleRegions.resize(Regions.size());
626   RegionsWithHighRP.resize(Regions.size());
627   RegionsWithExcessRP.resize(Regions.size());
628   RegionsWithMinOcc.resize(Regions.size());
629   RegionsWithIGLPInstrs.resize(Regions.size());
630   RescheduleRegions.set();
631   RegionsWithHighRP.reset();
632   RegionsWithExcessRP.reset();
633   RegionsWithMinOcc.reset();
634   RegionsWithIGLPInstrs.reset();
635 
636   runSchedStages();
637 }
638 
639 void GCNScheduleDAGMILive::runSchedStages() {
640   LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
641 
642   if (!Regions.empty())
643     BBLiveInMap = getBBLiveInMap();
644 
645   GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
646   while (S.advanceStage()) {
647     auto Stage = createSchedStage(S.getCurrentStage());
648     if (!Stage->initGCNSchedStage())
649       continue;
650 
651     for (auto Region : Regions) {
652       RegionBegin = Region.first;
653       RegionEnd = Region.second;
654       // Setup for scheduling the region and check whether it should be skipped.
655       if (!Stage->initGCNRegion()) {
656         Stage->advanceRegion();
657         exitRegion();
658         continue;
659       }
660 
661       ScheduleDAGMILive::schedule();
662       Stage->finalizeGCNRegion();
663     }
664 
665     Stage->finalizeGCNSchedStage();
666   }
667 }
668 
669 #ifndef NDEBUG
670 raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
671   switch (StageID) {
672   case GCNSchedStageID::OccInitialSchedule:
673     OS << "Max Occupancy Initial Schedule";
674     break;
675   case GCNSchedStageID::UnclusteredHighRPReschedule:
676     OS << "Unclustered High Register Pressure Reschedule";
677     break;
678   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
679     OS << "Clustered Low Occupancy Reschedule";
680     break;
681   case GCNSchedStageID::PreRARematerialize:
682     OS << "Pre-RA Rematerialize";
683     break;
684   case GCNSchedStageID::ILPInitialSchedule:
685     OS << "Max ILP Initial Schedule";
686     break;
687   }
688 
689   return OS;
690 }
691 #endif
692 
693 GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
694     : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
695       MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
696 
697 bool GCNSchedStage::initGCNSchedStage() {
698   if (!DAG.LIS)
699     return false;
700 
701   LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
702   return true;
703 }
704 
705 bool UnclusteredHighRPStage::initGCNSchedStage() {
706   if (DisableUnclusterHighRP)
707     return false;
708 
709   if (!GCNSchedStage::initGCNSchedStage())
710     return false;
711 
712   if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
713     return false;
714 
715   SavedMutations.swap(DAG.Mutations);
716   DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
717 
718   InitialOccupancy = DAG.MinOccupancy;
719   // Aggressivly try to reduce register pressure in the unclustered high RP
720   // stage. Temporarily increase occupancy target in the region.
721   S.SGPRLimitBias = S.HighRPSGPRBias;
722   S.VGPRLimitBias = S.HighRPVGPRBias;
723   if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
724     MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
725 
726   LLVM_DEBUG(
727       dbgs()
728       << "Retrying function scheduling without clustering. "
729          "Aggressivly try to reduce register pressure to achieve occupancy "
730       << DAG.MinOccupancy << ".\n");
731 
732   return true;
733 }
734 
735 bool ClusteredLowOccStage::initGCNSchedStage() {
736   if (DisableClusteredLowOccupancy)
737     return false;
738 
739   if (!GCNSchedStage::initGCNSchedStage())
740     return false;
741 
742   // Don't bother trying to improve ILP in lower RP regions if occupancy has not
743   // been dropped. All regions will have already been scheduled with the ideal
744   // occupancy targets.
745   if (DAG.StartingOccupancy <= DAG.MinOccupancy)
746     return false;
747 
748   LLVM_DEBUG(
749       dbgs() << "Retrying function scheduling with lowest recorded occupancy "
750              << DAG.MinOccupancy << ".\n");
751   return true;
752 }
753 
754 bool PreRARematStage::initGCNSchedStage() {
755   if (!GCNSchedStage::initGCNSchedStage())
756     return false;
757 
758   if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
759     return false;
760 
761   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
762   // Check maximum occupancy
763   if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
764       DAG.MinOccupancy)
765     return false;
766 
767   // FIXME: This pass will invalidate cached MBBLiveIns for regions
768   // inbetween the defs and region we sinked the def to. Cached pressure
769   // for regions where a def is sinked from will also be invalidated. Will
770   // need to be fixed if there is another pass after this pass.
771   assert(!S.hasNextStage());
772 
773   collectRematerializableInstructions();
774   if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
775     return false;
776 
777   LLVM_DEBUG(
778       dbgs() << "Retrying function scheduling with improved occupancy of "
779              << DAG.MinOccupancy << " from rematerializing\n");
780   return true;
781 }
782 
783 void GCNSchedStage::finalizeGCNSchedStage() {
784   DAG.finishBlock();
785   LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
786 }
787 
788 void UnclusteredHighRPStage::finalizeGCNSchedStage() {
789   SavedMutations.swap(DAG.Mutations);
790   S.SGPRLimitBias = S.VGPRLimitBias = 0;
791   if (DAG.MinOccupancy > InitialOccupancy) {
792     for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
793       DAG.RegionsWithMinOcc[IDX] =
794           DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
795 
796     LLVM_DEBUG(dbgs() << StageID
797                       << " stage successfully increased occupancy to "
798                       << DAG.MinOccupancy << '\n');
799   }
800 
801   GCNSchedStage::finalizeGCNSchedStage();
802 }
803 
804 bool GCNSchedStage::initGCNRegion() {
805   // Check whether this new region is also a new block.
806   if (DAG.RegionBegin->getParent() != CurrentMBB)
807     setupNewBlock();
808 
809   unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
810   DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
811 
812   // Skip empty scheduling regions (0 or 1 schedulable instructions).
813   if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
814     return false;
815 
816   LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
817   LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
818                     << " " << CurrentMBB->getName()
819                     << "\n  From: " << *DAG.begin() << "    To: ";
820              if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
821              else dbgs() << "End";
822              dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
823 
824   // Save original instruction order before scheduling for possible revert.
825   Unsched.clear();
826   Unsched.reserve(DAG.NumRegionInstrs);
827   if (StageID == GCNSchedStageID::OccInitialSchedule ||
828       StageID == GCNSchedStageID::ILPInitialSchedule) {
829     for (auto &I : DAG) {
830       Unsched.push_back(&I);
831       if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
832           I.getOpcode() == AMDGPU::IGLP_OPT)
833         DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
834     }
835   } else {
836     for (auto &I : DAG)
837       Unsched.push_back(&I);
838   }
839 
840   PressureBefore = DAG.Pressure[RegionIdx];
841 
842   LLVM_DEBUG(
843       dbgs() << "Pressure before scheduling:\nRegion live-ins:"
844              << print(DAG.LiveIns[RegionIdx], DAG.MRI)
845              << "Region live-in pressure:  "
846              << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
847              << "Region register pressure: " << print(PressureBefore));
848 
849   S.HasHighPressure = false;
850   S.KnownExcessRP = isRegionWithExcessRP();
851 
852   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
853       StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
854     SavedMutations.clear();
855     SavedMutations.swap(DAG.Mutations);
856     bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
857                           StageID == GCNSchedStageID::ILPInitialSchedule;
858     DAG.addMutation(createIGroupLPDAGMutation(/*IsReentry=*/!IsInitialStage));
859   }
860 
861   return true;
862 }
863 
864 bool UnclusteredHighRPStage::initGCNRegion() {
865   // Only reschedule regions with the minimum occupancy or regions that may have
866   // spilling (excess register pressure).
867   if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
868        DAG.MinOccupancy <= InitialOccupancy) &&
869       !DAG.RegionsWithExcessRP[RegionIdx])
870     return false;
871 
872   return GCNSchedStage::initGCNRegion();
873 }
874 
875 bool ClusteredLowOccStage::initGCNRegion() {
876   // We may need to reschedule this region if it wasn't rescheduled in the last
877   // stage, or if we found it was testing critical register pressure limits in
878   // the unclustered reschedule stage. The later is because we may not have been
879   // able to raise the min occupancy in the previous stage so the region may be
880   // overly constrained even if it was already rescheduled.
881   if (!DAG.RegionsWithHighRP[RegionIdx])
882     return false;
883 
884   return GCNSchedStage::initGCNRegion();
885 }
886 
887 bool PreRARematStage::initGCNRegion() {
888   if (!DAG.RescheduleRegions[RegionIdx])
889     return false;
890 
891   return GCNSchedStage::initGCNRegion();
892 }
893 
894 void GCNSchedStage::setupNewBlock() {
895   if (CurrentMBB)
896     DAG.finishBlock();
897 
898   CurrentMBB = DAG.RegionBegin->getParent();
899   DAG.startBlock(CurrentMBB);
900   // Get real RP for the region if it hasn't be calculated before. After the
901   // initial schedule stage real RP will be collected after scheduling.
902   if (StageID == GCNSchedStageID::OccInitialSchedule ||
903       StageID == GCNSchedStageID::ILPInitialSchedule)
904     DAG.computeBlockPressure(RegionIdx, CurrentMBB);
905 }
906 
907 void GCNSchedStage::finalizeGCNRegion() {
908   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
909   DAG.RescheduleRegions[RegionIdx] = false;
910   if (S.HasHighPressure)
911     DAG.RegionsWithHighRP[RegionIdx] = true;
912 
913   // Revert scheduling if we have dropped occupancy or there is some other
914   // reason that the original schedule is better.
915   checkScheduling();
916 
917   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
918       StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
919     SavedMutations.swap(DAG.Mutations);
920 
921   DAG.exitRegion();
922   RegionIdx++;
923 }
924 
925 void GCNSchedStage::checkScheduling() {
926   // Check the results of scheduling.
927   PressureAfter = DAG.getRealRegPressure(RegionIdx);
928   LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
929   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
930 
931   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
932       PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
933     DAG.Pressure[RegionIdx] = PressureAfter;
934     DAG.RegionsWithMinOcc[RegionIdx] =
935         PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
936 
937     // Early out if we have achieve the occupancy target.
938     LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
939     return;
940   }
941 
942   unsigned TargetOccupancy =
943       std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
944   unsigned WavesAfter =
945       std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
946   unsigned WavesBefore =
947       std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
948   LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
949                     << ", after " << WavesAfter << ".\n");
950 
951   // We may not be able to keep the current target occupancy because of the just
952   // scheduled region. We might still be able to revert scheduling if the
953   // occupancy before was higher, or if the current schedule has register
954   // pressure higher than the excess limits which could lead to more spilling.
955   unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
956 
957   // Allow memory bound functions to drop to 4 waves if not limited by an
958   // attribute.
959   if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
960       WavesAfter >= MFI.getMinAllowedOccupancy()) {
961     LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
962                       << MFI.getMinAllowedOccupancy() << " waves\n");
963     NewOccupancy = WavesAfter;
964   }
965 
966   if (NewOccupancy < DAG.MinOccupancy) {
967     DAG.MinOccupancy = NewOccupancy;
968     MFI.limitOccupancy(DAG.MinOccupancy);
969     DAG.RegionsWithMinOcc.reset();
970     LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
971                       << DAG.MinOccupancy << ".\n");
972   }
973 
974   unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
975   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
976   if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
977       PressureAfter.getAGPRNum() > MaxVGPRs ||
978       PressureAfter.getSGPRNum() > MaxSGPRs) {
979     DAG.RescheduleRegions[RegionIdx] = true;
980     DAG.RegionsWithHighRP[RegionIdx] = true;
981     DAG.RegionsWithExcessRP[RegionIdx] = true;
982   }
983 
984   // Revert if this region's schedule would cause a drop in occupancy or
985   // spilling.
986   if (shouldRevertScheduling(WavesAfter)) {
987     revertScheduling();
988   } else {
989     DAG.Pressure[RegionIdx] = PressureAfter;
990     DAG.RegionsWithMinOcc[RegionIdx] =
991         PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
992   }
993 }
994 
995 unsigned
996 GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
997                                       DenseMap<unsigned, unsigned> &ReadyCycles,
998                                       const TargetSchedModel &SM) {
999   unsigned ReadyCycle = CurrCycle;
1000   for (auto &D : SU.Preds) {
1001     if (D.isAssignedRegDep()) {
1002       MachineInstr *DefMI = D.getSUnit()->getInstr();
1003       unsigned Latency = SM.computeInstrLatency(DefMI);
1004       unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1005       ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1006     }
1007   }
1008   ReadyCycles[SU.NodeNum] = ReadyCycle;
1009   return ReadyCycle;
1010 }
1011 
1012 #ifndef NDEBUG
1013 struct EarlierIssuingCycle {
1014   bool operator()(std::pair<MachineInstr *, unsigned> A,
1015                   std::pair<MachineInstr *, unsigned> B) const {
1016     return A.second < B.second;
1017   }
1018 };
1019 
1020 static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1021                                         EarlierIssuingCycle> &ReadyCycles) {
1022   if (ReadyCycles.empty())
1023     return;
1024   unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1025   dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1026          << " ##################\n# Cycle #\t\t\tInstruction          "
1027             "             "
1028             "                            \n";
1029   unsigned IPrev = 1;
1030   for (auto &I : ReadyCycles) {
1031     if (I.second > IPrev + 1)
1032       dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1033              << " CYCLES DETECTED ******************************\n\n";
1034     dbgs() << "[ " << I.second << " ]  :  " << *I.first << "\n";
1035     IPrev = I.second;
1036   }
1037 }
1038 #endif
1039 
1040 ScheduleMetrics
1041 GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1042 #ifndef NDEBUG
1043   std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1044       ReadyCyclesSorted;
1045 #endif
1046   const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1047   unsigned SumBubbles = 0;
1048   DenseMap<unsigned, unsigned> ReadyCycles;
1049   unsigned CurrCycle = 0;
1050   for (auto &SU : InputSchedule) {
1051     unsigned ReadyCycle =
1052         computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1053     SumBubbles += ReadyCycle - CurrCycle;
1054 #ifndef NDEBUG
1055     ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1056 #endif
1057     CurrCycle = ++ReadyCycle;
1058   }
1059 #ifndef NDEBUG
1060   LLVM_DEBUG(
1061       printScheduleModel(ReadyCyclesSorted);
1062       dbgs() << "\n\t"
1063              << "Metric: "
1064              << (SumBubbles
1065                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1066                      : 1)
1067              << "\n\n");
1068 #endif
1069 
1070   return ScheduleMetrics(CurrCycle, SumBubbles);
1071 }
1072 
1073 ScheduleMetrics
1074 GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
1075 #ifndef NDEBUG
1076   std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1077       ReadyCyclesSorted;
1078 #endif
1079   const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1080   unsigned SumBubbles = 0;
1081   DenseMap<unsigned, unsigned> ReadyCycles;
1082   unsigned CurrCycle = 0;
1083   for (auto &MI : DAG) {
1084     SUnit *SU = DAG.getSUnit(&MI);
1085     if (!SU)
1086       continue;
1087     unsigned ReadyCycle =
1088         computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1089     SumBubbles += ReadyCycle - CurrCycle;
1090 #ifndef NDEBUG
1091     ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1092 #endif
1093     CurrCycle = ++ReadyCycle;
1094   }
1095 #ifndef NDEBUG
1096   LLVM_DEBUG(
1097       printScheduleModel(ReadyCyclesSorted);
1098       dbgs() << "\n\t"
1099              << "Metric: "
1100              << (SumBubbles
1101                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1102                      : 1)
1103              << "\n\n");
1104 #endif
1105 
1106   return ScheduleMetrics(CurrCycle, SumBubbles);
1107 }
1108 
1109 bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1110   if (WavesAfter < DAG.MinOccupancy)
1111     return true;
1112 
1113   return false;
1114 }
1115 
1116 bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1117   if (PressureAfter == PressureBefore)
1118     return false;
1119 
1120   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1121     return true;
1122 
1123   if (mayCauseSpilling(WavesAfter))
1124     return true;
1125 
1126   return false;
1127 }
1128 
1129 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
1130   // If RP is not reduced in the unclustered reschedule stage, revert to the
1131   // old schedule.
1132   if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
1133        mayCauseSpilling(WavesAfter)) ||
1134       GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
1135     LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1136     return true;
1137   }
1138 
1139   // Do not attempt to relax schedule even more if we are already spilling.
1140   if (isRegionWithExcessRP())
1141     return false;
1142 
1143   LLVM_DEBUG(
1144       dbgs()
1145       << "\n\t      *** In shouldRevertScheduling ***\n"
1146       << "      *********** BEFORE UnclusteredHighRPStage ***********\n");
1147   ScheduleMetrics MBefore =
1148       getScheduleMetrics(DAG.SUnits);
1149   LLVM_DEBUG(
1150       dbgs()
1151       << "\n      *********** AFTER UnclusteredHighRPStage ***********\n");
1152   ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1153   unsigned OldMetric = MBefore.getMetric();
1154   unsigned NewMetric = MAfter.getMetric();
1155   unsigned WavesBefore =
1156       std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
1157   unsigned Profit =
1158       ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1159        ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1160        NewMetric) /
1161       ScheduleMetrics::ScaleFactor;
1162   LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1163                     << MAfter << "Profit: " << Profit << "\n");
1164   return Profit < ScheduleMetrics::ScaleFactor;
1165 }
1166 
1167 bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
1168   if (PressureAfter == PressureBefore)
1169     return false;
1170 
1171   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1172     return true;
1173 
1174   if (mayCauseSpilling(WavesAfter))
1175     return true;
1176 
1177   return false;
1178 }
1179 
1180 bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
1181   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1182     return true;
1183 
1184   if (mayCauseSpilling(WavesAfter))
1185     return true;
1186 
1187   return false;
1188 }
1189 
1190 bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1191   if (mayCauseSpilling(WavesAfter))
1192     return true;
1193 
1194   return false;
1195 }
1196 
1197 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1198   if (WavesAfter <= MFI.getMinWavesPerEU() &&
1199       !PressureAfter.less(ST, PressureBefore) &&
1200       isRegionWithExcessRP()) {
1201     LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1202     return true;
1203   }
1204 
1205   return false;
1206 }
1207 
1208 void GCNSchedStage::revertScheduling() {
1209   DAG.RegionsWithMinOcc[RegionIdx] =
1210       PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
1211   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1212   DAG.RescheduleRegions[RegionIdx] =
1213       S.hasNextStage() &&
1214       S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
1215   DAG.RegionEnd = DAG.RegionBegin;
1216   int SkippedDebugInstr = 0;
1217   for (MachineInstr *MI : Unsched) {
1218     if (MI->isDebugInstr()) {
1219       ++SkippedDebugInstr;
1220       continue;
1221     }
1222 
1223     if (MI->getIterator() != DAG.RegionEnd) {
1224       DAG.BB->remove(MI);
1225       DAG.BB->insert(DAG.RegionEnd, MI);
1226       if (!MI->isDebugInstr())
1227         DAG.LIS->handleMove(*MI, true);
1228     }
1229 
1230     // Reset read-undef flags and update them later.
1231     for (auto &Op : MI->all_defs())
1232       Op.setIsUndef(false);
1233     RegisterOperands RegOpers;
1234     RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1235     if (!MI->isDebugInstr()) {
1236       if (DAG.ShouldTrackLaneMasks) {
1237         // Adjust liveness and add missing dead+read-undef flags.
1238         SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
1239         RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
1240       } else {
1241         // Adjust for missing dead-def flags.
1242         RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1243       }
1244     }
1245     DAG.RegionEnd = MI->getIterator();
1246     ++DAG.RegionEnd;
1247     LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1248   }
1249 
1250   // After reverting schedule, debug instrs will now be at the end of the block
1251   // and RegionEnd will point to the first debug instr. Increment RegionEnd
1252   // pass debug instrs to the actual end of the scheduling region.
1253   while (SkippedDebugInstr-- > 0)
1254     ++DAG.RegionEnd;
1255 
1256   // If Unsched.front() instruction is a debug instruction, this will actually
1257   // shrink the region since we moved all debug instructions to the end of the
1258   // block. Find the first instruction that is not a debug instruction.
1259   DAG.RegionBegin = Unsched.front()->getIterator();
1260   if (DAG.RegionBegin->isDebugInstr()) {
1261     for (MachineInstr *MI : Unsched) {
1262       if (MI->isDebugInstr())
1263         continue;
1264       DAG.RegionBegin = MI->getIterator();
1265       break;
1266     }
1267   }
1268 
1269   // Then move the debug instructions back into their correct place and set
1270   // RegionBegin and RegionEnd if needed.
1271   DAG.placeDebugValues();
1272 
1273   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1274 }
1275 
1276 void PreRARematStage::collectRematerializableInstructions() {
1277   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
1278   for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
1279     Register Reg = Register::index2VirtReg(I);
1280     if (!DAG.LIS->hasInterval(Reg))
1281       continue;
1282 
1283     // TODO: Handle AGPR and SGPR rematerialization
1284     if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
1285         !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
1286       continue;
1287 
1288     MachineOperand *Op = DAG.MRI.getOneDef(Reg);
1289     MachineInstr *Def = Op->getParent();
1290     if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
1291       continue;
1292 
1293     MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
1294     if (Def->getParent() == UseI->getParent())
1295       continue;
1296 
1297     // We are only collecting defs that are defined in another block and are
1298     // live-through or used inside regions at MinOccupancy. This means that the
1299     // register must be in the live-in set for the region.
1300     bool AddedToRematList = false;
1301     for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1302       auto It = DAG.LiveIns[I].find(Reg);
1303       if (It != DAG.LiveIns[I].end() && !It->second.none()) {
1304         if (DAG.RegionsWithMinOcc[I]) {
1305           RematerializableInsts[I][Def] = UseI;
1306           AddedToRematList = true;
1307         }
1308 
1309         // Collect regions with rematerializable reg as live-in to avoid
1310         // searching later when updating RP.
1311         RematDefToLiveInRegions[Def].push_back(I);
1312       }
1313     }
1314     if (!AddedToRematList)
1315       RematDefToLiveInRegions.erase(Def);
1316   }
1317 }
1318 
1319 bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
1320                                               const TargetInstrInfo *TII) {
1321   // Temporary copies of cached variables we will be modifying and replacing if
1322   // sinking succeeds.
1323   SmallVector<
1324       std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
1325       NewRegions;
1326   DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
1327   DenseMap<unsigned, GCNRegPressure> NewPressure;
1328   BitVector NewRescheduleRegions;
1329   LiveIntervals *LIS = DAG.LIS;
1330 
1331   NewRegions.resize(DAG.Regions.size());
1332   NewRescheduleRegions.resize(DAG.Regions.size());
1333 
1334   // Collect only regions that has a rematerializable def as a live-in.
1335   SmallSet<unsigned, 16> ImpactedRegions;
1336   for (const auto &It : RematDefToLiveInRegions)
1337     ImpactedRegions.insert(It.second.begin(), It.second.end());
1338 
1339   // Make copies of register pressure and live-ins cache that will be updated
1340   // as we rematerialize.
1341   for (auto Idx : ImpactedRegions) {
1342     NewPressure[Idx] = DAG.Pressure[Idx];
1343     NewLiveIns[Idx] = DAG.LiveIns[Idx];
1344   }
1345   NewRegions = DAG.Regions;
1346   NewRescheduleRegions.reset();
1347 
1348   DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
1349   bool Improved = false;
1350   for (auto I : ImpactedRegions) {
1351     if (!DAG.RegionsWithMinOcc[I])
1352       continue;
1353 
1354     Improved = false;
1355     int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
1356     int SGPRUsage = NewPressure[I].getSGPRNum();
1357 
1358     // TODO: Handle occupancy drop due to AGPR and SGPR.
1359     // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
1360     if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
1361       break;
1362 
1363     // The occupancy of this region could have been improved by a previous
1364     // iteration's sinking of defs.
1365     if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
1366       NewRescheduleRegions[I] = true;
1367       Improved = true;
1368       continue;
1369     }
1370 
1371     // First check if we have enough trivially rematerializable instructions to
1372     // improve occupancy. Optimistically assume all instructions we are able to
1373     // sink decreased RP.
1374     int TotalSinkableRegs = 0;
1375     for (const auto &It : RematerializableInsts[I]) {
1376       MachineInstr *Def = It.first;
1377       Register DefReg = Def->getOperand(0).getReg();
1378       TotalSinkableRegs +=
1379           SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
1380     }
1381     int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
1382     unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
1383     // If in the most optimistic scenario, we cannot improve occupancy, then do
1384     // not attempt to sink any instructions.
1385     if (OptimisticOccupancy <= DAG.MinOccupancy)
1386       break;
1387 
1388     unsigned ImproveOccupancy = 0;
1389     SmallVector<MachineInstr *, 4> SinkedDefs;
1390     for (auto &It : RematerializableInsts[I]) {
1391       MachineInstr *Def = It.first;
1392       MachineBasicBlock::iterator InsertPos =
1393           MachineBasicBlock::iterator(It.second);
1394       Register Reg = Def->getOperand(0).getReg();
1395       // Rematerialize MI to its use block. Since we are only rematerializing
1396       // instructions that do not have any virtual reg uses, we do not need to
1397       // call LiveRangeEdit::allUsesAvailableAt() and
1398       // LiveRangeEdit::canRematerializeAt().
1399       TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
1400                          Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
1401       MachineInstr *NewMI = &*std::prev(InsertPos);
1402       LIS->InsertMachineInstrInMaps(*NewMI);
1403       LIS->removeInterval(Reg);
1404       LIS->createAndComputeVirtRegInterval(Reg);
1405       InsertedMIToOldDef[NewMI] = Def;
1406 
1407       // Update region boundaries in scheduling region we sinked from since we
1408       // may sink an instruction that was at the beginning or end of its region
1409       DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
1410                                  /*Removing =*/true);
1411 
1412       // Update region boundaries in region we sinked to.
1413       DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
1414 
1415       LaneBitmask PrevMask = NewLiveIns[I][Reg];
1416       // FIXME: Also update cached pressure for where the def was sinked from.
1417       // Update RP for all regions that has this reg as a live-in and remove
1418       // the reg from all regions as a live-in.
1419       for (auto Idx : RematDefToLiveInRegions[Def]) {
1420         NewLiveIns[Idx].erase(Reg);
1421         if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
1422           // Def is live-through and not used in this block.
1423           NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
1424         } else {
1425           // Def is used and rematerialized into this block.
1426           GCNDownwardRPTracker RPT(*LIS);
1427           auto *NonDbgMI = &*skipDebugInstructionsForward(
1428               NewRegions[Idx].first, NewRegions[Idx].second);
1429           RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
1430           RPT.advance(NewRegions[Idx].second);
1431           NewPressure[Idx] = RPT.moveMaxPressure();
1432         }
1433       }
1434 
1435       SinkedDefs.push_back(Def);
1436       ImproveOccupancy = NewPressure[I].getOccupancy(ST);
1437       if (ImproveOccupancy > DAG.MinOccupancy)
1438         break;
1439     }
1440 
1441     // Remove defs we just sinked from all regions' list of sinkable defs
1442     for (auto &Def : SinkedDefs)
1443       for (auto TrackedIdx : RematDefToLiveInRegions[Def])
1444         RematerializableInsts[TrackedIdx].erase(Def);
1445 
1446     if (ImproveOccupancy <= DAG.MinOccupancy)
1447       break;
1448 
1449     NewRescheduleRegions[I] = true;
1450     Improved = true;
1451   }
1452 
1453   if (!Improved) {
1454     // Occupancy was not improved for all regions that were at MinOccupancy.
1455     // Undo sinking and remove newly rematerialized instructions.
1456     for (auto &Entry : InsertedMIToOldDef) {
1457       MachineInstr *MI = Entry.first;
1458       MachineInstr *OldMI = Entry.second;
1459       Register Reg = MI->getOperand(0).getReg();
1460       LIS->RemoveMachineInstrFromMaps(*MI);
1461       MI->eraseFromParent();
1462       OldMI->clearRegisterDeads(Reg);
1463       LIS->removeInterval(Reg);
1464       LIS->createAndComputeVirtRegInterval(Reg);
1465     }
1466     return false;
1467   }
1468 
1469   // Occupancy was improved for all regions.
1470   for (auto &Entry : InsertedMIToOldDef) {
1471     MachineInstr *MI = Entry.first;
1472     MachineInstr *OldMI = Entry.second;
1473 
1474     // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
1475     DAG.BBLiveInMap.erase(OldMI);
1476 
1477     // Remove OldMI and update LIS
1478     Register Reg = MI->getOperand(0).getReg();
1479     LIS->RemoveMachineInstrFromMaps(*OldMI);
1480     OldMI->eraseFromParent();
1481     LIS->removeInterval(Reg);
1482     LIS->createAndComputeVirtRegInterval(Reg);
1483   }
1484 
1485   // Update live-ins, register pressure, and regions caches.
1486   for (auto Idx : ImpactedRegions) {
1487     DAG.LiveIns[Idx] = NewLiveIns[Idx];
1488     DAG.Pressure[Idx] = NewPressure[Idx];
1489     DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
1490   }
1491   DAG.Regions = NewRegions;
1492   DAG.RescheduleRegions = NewRescheduleRegions;
1493 
1494   SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
1495   MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
1496 
1497   return true;
1498 }
1499 
1500 // Copied from MachineLICM
1501 bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
1502   if (!DAG.TII->isTriviallyReMaterializable(MI))
1503     return false;
1504 
1505   for (const MachineOperand &MO : MI.all_uses())
1506     if (MO.getReg().isVirtual())
1507       return false;
1508 
1509   return true;
1510 }
1511 
1512 // When removing, we will have to check both beginning and ending of the region.
1513 // When inserting, we will only have to check if we are inserting NewMI in front
1514 // of a scheduling region and do not need to check the ending since we will only
1515 // ever be inserting before an already existing MI.
1516 void GCNScheduleDAGMILive::updateRegionBoundaries(
1517     SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
1518                               MachineBasicBlock::iterator>> &RegionBoundaries,
1519     MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
1520   unsigned I = 0, E = RegionBoundaries.size();
1521   // Search for first region of the block where MI is located
1522   while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
1523     ++I;
1524 
1525   for (; I != E; ++I) {
1526     if (MI->getParent() != RegionBoundaries[I].first->getParent())
1527       return;
1528 
1529     if (Removing && MI == RegionBoundaries[I].first &&
1530         MI == RegionBoundaries[I].second) {
1531       // MI is in a region with size 1, after removing, the region will be
1532       // size 0, set RegionBegin and RegionEnd to pass end of block iterator.
1533       RegionBoundaries[I] =
1534           std::pair(MI->getParent()->end(), MI->getParent()->end());
1535       return;
1536     }
1537     if (MI == RegionBoundaries[I].first) {
1538       if (Removing)
1539         RegionBoundaries[I] =
1540             std::pair(std::next(MI), RegionBoundaries[I].second);
1541       else
1542         // Inserted NewMI in front of region, set new RegionBegin to NewMI
1543         RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
1544                                         RegionBoundaries[I].second);
1545       return;
1546     }
1547     if (Removing && MI == RegionBoundaries[I].second) {
1548       RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
1549       return;
1550     }
1551   }
1552 }
1553 
1554 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
1555   return std::any_of(
1556       DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) {
1557         unsigned Opc = MI->getOpcode();
1558         return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
1559       });
1560 }
1561 
1562 GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
1563     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
1564     bool RemoveKillFlags)
1565     : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
1566 
1567 void GCNPostScheduleDAGMILive::schedule() {
1568   HasIGLPInstrs = hasIGLPInstrs(this);
1569   if (HasIGLPInstrs) {
1570     SavedMutations.clear();
1571     SavedMutations.swap(Mutations);
1572     addMutation(createIGroupLPDAGMutation(/*IsReentry=*/true));
1573   }
1574 
1575   ScheduleDAGMI::schedule();
1576 }
1577 
1578 void GCNPostScheduleDAGMILive::finalizeSchedule() {
1579   if (HasIGLPInstrs)
1580     SavedMutations.swap(Mutations);
1581 
1582   ScheduleDAGMI::finalizeSchedule();
1583 }
1584