xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (revision 02e9120893770924227138ba49df1edb3896112a)
1 //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This contains a MachineSchedStrategy implementation for maximizing wave
11 /// occupancy on GCN hardware.
12 ///
13 /// This pass will apply multiple scheduling stages to the same function.
14 /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15 /// entry point for the scheduling of those regions is
16 /// GCNScheduleDAGMILive::runSchedStages.
17 
18 /// Generally, the reason for having multiple scheduling stages is to account
19 /// for the kernel-wide effect of register usage on occupancy.  Usually, only a
20 /// few scheduling regions will have register pressure high enough to limit
21 /// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22 /// other regions.
23 ///
24 //===----------------------------------------------------------------------===//
25 
26 #include "GCNSchedStrategy.h"
27 #include "AMDGPUIGroupLP.h"
28 #include "SIMachineFunctionInfo.h"
29 #include "llvm/CodeGen/RegisterClassInfo.h"
30 
31 #define DEBUG_TYPE "machine-scheduler"
32 
33 using namespace llvm;
34 
35 static cl::opt<bool>
36     DisableUnclusterHighRP("amdgpu-disable-unclustred-high-rp-reschedule",
37                            cl::Hidden,
38                            cl::desc("Disable unclustred high register pressure "
39                                     "reduction scheduling stage."),
40                            cl::init(false));
41 static cl::opt<unsigned> ScheduleMetricBias(
42     "amdgpu-schedule-metric-bias", cl::Hidden,
43     cl::desc(
44         "Sets the bias which adds weight to occupancy vs latency. Set it to "
45         "100 to chase the occupancy only."),
46     cl::init(10));
47 
48 static cl::opt<bool>
49     RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
50                cl::desc("Relax occupancy targets for kernels which are memory "
51                         "bound (amdgpu-membound-threshold), or "
52                         "Wave Limited (amdgpu-limit-wave-threshold)."),
53                cl::init(false));
54 
55 const unsigned ScheduleMetrics::ScaleFactor = 100;
56 
57 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
58     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
59       HasHighPressure(false) {}
60 
61 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
62   GenericScheduler::initialize(DAG);
63 
64   MF = &DAG->MF;
65 
66   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
67 
68   SGPRExcessLimit =
69       Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
70   VGPRExcessLimit =
71       Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
72 
73   SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
74   // Set the initial TargetOccupnacy to the maximum occupancy that we can
75   // achieve for this function. This effectively sets a lower bound on the
76   // 'Critical' register limits in the scheduler.
77   // Allow for lower occupancy targets if kernel is wave limited or memory
78   // bound, and using the relaxed occupancy feature.
79   TargetOccupancy =
80       RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
81   SGPRCriticalLimit =
82       std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
83 
84   if (!KnownExcessRP) {
85     VGPRCriticalLimit =
86         std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
87   } else {
88     // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
89     // returns a reasonably small number for targets with lots of VGPRs, such
90     // as GFX10 and GFX11.
91     LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
92                          "VGPRCriticalLimit calculation method.\n");
93 
94     unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
95     unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
96     unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
97     VGPRBudget = std::max(VGPRBudget, Granule);
98     VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
99   }
100 
101   // Subtract error margin and bias from register limits and avoid overflow.
102   SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
103   VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
104   SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
105   VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);
106 
107   LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
108                     << ", VGPRExcessLimit = " << VGPRExcessLimit
109                     << ", SGPRCriticalLimit = " << SGPRCriticalLimit
110                     << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
111 }
112 
113 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
114                                      bool AtTop,
115                                      const RegPressureTracker &RPTracker,
116                                      const SIRegisterInfo *SRI,
117                                      unsigned SGPRPressure,
118                                      unsigned VGPRPressure) {
119   Cand.SU = SU;
120   Cand.AtTop = AtTop;
121 
122   if (!DAG->isTrackingPressure())
123     return;
124 
125   // getDownwardPressure() and getUpwardPressure() make temporary changes to
126   // the tracker, so we need to pass those function a non-const copy.
127   RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
128 
129   Pressure.clear();
130   MaxPressure.clear();
131 
132   if (AtTop)
133     TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
134   else {
135     // FIXME: I think for bottom up scheduling, the register pressure is cached
136     // and can be retrieved by DAG->getPressureDif(SU).
137     TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
138   }
139 
140   unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
141   unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
142 
143   // If two instructions increase the pressure of different register sets
144   // by the same amount, the generic scheduler will prefer to schedule the
145   // instruction that increases the set with the least amount of registers,
146   // which in our case would be SGPRs.  This is rarely what we want, so
147   // when we report excess/critical register pressure, we do it either
148   // only for VGPRs or only for SGPRs.
149 
150   // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
151   const unsigned MaxVGPRPressureInc = 16;
152   bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
153   bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
154 
155 
156   // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
157   // to increase the likelihood we don't go over the limits.  We should improve
158   // the analysis to look through dependencies to find the path with the least
159   // register pressure.
160 
161   // We only need to update the RPDelta for instructions that increase register
162   // pressure. Instructions that decrease or keep reg pressure the same will be
163   // marked as RegExcess in tryCandidate() when they are compared with
164   // instructions that increase the register pressure.
165   if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
166     HasHighPressure = true;
167     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
168     Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
169   }
170 
171   if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
172     HasHighPressure = true;
173     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
174     Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
175   }
176 
177   // Register pressure is considered 'CRITICAL' if it is approaching a value
178   // that would reduce the wave occupancy for the execution unit.  When
179   // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
180   // has the same cost, so we don't need to prefer one over the other.
181 
182   int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
183   int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
184 
185   if (SGPRDelta >= 0 || VGPRDelta >= 0) {
186     HasHighPressure = true;
187     if (SGPRDelta > VGPRDelta) {
188       Cand.RPDelta.CriticalMax =
189         PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
190       Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
191     } else {
192       Cand.RPDelta.CriticalMax =
193         PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
194       Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
195     }
196   }
197 }
198 
199 // This function is mostly cut and pasted from
200 // GenericScheduler::pickNodeFromQueue()
201 void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
202                                          const CandPolicy &ZonePolicy,
203                                          const RegPressureTracker &RPTracker,
204                                          SchedCandidate &Cand) {
205   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
206   ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
207   unsigned SGPRPressure = 0;
208   unsigned VGPRPressure = 0;
209   if (DAG->isTrackingPressure()) {
210     SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
211     VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
212   }
213   ReadyQueue &Q = Zone.Available;
214   for (SUnit *SU : Q) {
215 
216     SchedCandidate TryCand(ZonePolicy);
217     initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
218                   SGPRPressure, VGPRPressure);
219     // Pass SchedBoundary only when comparing nodes from the same boundary.
220     SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
221     tryCandidate(Cand, TryCand, ZoneArg);
222     if (TryCand.Reason != NoCand) {
223       // Initialize resource delta if needed in case future heuristics query it.
224       if (TryCand.ResDelta == SchedResourceDelta())
225         TryCand.initResourceDelta(Zone.DAG, SchedModel);
226       Cand.setBest(TryCand);
227       LLVM_DEBUG(traceCandidate(Cand));
228     }
229   }
230 }
231 
232 // This function is mostly cut and pasted from
233 // GenericScheduler::pickNodeBidirectional()
234 SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
235   // Schedule as far as possible in the direction of no choice. This is most
236   // efficient, but also provides the best heuristics for CriticalPSets.
237   if (SUnit *SU = Bot.pickOnlyChoice()) {
238     IsTopNode = false;
239     return SU;
240   }
241   if (SUnit *SU = Top.pickOnlyChoice()) {
242     IsTopNode = true;
243     return SU;
244   }
245   // Set the bottom-up policy based on the state of the current bottom zone and
246   // the instructions outside the zone, including the top zone.
247   CandPolicy BotPolicy;
248   setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
249   // Set the top-down policy based on the state of the current top zone and
250   // the instructions outside the zone, including the bottom zone.
251   CandPolicy TopPolicy;
252   setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
253 
254   // See if BotCand is still valid (because we previously scheduled from Top).
255   LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
256   if (!BotCand.isValid() || BotCand.SU->isScheduled ||
257       BotCand.Policy != BotPolicy) {
258     BotCand.reset(CandPolicy());
259     pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
260     assert(BotCand.Reason != NoCand && "failed to find the first candidate");
261   } else {
262     LLVM_DEBUG(traceCandidate(BotCand));
263 #ifndef NDEBUG
264     if (VerifyScheduling) {
265       SchedCandidate TCand;
266       TCand.reset(CandPolicy());
267       pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
268       assert(TCand.SU == BotCand.SU &&
269              "Last pick result should correspond to re-picking right now");
270     }
271 #endif
272   }
273 
274   // Check if the top Q has a better candidate.
275   LLVM_DEBUG(dbgs() << "Picking from Top:\n");
276   if (!TopCand.isValid() || TopCand.SU->isScheduled ||
277       TopCand.Policy != TopPolicy) {
278     TopCand.reset(CandPolicy());
279     pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
280     assert(TopCand.Reason != NoCand && "failed to find the first candidate");
281   } else {
282     LLVM_DEBUG(traceCandidate(TopCand));
283 #ifndef NDEBUG
284     if (VerifyScheduling) {
285       SchedCandidate TCand;
286       TCand.reset(CandPolicy());
287       pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
288       assert(TCand.SU == TopCand.SU &&
289            "Last pick result should correspond to re-picking right now");
290     }
291 #endif
292   }
293 
294   // Pick best from BotCand and TopCand.
295   LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
296              dbgs() << "Bot Cand: "; traceCandidate(BotCand););
297   SchedCandidate Cand = BotCand;
298   TopCand.Reason = NoCand;
299   tryCandidate(Cand, TopCand, nullptr);
300   if (TopCand.Reason != NoCand) {
301     Cand.setBest(TopCand);
302   }
303   LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
304 
305   IsTopNode = Cand.AtTop;
306   return Cand.SU;
307 }
308 
309 // This function is mostly cut and pasted from
310 // GenericScheduler::pickNode()
311 SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
312   if (DAG->top() == DAG->bottom()) {
313     assert(Top.Available.empty() && Top.Pending.empty() &&
314            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
315     return nullptr;
316   }
317   SUnit *SU;
318   do {
319     if (RegionPolicy.OnlyTopDown) {
320       SU = Top.pickOnlyChoice();
321       if (!SU) {
322         CandPolicy NoPolicy;
323         TopCand.reset(NoPolicy);
324         pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
325         assert(TopCand.Reason != NoCand && "failed to find a candidate");
326         SU = TopCand.SU;
327       }
328       IsTopNode = true;
329     } else if (RegionPolicy.OnlyBottomUp) {
330       SU = Bot.pickOnlyChoice();
331       if (!SU) {
332         CandPolicy NoPolicy;
333         BotCand.reset(NoPolicy);
334         pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
335         assert(BotCand.Reason != NoCand && "failed to find a candidate");
336         SU = BotCand.SU;
337       }
338       IsTopNode = false;
339     } else {
340       SU = pickNodeBidirectional(IsTopNode);
341     }
342   } while (SU->isScheduled);
343 
344   if (SU->isTopReady())
345     Top.removeReady(SU);
346   if (SU->isBottomReady())
347     Bot.removeReady(SU);
348 
349   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
350                     << *SU->getInstr());
351   return SU;
352 }
353 
354 GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
355   assert(CurrentStage && CurrentStage != SchedStages.end());
356   return *CurrentStage;
357 }
358 
359 bool GCNSchedStrategy::advanceStage() {
360   assert(CurrentStage != SchedStages.end());
361   if (!CurrentStage)
362     CurrentStage = SchedStages.begin();
363   else
364     CurrentStage++;
365 
366   return CurrentStage != SchedStages.end();
367 }
368 
369 bool GCNSchedStrategy::hasNextStage() const {
370   assert(CurrentStage);
371   return std::next(CurrentStage) != SchedStages.end();
372 }
373 
374 GCNSchedStageID GCNSchedStrategy::getNextStage() const {
375   assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
376   return *std::next(CurrentStage);
377 }
378 
379 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
380     const MachineSchedContext *C)
381     : GCNSchedStrategy(C) {
382   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
383   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
384   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
385   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
386 }
387 
388 GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
389     : GCNSchedStrategy(C) {
390   SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
391 }
392 
393 bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
394                                           SchedCandidate &TryCand,
395                                           SchedBoundary *Zone) const {
396   // Initialize the candidate if needed.
397   if (!Cand.isValid()) {
398     TryCand.Reason = NodeOrder;
399     return true;
400   }
401 
402   // Avoid spilling by exceeding the register limit.
403   if (DAG->isTrackingPressure() &&
404       tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
405                   RegExcess, TRI, DAG->MF))
406     return TryCand.Reason != NoCand;
407 
408   // Bias PhysReg Defs and copies to their uses and defined respectively.
409   if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
410                  biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
411     return TryCand.Reason != NoCand;
412 
413   bool SameBoundary = Zone != nullptr;
414   if (SameBoundary) {
415     // Prioritize instructions that read unbuffered resources by stall cycles.
416     if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
417                 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
418       return TryCand.Reason != NoCand;
419 
420     // Avoid critical resource consumption and balance the schedule.
421     TryCand.initResourceDelta(DAG, SchedModel);
422     if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
423                 TryCand, Cand, ResourceReduce))
424       return TryCand.Reason != NoCand;
425     if (tryGreater(TryCand.ResDelta.DemandedResources,
426                    Cand.ResDelta.DemandedResources, TryCand, Cand,
427                    ResourceDemand))
428       return TryCand.Reason != NoCand;
429 
430     // Unconditionally try to reduce latency.
431     if (tryLatency(TryCand, Cand, *Zone))
432       return TryCand.Reason != NoCand;
433 
434     // Weak edges are for clustering and other constraints.
435     if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
436                 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
437       return TryCand.Reason != NoCand;
438   }
439 
440   // Keep clustered nodes together to encourage downstream peephole
441   // optimizations which may reduce resource requirements.
442   //
443   // This is a best effort to set things up for a post-RA pass. Optimizations
444   // like generating loads of multiple registers should ideally be done within
445   // the scheduler pass by combining the loads during DAG postprocessing.
446   const SUnit *CandNextClusterSU =
447       Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
448   const SUnit *TryCandNextClusterSU =
449       TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
450   if (tryGreater(TryCand.SU == TryCandNextClusterSU,
451                  Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
452     return TryCand.Reason != NoCand;
453 
454   // Avoid increasing the max critical pressure in the scheduled region.
455   if (DAG->isTrackingPressure() &&
456       tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
457                   TryCand, Cand, RegCritical, TRI, DAG->MF))
458     return TryCand.Reason != NoCand;
459 
460   // Avoid increasing the max pressure of the entire region.
461   if (DAG->isTrackingPressure() &&
462       tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
463                   Cand, RegMax, TRI, DAG->MF))
464     return TryCand.Reason != NoCand;
465 
466   if (SameBoundary) {
467     // Fall through to original instruction order.
468     if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
469         (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
470       TryCand.Reason = NodeOrder;
471       return true;
472     }
473   }
474   return false;
475 }
476 
477 GCNScheduleDAGMILive::GCNScheduleDAGMILive(
478     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
479     : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
480       MFI(*MF.getInfo<SIMachineFunctionInfo>()),
481       StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {
482 
483   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
484   if (RelaxedOcc) {
485     MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
486     if (MinOccupancy != StartingOccupancy)
487       LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
488                         << ".\n");
489   }
490 }
491 
492 std::unique_ptr<GCNSchedStage>
493 GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
494   switch (SchedStageID) {
495   case GCNSchedStageID::OccInitialSchedule:
496     return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
497   case GCNSchedStageID::UnclusteredHighRPReschedule:
498     return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
499   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
500     return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
501   case GCNSchedStageID::PreRARematerialize:
502     return std::make_unique<PreRARematStage>(SchedStageID, *this);
503   case GCNSchedStageID::ILPInitialSchedule:
504     return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
505   }
506 
507   llvm_unreachable("Unknown SchedStageID.");
508 }
509 
510 void GCNScheduleDAGMILive::schedule() {
511   // Collect all scheduling regions. The actual scheduling is performed in
512   // GCNScheduleDAGMILive::finalizeSchedule.
513   Regions.push_back(std::pair(RegionBegin, RegionEnd));
514 }
515 
516 GCNRegPressure
517 GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
518   GCNDownwardRPTracker RPTracker(*LIS);
519   RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
520   return RPTracker.moveMaxPressure();
521 }
522 
523 void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
524                                                 const MachineBasicBlock *MBB) {
525   GCNDownwardRPTracker RPTracker(*LIS);
526 
527   // If the block has the only successor then live-ins of that successor are
528   // live-outs of the current block. We can reuse calculated live set if the
529   // successor will be sent to scheduling past current block.
530 
531   // However, due to the bug in LiveInterval analysis it may happen that two
532   // predecessors of the same successor block have different lane bitmasks for
533   // a live-out register. Workaround that by sticking to one-to-one relationship
534   // i.e. one predecessor with one successor block.
535   const MachineBasicBlock *OnlySucc = nullptr;
536   if (MBB->succ_size() == 1) {
537     auto *Candidate = *MBB->succ_begin();
538     if (!Candidate->empty() && Candidate->pred_size() == 1) {
539       SlotIndexes *Ind = LIS->getSlotIndexes();
540       if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
541         OnlySucc = Candidate;
542     }
543   }
544 
545   // Scheduler sends regions from the end of the block upwards.
546   size_t CurRegion = RegionIdx;
547   for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
548     if (Regions[CurRegion].first->getParent() != MBB)
549       break;
550   --CurRegion;
551 
552   auto I = MBB->begin();
553   auto LiveInIt = MBBLiveIns.find(MBB);
554   auto &Rgn = Regions[CurRegion];
555   auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
556   if (LiveInIt != MBBLiveIns.end()) {
557     auto LiveIn = std::move(LiveInIt->second);
558     RPTracker.reset(*MBB->begin(), &LiveIn);
559     MBBLiveIns.erase(LiveInIt);
560   } else {
561     I = Rgn.first;
562     auto LRS = BBLiveInMap.lookup(NonDbgMI);
563 #ifdef EXPENSIVE_CHECKS
564     assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
565 #endif
566     RPTracker.reset(*I, &LRS);
567   }
568 
569   for (;;) {
570     I = RPTracker.getNext();
571 
572     if (Regions[CurRegion].first == I || NonDbgMI == I) {
573       LiveIns[CurRegion] = RPTracker.getLiveRegs();
574       RPTracker.clearMaxPressure();
575     }
576 
577     if (Regions[CurRegion].second == I) {
578       Pressure[CurRegion] = RPTracker.moveMaxPressure();
579       if (CurRegion-- == RegionIdx)
580         break;
581     }
582     RPTracker.advanceToNext();
583     RPTracker.advanceBeforeNext();
584   }
585 
586   if (OnlySucc) {
587     if (I != MBB->end()) {
588       RPTracker.advanceToNext();
589       RPTracker.advance(MBB->end());
590     }
591     RPTracker.advanceBeforeNext();
592     MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
593   }
594 }
595 
596 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
597 GCNScheduleDAGMILive::getBBLiveInMap() const {
598   assert(!Regions.empty());
599   std::vector<MachineInstr *> BBStarters;
600   BBStarters.reserve(Regions.size());
601   auto I = Regions.rbegin(), E = Regions.rend();
602   auto *BB = I->first->getParent();
603   do {
604     auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
605     BBStarters.push_back(MI);
606     do {
607       ++I;
608     } while (I != E && I->first->getParent() == BB);
609   } while (I != E);
610   return getLiveRegMap(BBStarters, false /*After*/, *LIS);
611 }
612 
613 void GCNScheduleDAGMILive::finalizeSchedule() {
614   // Start actual scheduling here. This function is called by the base
615   // MachineScheduler after all regions have been recorded by
616   // GCNScheduleDAGMILive::schedule().
617   LiveIns.resize(Regions.size());
618   Pressure.resize(Regions.size());
619   RescheduleRegions.resize(Regions.size());
620   RegionsWithHighRP.resize(Regions.size());
621   RegionsWithExcessRP.resize(Regions.size());
622   RegionsWithMinOcc.resize(Regions.size());
623   RegionsWithIGLPInstrs.resize(Regions.size());
624   RescheduleRegions.set();
625   RegionsWithHighRP.reset();
626   RegionsWithExcessRP.reset();
627   RegionsWithMinOcc.reset();
628   RegionsWithIGLPInstrs.reset();
629 
630   runSchedStages();
631 }
632 
633 void GCNScheduleDAGMILive::runSchedStages() {
634   LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
635 
636   if (!Regions.empty())
637     BBLiveInMap = getBBLiveInMap();
638 
639   GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
640   while (S.advanceStage()) {
641     auto Stage = createSchedStage(S.getCurrentStage());
642     if (!Stage->initGCNSchedStage())
643       continue;
644 
645     for (auto Region : Regions) {
646       RegionBegin = Region.first;
647       RegionEnd = Region.second;
648       // Setup for scheduling the region and check whether it should be skipped.
649       if (!Stage->initGCNRegion()) {
650         Stage->advanceRegion();
651         exitRegion();
652         continue;
653       }
654 
655       ScheduleDAGMILive::schedule();
656       Stage->finalizeGCNRegion();
657     }
658 
659     Stage->finalizeGCNSchedStage();
660   }
661 }
662 
663 #ifndef NDEBUG
664 raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
665   switch (StageID) {
666   case GCNSchedStageID::OccInitialSchedule:
667     OS << "Max Occupancy Initial Schedule";
668     break;
669   case GCNSchedStageID::UnclusteredHighRPReschedule:
670     OS << "Unclustered High Register Pressure Reschedule";
671     break;
672   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
673     OS << "Clustered Low Occupancy Reschedule";
674     break;
675   case GCNSchedStageID::PreRARematerialize:
676     OS << "Pre-RA Rematerialize";
677     break;
678   case GCNSchedStageID::ILPInitialSchedule:
679     OS << "Max ILP Initial Schedule";
680     break;
681   }
682 
683   return OS;
684 }
685 #endif
686 
687 GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
688     : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
689       MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
690 
691 bool GCNSchedStage::initGCNSchedStage() {
692   if (!DAG.LIS)
693     return false;
694 
695   LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
696   return true;
697 }
698 
699 bool UnclusteredHighRPStage::initGCNSchedStage() {
700   if (DisableUnclusterHighRP)
701     return false;
702 
703   if (!GCNSchedStage::initGCNSchedStage())
704     return false;
705 
706   if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
707     return false;
708 
709   SavedMutations.swap(DAG.Mutations);
710   DAG.addMutation(createIGroupLPDAGMutation());
711 
712   InitialOccupancy = DAG.MinOccupancy;
713   // Aggressivly try to reduce register pressure in the unclustered high RP
714   // stage. Temporarily increase occupancy target in the region.
715   S.SGPRLimitBias = S.HighRPSGPRBias;
716   S.VGPRLimitBias = S.HighRPVGPRBias;
717   if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
718     MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
719 
720   LLVM_DEBUG(
721       dbgs()
722       << "Retrying function scheduling without clustering. "
723          "Aggressivly try to reduce register pressure to achieve occupancy "
724       << DAG.MinOccupancy << ".\n");
725 
726   return true;
727 }
728 
729 bool ClusteredLowOccStage::initGCNSchedStage() {
730   if (!GCNSchedStage::initGCNSchedStage())
731     return false;
732 
733   // Don't bother trying to improve ILP in lower RP regions if occupancy has not
734   // been dropped. All regions will have already been scheduled with the ideal
735   // occupancy targets.
736   if (DAG.StartingOccupancy <= DAG.MinOccupancy)
737     return false;
738 
739   LLVM_DEBUG(
740       dbgs() << "Retrying function scheduling with lowest recorded occupancy "
741              << DAG.MinOccupancy << ".\n");
742   return true;
743 }
744 
745 bool PreRARematStage::initGCNSchedStage() {
746   if (!GCNSchedStage::initGCNSchedStage())
747     return false;
748 
749   if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
750     return false;
751 
752   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
753   // Check maximum occupancy
754   if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
755       DAG.MinOccupancy)
756     return false;
757 
758   // FIXME: This pass will invalidate cached MBBLiveIns for regions
759   // inbetween the defs and region we sinked the def to. Cached pressure
760   // for regions where a def is sinked from will also be invalidated. Will
761   // need to be fixed if there is another pass after this pass.
762   assert(!S.hasNextStage());
763 
764   collectRematerializableInstructions();
765   if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
766     return false;
767 
768   LLVM_DEBUG(
769       dbgs() << "Retrying function scheduling with improved occupancy of "
770              << DAG.MinOccupancy << " from rematerializing\n");
771   return true;
772 }
773 
774 void GCNSchedStage::finalizeGCNSchedStage() {
775   DAG.finishBlock();
776   LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
777 }
778 
779 void UnclusteredHighRPStage::finalizeGCNSchedStage() {
780   SavedMutations.swap(DAG.Mutations);
781   S.SGPRLimitBias = S.VGPRLimitBias = 0;
782   if (DAG.MinOccupancy > InitialOccupancy) {
783     for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
784       DAG.RegionsWithMinOcc[IDX] =
785           DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
786 
787     LLVM_DEBUG(dbgs() << StageID
788                       << " stage successfully increased occupancy to "
789                       << DAG.MinOccupancy << '\n');
790   }
791 
792   GCNSchedStage::finalizeGCNSchedStage();
793 }
794 
795 bool GCNSchedStage::initGCNRegion() {
796   // Check whether this new region is also a new block.
797   if (DAG.RegionBegin->getParent() != CurrentMBB)
798     setupNewBlock();
799 
800   unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
801   DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
802 
803   // Skip empty scheduling regions (0 or 1 schedulable instructions).
804   if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
805     return false;
806 
807   LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
808   LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
809                     << " " << CurrentMBB->getName()
810                     << "\n  From: " << *DAG.begin() << "    To: ";
811              if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
812              else dbgs() << "End";
813              dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
814 
815   // Save original instruction order before scheduling for possible revert.
816   Unsched.clear();
817   Unsched.reserve(DAG.NumRegionInstrs);
818   if (StageID == GCNSchedStageID::OccInitialSchedule ||
819       StageID == GCNSchedStageID::ILPInitialSchedule) {
820     for (auto &I : DAG) {
821       Unsched.push_back(&I);
822       if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
823           I.getOpcode() == AMDGPU::IGLP_OPT)
824         DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
825     }
826   } else {
827     for (auto &I : DAG)
828       Unsched.push_back(&I);
829   }
830 
831   PressureBefore = DAG.Pressure[RegionIdx];
832 
833   LLVM_DEBUG(
834       dbgs() << "Pressure before scheduling:\nRegion live-ins:"
835              << print(DAG.LiveIns[RegionIdx], DAG.MRI)
836              << "Region live-in pressure:  "
837              << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
838              << "Region register pressure: " << print(PressureBefore));
839 
840   S.HasHighPressure = false;
841   S.KnownExcessRP = isRegionWithExcessRP();
842 
843   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
844       StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
845     SavedMutations.clear();
846     SavedMutations.swap(DAG.Mutations);
847     DAG.addMutation(createIGroupLPDAGMutation());
848   }
849 
850   return true;
851 }
852 
853 bool UnclusteredHighRPStage::initGCNRegion() {
854   // Only reschedule regions with the minimum occupancy or regions that may have
855   // spilling (excess register pressure).
856   if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
857        DAG.MinOccupancy <= InitialOccupancy) &&
858       !DAG.RegionsWithExcessRP[RegionIdx])
859     return false;
860 
861   return GCNSchedStage::initGCNRegion();
862 }
863 
864 bool ClusteredLowOccStage::initGCNRegion() {
865   // We may need to reschedule this region if it wasn't rescheduled in the last
866   // stage, or if we found it was testing critical register pressure limits in
867   // the unclustered reschedule stage. The later is because we may not have been
868   // able to raise the min occupancy in the previous stage so the region may be
869   // overly constrained even if it was already rescheduled.
870   if (!DAG.RegionsWithHighRP[RegionIdx])
871     return false;
872 
873   return GCNSchedStage::initGCNRegion();
874 }
875 
876 bool PreRARematStage::initGCNRegion() {
877   if (!DAG.RescheduleRegions[RegionIdx])
878     return false;
879 
880   return GCNSchedStage::initGCNRegion();
881 }
882 
883 void GCNSchedStage::setupNewBlock() {
884   if (CurrentMBB)
885     DAG.finishBlock();
886 
887   CurrentMBB = DAG.RegionBegin->getParent();
888   DAG.startBlock(CurrentMBB);
889   // Get real RP for the region if it hasn't be calculated before. After the
890   // initial schedule stage real RP will be collected after scheduling.
891   if (StageID == GCNSchedStageID::OccInitialSchedule ||
892       StageID == GCNSchedStageID::ILPInitialSchedule)
893     DAG.computeBlockPressure(RegionIdx, CurrentMBB);
894 }
895 
896 void GCNSchedStage::finalizeGCNRegion() {
897   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
898   DAG.RescheduleRegions[RegionIdx] = false;
899   if (S.HasHighPressure)
900     DAG.RegionsWithHighRP[RegionIdx] = true;
901 
902   // Revert scheduling if we have dropped occupancy or there is some other
903   // reason that the original schedule is better.
904   checkScheduling();
905 
906   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
907       StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
908     SavedMutations.swap(DAG.Mutations);
909 
910   DAG.exitRegion();
911   RegionIdx++;
912 }
913 
914 void GCNSchedStage::checkScheduling() {
915   // Check the results of scheduling.
916   PressureAfter = DAG.getRealRegPressure(RegionIdx);
917   LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
918   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
919 
920   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
921       PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
922     DAG.Pressure[RegionIdx] = PressureAfter;
923     DAG.RegionsWithMinOcc[RegionIdx] =
924         PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
925 
926     // Early out if we have achieve the occupancy target.
927     LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
928     return;
929   }
930 
931   unsigned TargetOccupancy =
932       std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
933   unsigned WavesAfter =
934       std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
935   unsigned WavesBefore =
936       std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
937   LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
938                     << ", after " << WavesAfter << ".\n");
939 
940   // We may not be able to keep the current target occupancy because of the just
941   // scheduled region. We might still be able to revert scheduling if the
942   // occupancy before was higher, or if the current schedule has register
943   // pressure higher than the excess limits which could lead to more spilling.
944   unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
945 
946   // Allow memory bound functions to drop to 4 waves if not limited by an
947   // attribute.
948   if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
949       WavesAfter >= MFI.getMinAllowedOccupancy()) {
950     LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
951                       << MFI.getMinAllowedOccupancy() << " waves\n");
952     NewOccupancy = WavesAfter;
953   }
954 
955   if (NewOccupancy < DAG.MinOccupancy) {
956     DAG.MinOccupancy = NewOccupancy;
957     MFI.limitOccupancy(DAG.MinOccupancy);
958     DAG.RegionsWithMinOcc.reset();
959     LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
960                       << DAG.MinOccupancy << ".\n");
961   }
962 
963   unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
964   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
965   if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
966       PressureAfter.getAGPRNum() > MaxVGPRs ||
967       PressureAfter.getSGPRNum() > MaxSGPRs) {
968     DAG.RescheduleRegions[RegionIdx] = true;
969     DAG.RegionsWithHighRP[RegionIdx] = true;
970     DAG.RegionsWithExcessRP[RegionIdx] = true;
971   }
972 
973   // Revert if this region's schedule would cause a drop in occupancy or
974   // spilling.
975   if (shouldRevertScheduling(WavesAfter)) {
976     revertScheduling();
977   } else {
978     DAG.Pressure[RegionIdx] = PressureAfter;
979     DAG.RegionsWithMinOcc[RegionIdx] =
980         PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
981   }
982 }
983 
984 unsigned
985 GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
986                                       DenseMap<unsigned, unsigned> &ReadyCycles,
987                                       const TargetSchedModel &SM) {
988   unsigned ReadyCycle = CurrCycle;
989   for (auto &D : SU.Preds) {
990     if (D.isAssignedRegDep()) {
991       MachineInstr *DefMI = D.getSUnit()->getInstr();
992       unsigned Latency = SM.computeInstrLatency(DefMI);
993       unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
994       ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
995     }
996   }
997   ReadyCycles[SU.NodeNum] = ReadyCycle;
998   return ReadyCycle;
999 }
1000 
1001 #ifndef NDEBUG
1002 struct EarlierIssuingCycle {
1003   bool operator()(std::pair<MachineInstr *, unsigned> A,
1004                   std::pair<MachineInstr *, unsigned> B) const {
1005     return A.second < B.second;
1006   }
1007 };
1008 
1009 static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1010                                         EarlierIssuingCycle> &ReadyCycles) {
1011   if (ReadyCycles.empty())
1012     return;
1013   unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1014   dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1015          << " ##################\n# Cycle #\t\t\tInstruction          "
1016             "             "
1017             "                            \n";
1018   unsigned IPrev = 1;
1019   for (auto &I : ReadyCycles) {
1020     if (I.second > IPrev + 1)
1021       dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1022              << " CYCLES DETECTED ******************************\n\n";
1023     dbgs() << "[ " << I.second << " ]  :  " << *I.first << "\n";
1024     IPrev = I.second;
1025   }
1026 }
1027 #endif
1028 
1029 ScheduleMetrics
1030 GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1031 #ifndef NDEBUG
1032   std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1033       ReadyCyclesSorted;
1034 #endif
1035   const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1036   unsigned SumBubbles = 0;
1037   DenseMap<unsigned, unsigned> ReadyCycles;
1038   unsigned CurrCycle = 0;
1039   for (auto &SU : InputSchedule) {
1040     unsigned ReadyCycle =
1041         computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1042     SumBubbles += ReadyCycle - CurrCycle;
1043 #ifndef NDEBUG
1044     ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1045 #endif
1046     CurrCycle = ++ReadyCycle;
1047   }
1048 #ifndef NDEBUG
1049   LLVM_DEBUG(
1050       printScheduleModel(ReadyCyclesSorted);
1051       dbgs() << "\n\t"
1052              << "Metric: "
1053              << (SumBubbles
1054                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1055                      : 1)
1056              << "\n\n");
1057 #endif
1058 
1059   return ScheduleMetrics(CurrCycle, SumBubbles);
1060 }
1061 
1062 ScheduleMetrics
1063 GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
1064 #ifndef NDEBUG
1065   std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1066       ReadyCyclesSorted;
1067 #endif
1068   const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1069   unsigned SumBubbles = 0;
1070   DenseMap<unsigned, unsigned> ReadyCycles;
1071   unsigned CurrCycle = 0;
1072   for (auto &MI : DAG) {
1073     SUnit *SU = DAG.getSUnit(&MI);
1074     if (!SU)
1075       continue;
1076     unsigned ReadyCycle =
1077         computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1078     SumBubbles += ReadyCycle - CurrCycle;
1079 #ifndef NDEBUG
1080     ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1081 #endif
1082     CurrCycle = ++ReadyCycle;
1083   }
1084 #ifndef NDEBUG
1085   LLVM_DEBUG(
1086       printScheduleModel(ReadyCyclesSorted);
1087       dbgs() << "\n\t"
1088              << "Metric: "
1089              << (SumBubbles
1090                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1091                      : 1)
1092              << "\n\n");
1093 #endif
1094 
1095   return ScheduleMetrics(CurrCycle, SumBubbles);
1096 }
1097 
1098 bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1099   if (WavesAfter < DAG.MinOccupancy)
1100     return true;
1101 
1102   return false;
1103 }
1104 
1105 bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1106   if (PressureAfter == PressureBefore)
1107     return false;
1108 
1109   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1110     return true;
1111 
1112   if (mayCauseSpilling(WavesAfter))
1113     return true;
1114 
1115   return false;
1116 }
1117 
1118 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
1119   // If RP is not reduced in the unclustred reschedule stage, revert to the
1120   // old schedule.
1121   if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
1122        mayCauseSpilling(WavesAfter)) ||
1123       GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
1124     LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1125     return true;
1126   }
1127 
1128   // Do not attempt to relax schedule even more if we are already spilling.
1129   if (isRegionWithExcessRP())
1130     return false;
1131 
1132   LLVM_DEBUG(
1133       dbgs()
1134       << "\n\t      *** In shouldRevertScheduling ***\n"
1135       << "      *********** BEFORE UnclusteredHighRPStage ***********\n");
1136   ScheduleMetrics MBefore =
1137       getScheduleMetrics(DAG.SUnits);
1138   LLVM_DEBUG(
1139       dbgs()
1140       << "\n      *********** AFTER UnclusteredHighRPStage ***********\n");
1141   ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1142   unsigned OldMetric = MBefore.getMetric();
1143   unsigned NewMetric = MAfter.getMetric();
1144   unsigned WavesBefore =
1145       std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
1146   unsigned Profit =
1147       ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1148        ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1149        NewMetric) /
1150       ScheduleMetrics::ScaleFactor;
1151   LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1152                     << MAfter << "Profit: " << Profit << "\n");
1153   return Profit < ScheduleMetrics::ScaleFactor;
1154 }
1155 
1156 bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
1157   if (PressureAfter == PressureBefore)
1158     return false;
1159 
1160   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1161     return true;
1162 
1163   if (mayCauseSpilling(WavesAfter))
1164     return true;
1165 
1166   return false;
1167 }
1168 
1169 bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
1170   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1171     return true;
1172 
1173   if (mayCauseSpilling(WavesAfter))
1174     return true;
1175 
1176   return false;
1177 }
1178 
1179 bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1180   if (mayCauseSpilling(WavesAfter))
1181     return true;
1182 
1183   return false;
1184 }
1185 
1186 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1187   if (WavesAfter <= MFI.getMinWavesPerEU() &&
1188       !PressureAfter.less(ST, PressureBefore) &&
1189       isRegionWithExcessRP()) {
1190     LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1191     return true;
1192   }
1193 
1194   return false;
1195 }
1196 
1197 void GCNSchedStage::revertScheduling() {
1198   DAG.RegionsWithMinOcc[RegionIdx] =
1199       PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
1200   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1201   DAG.RescheduleRegions[RegionIdx] =
1202       S.hasNextStage() &&
1203       S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
1204   DAG.RegionEnd = DAG.RegionBegin;
1205   int SkippedDebugInstr = 0;
1206   for (MachineInstr *MI : Unsched) {
1207     if (MI->isDebugInstr()) {
1208       ++SkippedDebugInstr;
1209       continue;
1210     }
1211 
1212     if (MI->getIterator() != DAG.RegionEnd) {
1213       DAG.BB->remove(MI);
1214       DAG.BB->insert(DAG.RegionEnd, MI);
1215       if (!MI->isDebugInstr())
1216         DAG.LIS->handleMove(*MI, true);
1217     }
1218 
1219     // Reset read-undef flags and update them later.
1220     for (auto &Op : MI->all_defs())
1221       Op.setIsUndef(false);
1222     RegisterOperands RegOpers;
1223     RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1224     if (!MI->isDebugInstr()) {
1225       if (DAG.ShouldTrackLaneMasks) {
1226         // Adjust liveness and add missing dead+read-undef flags.
1227         SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
1228         RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
1229       } else {
1230         // Adjust for missing dead-def flags.
1231         RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1232       }
1233     }
1234     DAG.RegionEnd = MI->getIterator();
1235     ++DAG.RegionEnd;
1236     LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1237   }
1238 
1239   // After reverting schedule, debug instrs will now be at the end of the block
1240   // and RegionEnd will point to the first debug instr. Increment RegionEnd
1241   // pass debug instrs to the actual end of the scheduling region.
1242   while (SkippedDebugInstr-- > 0)
1243     ++DAG.RegionEnd;
1244 
1245   // If Unsched.front() instruction is a debug instruction, this will actually
1246   // shrink the region since we moved all debug instructions to the end of the
1247   // block. Find the first instruction that is not a debug instruction.
1248   DAG.RegionBegin = Unsched.front()->getIterator();
1249   if (DAG.RegionBegin->isDebugInstr()) {
1250     for (MachineInstr *MI : Unsched) {
1251       if (MI->isDebugInstr())
1252         continue;
1253       DAG.RegionBegin = MI->getIterator();
1254       break;
1255     }
1256   }
1257 
1258   // Then move the debug instructions back into their correct place and set
1259   // RegionBegin and RegionEnd if needed.
1260   DAG.placeDebugValues();
1261 
1262   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1263 }
1264 
1265 void PreRARematStage::collectRematerializableInstructions() {
1266   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
1267   for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
1268     Register Reg = Register::index2VirtReg(I);
1269     if (!DAG.LIS->hasInterval(Reg))
1270       continue;
1271 
1272     // TODO: Handle AGPR and SGPR rematerialization
1273     if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
1274         !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
1275       continue;
1276 
1277     MachineOperand *Op = DAG.MRI.getOneDef(Reg);
1278     MachineInstr *Def = Op->getParent();
1279     if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
1280       continue;
1281 
1282     MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
1283     if (Def->getParent() == UseI->getParent())
1284       continue;
1285 
1286     // We are only collecting defs that are defined in another block and are
1287     // live-through or used inside regions at MinOccupancy. This means that the
1288     // register must be in the live-in set for the region.
1289     bool AddedToRematList = false;
1290     for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1291       auto It = DAG.LiveIns[I].find(Reg);
1292       if (It != DAG.LiveIns[I].end() && !It->second.none()) {
1293         if (DAG.RegionsWithMinOcc[I]) {
1294           RematerializableInsts[I][Def] = UseI;
1295           AddedToRematList = true;
1296         }
1297 
1298         // Collect regions with rematerializable reg as live-in to avoid
1299         // searching later when updating RP.
1300         RematDefToLiveInRegions[Def].push_back(I);
1301       }
1302     }
1303     if (!AddedToRematList)
1304       RematDefToLiveInRegions.erase(Def);
1305   }
1306 }
1307 
1308 bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
1309                                               const TargetInstrInfo *TII) {
1310   // Temporary copies of cached variables we will be modifying and replacing if
1311   // sinking succeeds.
1312   SmallVector<
1313       std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
1314       NewRegions;
1315   DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
1316   DenseMap<unsigned, GCNRegPressure> NewPressure;
1317   BitVector NewRescheduleRegions;
1318   LiveIntervals *LIS = DAG.LIS;
1319 
1320   NewRegions.resize(DAG.Regions.size());
1321   NewRescheduleRegions.resize(DAG.Regions.size());
1322 
1323   // Collect only regions that has a rematerializable def as a live-in.
1324   SmallSet<unsigned, 16> ImpactedRegions;
1325   for (const auto &It : RematDefToLiveInRegions)
1326     ImpactedRegions.insert(It.second.begin(), It.second.end());
1327 
1328   // Make copies of register pressure and live-ins cache that will be updated
1329   // as we rematerialize.
1330   for (auto Idx : ImpactedRegions) {
1331     NewPressure[Idx] = DAG.Pressure[Idx];
1332     NewLiveIns[Idx] = DAG.LiveIns[Idx];
1333   }
1334   NewRegions = DAG.Regions;
1335   NewRescheduleRegions.reset();
1336 
1337   DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
1338   bool Improved = false;
1339   for (auto I : ImpactedRegions) {
1340     if (!DAG.RegionsWithMinOcc[I])
1341       continue;
1342 
1343     Improved = false;
1344     int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
1345     int SGPRUsage = NewPressure[I].getSGPRNum();
1346 
1347     // TODO: Handle occupancy drop due to AGPR and SGPR.
1348     // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
1349     if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
1350       break;
1351 
1352     // The occupancy of this region could have been improved by a previous
1353     // iteration's sinking of defs.
1354     if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
1355       NewRescheduleRegions[I] = true;
1356       Improved = true;
1357       continue;
1358     }
1359 
1360     // First check if we have enough trivially rematerializable instructions to
1361     // improve occupancy. Optimistically assume all instructions we are able to
1362     // sink decreased RP.
1363     int TotalSinkableRegs = 0;
1364     for (const auto &It : RematerializableInsts[I]) {
1365       MachineInstr *Def = It.first;
1366       Register DefReg = Def->getOperand(0).getReg();
1367       TotalSinkableRegs +=
1368           SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
1369     }
1370     int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
1371     unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
1372     // If in the most optimistic scenario, we cannot improve occupancy, then do
1373     // not attempt to sink any instructions.
1374     if (OptimisticOccupancy <= DAG.MinOccupancy)
1375       break;
1376 
1377     unsigned ImproveOccupancy = 0;
1378     SmallVector<MachineInstr *, 4> SinkedDefs;
1379     for (auto &It : RematerializableInsts[I]) {
1380       MachineInstr *Def = It.first;
1381       MachineBasicBlock::iterator InsertPos =
1382           MachineBasicBlock::iterator(It.second);
1383       Register Reg = Def->getOperand(0).getReg();
1384       // Rematerialize MI to its use block. Since we are only rematerializing
1385       // instructions that do not have any virtual reg uses, we do not need to
1386       // call LiveRangeEdit::allUsesAvailableAt() and
1387       // LiveRangeEdit::canRematerializeAt().
1388       TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
1389                          Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
1390       MachineInstr *NewMI = &*std::prev(InsertPos);
1391       LIS->InsertMachineInstrInMaps(*NewMI);
1392       LIS->removeInterval(Reg);
1393       LIS->createAndComputeVirtRegInterval(Reg);
1394       InsertedMIToOldDef[NewMI] = Def;
1395 
1396       // Update region boundaries in scheduling region we sinked from since we
1397       // may sink an instruction that was at the beginning or end of its region
1398       DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
1399                                  /*Removing =*/true);
1400 
1401       // Update region boundaries in region we sinked to.
1402       DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
1403 
1404       LaneBitmask PrevMask = NewLiveIns[I][Reg];
1405       // FIXME: Also update cached pressure for where the def was sinked from.
1406       // Update RP for all regions that has this reg as a live-in and remove
1407       // the reg from all regions as a live-in.
1408       for (auto Idx : RematDefToLiveInRegions[Def]) {
1409         NewLiveIns[Idx].erase(Reg);
1410         if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
1411           // Def is live-through and not used in this block.
1412           NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
1413         } else {
1414           // Def is used and rematerialized into this block.
1415           GCNDownwardRPTracker RPT(*LIS);
1416           auto *NonDbgMI = &*skipDebugInstructionsForward(
1417               NewRegions[Idx].first, NewRegions[Idx].second);
1418           RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
1419           RPT.advance(NewRegions[Idx].second);
1420           NewPressure[Idx] = RPT.moveMaxPressure();
1421         }
1422       }
1423 
1424       SinkedDefs.push_back(Def);
1425       ImproveOccupancy = NewPressure[I].getOccupancy(ST);
1426       if (ImproveOccupancy > DAG.MinOccupancy)
1427         break;
1428     }
1429 
1430     // Remove defs we just sinked from all regions' list of sinkable defs
1431     for (auto &Def : SinkedDefs)
1432       for (auto TrackedIdx : RematDefToLiveInRegions[Def])
1433         RematerializableInsts[TrackedIdx].erase(Def);
1434 
1435     if (ImproveOccupancy <= DAG.MinOccupancy)
1436       break;
1437 
1438     NewRescheduleRegions[I] = true;
1439     Improved = true;
1440   }
1441 
1442   if (!Improved) {
1443     // Occupancy was not improved for all regions that were at MinOccupancy.
1444     // Undo sinking and remove newly rematerialized instructions.
1445     for (auto &Entry : InsertedMIToOldDef) {
1446       MachineInstr *MI = Entry.first;
1447       MachineInstr *OldMI = Entry.second;
1448       Register Reg = MI->getOperand(0).getReg();
1449       LIS->RemoveMachineInstrFromMaps(*MI);
1450       MI->eraseFromParent();
1451       OldMI->clearRegisterDeads(Reg);
1452       LIS->removeInterval(Reg);
1453       LIS->createAndComputeVirtRegInterval(Reg);
1454     }
1455     return false;
1456   }
1457 
1458   // Occupancy was improved for all regions.
1459   for (auto &Entry : InsertedMIToOldDef) {
1460     MachineInstr *MI = Entry.first;
1461     MachineInstr *OldMI = Entry.second;
1462 
1463     // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
1464     DAG.BBLiveInMap.erase(OldMI);
1465 
1466     // Remove OldMI and update LIS
1467     Register Reg = MI->getOperand(0).getReg();
1468     LIS->RemoveMachineInstrFromMaps(*OldMI);
1469     OldMI->eraseFromParent();
1470     LIS->removeInterval(Reg);
1471     LIS->createAndComputeVirtRegInterval(Reg);
1472   }
1473 
1474   // Update live-ins, register pressure, and regions caches.
1475   for (auto Idx : ImpactedRegions) {
1476     DAG.LiveIns[Idx] = NewLiveIns[Idx];
1477     DAG.Pressure[Idx] = NewPressure[Idx];
1478     DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
1479   }
1480   DAG.Regions = NewRegions;
1481   DAG.RescheduleRegions = NewRescheduleRegions;
1482 
1483   SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
1484   MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
1485 
1486   return true;
1487 }
1488 
1489 // Copied from MachineLICM
1490 bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
1491   if (!DAG.TII->isTriviallyReMaterializable(MI))
1492     return false;
1493 
1494   for (const MachineOperand &MO : MI.all_uses())
1495     if (MO.getReg().isVirtual())
1496       return false;
1497 
1498   return true;
1499 }
1500 
1501 // When removing, we will have to check both beginning and ending of the region.
1502 // When inserting, we will only have to check if we are inserting NewMI in front
1503 // of a scheduling region and do not need to check the ending since we will only
1504 // ever be inserting before an already existing MI.
1505 void GCNScheduleDAGMILive::updateRegionBoundaries(
1506     SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
1507                               MachineBasicBlock::iterator>> &RegionBoundaries,
1508     MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
1509   unsigned I = 0, E = RegionBoundaries.size();
1510   // Search for first region of the block where MI is located
1511   while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
1512     ++I;
1513 
1514   for (; I != E; ++I) {
1515     if (MI->getParent() != RegionBoundaries[I].first->getParent())
1516       return;
1517 
1518     if (Removing && MI == RegionBoundaries[I].first &&
1519         MI == RegionBoundaries[I].second) {
1520       // MI is in a region with size 1, after removing, the region will be
1521       // size 0, set RegionBegin and RegionEnd to pass end of block iterator.
1522       RegionBoundaries[I] =
1523           std::pair(MI->getParent()->end(), MI->getParent()->end());
1524       return;
1525     }
1526     if (MI == RegionBoundaries[I].first) {
1527       if (Removing)
1528         RegionBoundaries[I] =
1529             std::pair(std::next(MI), RegionBoundaries[I].second);
1530       else
1531         // Inserted NewMI in front of region, set new RegionBegin to NewMI
1532         RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
1533                                         RegionBoundaries[I].second);
1534       return;
1535     }
1536     if (Removing && MI == RegionBoundaries[I].second) {
1537       RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
1538       return;
1539     }
1540   }
1541 }
1542 
1543 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
1544   return std::any_of(
1545       DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) {
1546         unsigned Opc = MI->getOpcode();
1547         return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
1548       });
1549 }
1550 
1551 GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
1552     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
1553     bool RemoveKillFlags)
1554     : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
1555 
1556 void GCNPostScheduleDAGMILive::schedule() {
1557   HasIGLPInstrs = hasIGLPInstrs(this);
1558   if (HasIGLPInstrs) {
1559     SavedMutations.clear();
1560     SavedMutations.swap(Mutations);
1561     addMutation(createIGroupLPDAGMutation());
1562   }
1563 
1564   ScheduleDAGMI::schedule();
1565 }
1566 
1567 void GCNPostScheduleDAGMILive::finalizeSchedule() {
1568   if (HasIGLPInstrs)
1569     SavedMutations.swap(Mutations);
1570 
1571   ScheduleDAGMI::finalizeSchedule();
1572 }
1573