xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This contains a MachineSchedStrategy implementation for maximizing wave
11 /// occupancy on GCN hardware.
12 ///
13 /// This pass will apply multiple scheduling stages to the same function.
14 /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15 /// entry point for the scheduling of those regions is
16 /// GCNScheduleDAGMILive::runSchedStages.
17 ///
18 /// Generally, the reason for having multiple scheduling stages is to account
19 /// for the kernel-wide effect of register usage on occupancy.  Usually, only a
20 /// few scheduling regions will have register pressure high enough to limit
21 /// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22 /// other regions.
23 ///
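/// For the default occupancy-oriented strategy the stages are, in order: an
/// initial occupancy schedule, an unclustered high-register-pressure
/// reschedule, a clustered low-occupancy reschedule, and a pre-RA
/// rematerialization stage (see the GCNMaxOccupancySchedStrategy constructor
/// below). The ILP and memory-clause strategies run a single initial stage.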
24 //===----------------------------------------------------------------------===//
25 
26 #include "GCNSchedStrategy.h"
27 #include "AMDGPUIGroupLP.h"
28 #include "GCNRegPressure.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/STLExtras.h"
32 #include "llvm/CodeGen/RegisterClassInfo.h"
33 #include "llvm/MC/LaneBitmask.h"
34 #include "llvm/Support/ErrorHandling.h"
35 
36 #define DEBUG_TYPE "machine-scheduler"
37 
38 using namespace llvm;
39 
40 static cl::opt<bool> DisableUnclusterHighRP(
41     "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
42     cl::desc("Disable unclustered high register pressure "
43              "reduction scheduling stage."),
44     cl::init(false));
45 
46 static cl::opt<bool> DisableClusteredLowOccupancy(
47     "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
48     cl::desc("Disable clustered low occupancy "
49              "rescheduling for ILP scheduling stage."),
50     cl::init(false));
51 
52 static cl::opt<unsigned> ScheduleMetricBias(
53     "amdgpu-schedule-metric-bias", cl::Hidden,
54     cl::desc(
55         "Sets the bias which adds weight to occupancy vs latency. Set it to "
56         "100 to chase the occupancy only."),
57     cl::init(10));
58 
59 static cl::opt<bool>
60     RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
61                cl::desc("Relax occupancy targets for kernels which are memory "
62                         "bound (amdgpu-membound-threshold), or "
63                         "Wave Limited (amdgpu-limit-wave-threshold)."),
64                cl::init(false));
65 
66 static cl::opt<bool> GCNTrackers(
67     "amdgpu-use-amdgpu-trackers", cl::Hidden,
68     cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
69     cl::init(false));
70 
71 const unsigned ScheduleMetrics::ScaleFactor = 100;
72 
73 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
74     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
75       DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
76 }
77 
78 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
79   GenericScheduler::initialize(DAG);
80 
81   MF = &DAG->MF;
82 
83   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
84 
85   SGPRExcessLimit =
86       Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
87   VGPRExcessLimit =
88       Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
89 
90   SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
91   // Set the initial TargetOccupancy to the maximum occupancy that we can
92   // achieve for this function. This effectively sets a lower bound on the
93   // 'Critical' register limits in the scheduler.
94   // Allow for lower occupancy targets if the kernel is wave limited or memory
95   // bound and the relaxed occupancy feature is in use.
96   TargetOccupancy =
97       RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
98   SGPRCriticalLimit =
99       std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
100 
101   if (!KnownExcessRP) {
102     VGPRCriticalLimit = std::min(
103         ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
104         VGPRExcessLimit);
105   } else {
106     // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
107     // it returns a reasonably small number for targets with lots of VGPRs, such
108     // as GFX10 and GFX11.
109     LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
110                          "VGPRCriticalLimit calculation method.\n");
111     unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
112     unsigned Granule =
113         AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
114     unsigned Addressable =
115         AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
116     unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
117     VGPRBudget = std::max(VGPRBudget, Granule);
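    // Illustrative only (numbers assumed, not from any particular target): with
    // Addressable = 256, TargetOccupancy = 8, and Granule = 4, the budget is
    // alignDown(256 / 8, 4) = 32 VGPRs before the cap against VGPRExcessLimit
    // below.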
118     VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
119   }
120 
121   // Subtract error margin and bias from register limits and avoid overflow.
122   SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
123   VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
124   SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
125   VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);
126 
127   LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
128                     << ", VGPRExcessLimit = " << VGPRExcessLimit
129                     << ", SGPRCriticalLimit = " << SGPRCriticalLimit
130                     << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
131 }
132 
133 /// Checks whether \p SU can use the cached DAG pressure diffs to compute the
134 /// current register pressure.
135 ///
136 /// This works for the common case, but it has a few exceptions that have been
137 /// observed through trial and error:
138 ///   - Explicit physical register operands
139 ///   - Subregister definitions
140 ///
141 /// In both of those cases, PressureDiff doesn't represent the actual pressure,
142 /// and querying LiveIntervals through the RegPressureTracker is needed to get
143 /// an accurate value.
144 ///
145 /// We should eventually only use PressureDiff for maximum performance, but this
146 /// already allows 80% of SUs to take the fast path without changing scheduling
147 /// at all. Further changes would either change scheduling, or require a lot
148 /// more logic to recover an accurate pressure estimate from the PressureDiffs.
149 static bool canUsePressureDiffs(const SUnit &SU) {
150   if (!SU.isInstr())
151     return false;
152 
153   // Cannot use pressure diffs for subregister defs or with physregs; it's
154   // imprecise in both cases.
155   for (const auto &Op : SU.getInstr()->operands()) {
156     if (!Op.isReg() || Op.isImplicit())
157       continue;
158     if (Op.getReg().isPhysical() ||
159         (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
160       return false;
161   }
162   return true;
163 }
164 
165 static void getRegisterPressures(
166     bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
167     std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
168     GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
169     ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
170   // getDownwardPressure() and getUpwardPressure() make temporary changes to
171   // the tracker, so we need to pass those functions a non-const reference.
172   RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
173   if (!GCNTrackers) {
174     AtTop
175         ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
176         : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
177 
178     return;
179   }
180 
181   // GCNTrackers
182   Pressure.resize(4, 0);
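  // Only the SReg_32, VGPR_32 and AGPR_32 pressure-set slots (indexed by
  // AMDGPU::RegisterPressureSets) are filled in below.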
183   MachineInstr *MI = SU->getInstr();
184   GCNRegPressure NewPressure;
185   if (AtTop) {
186     GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
187     NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
188   } else {
189     GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
190     TempUpwardTracker.recede(*MI);
191     NewPressure = TempUpwardTracker.getPressure();
192   }
193   Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
194   Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
195       NewPressure.getArchVGPRNum();
196   Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
197 }
198 
199 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
200                                      bool AtTop,
201                                      const RegPressureTracker &RPTracker,
202                                      const SIRegisterInfo *SRI,
203                                      unsigned SGPRPressure,
204                                      unsigned VGPRPressure, bool IsBottomUp) {
205   Cand.SU = SU;
206   Cand.AtTop = AtTop;
207 
208   if (!DAG->isTrackingPressure())
209     return;
210 
211   Pressure.clear();
212   MaxPressure.clear();
213 
214   // We try to use the cached PressureDiffs in the ScheduleDAG whenever
215   // possible over querying the RegPressureTracker.
216   //
217   // RegPressureTracker will make a lot of LIS queries which are very
218   // expensive; it is considered a slow function in this context.
219   //
220   // PressureDiffs are precomputed and cached, and getPressureDiff is just a
221   // trivial lookup into an array. It is pretty much free.
222   //
223   // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
224   // PressureDiffs.
225   if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
226     getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
227                          DownwardTracker, UpwardTracker, DAG, SRI);
228   } else {
229     // Reserve 4 slots.
230     Pressure.resize(4, 0);
231     Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
232     Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
233 
234     for (const auto &Diff : DAG->getPressureDiff(SU)) {
235       if (!Diff.isValid())
236         continue;
237       // PressureDiffs is always bottom-up so if we're working top-down we need
238       // to invert its sign.
239       Pressure[Diff.getPSet()] +=
240           (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
241     }
242 
243 #ifdef EXPENSIVE_CHECKS
244     std::vector<unsigned> CheckPressure, CheckMaxPressure;
245     getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
246                          DownwardTracker, UpwardTracker, DAG, SRI);
247     if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
248             CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
249         Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
250             CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
251       errs() << "Register Pressure is inaccurate when calculated through "
252                 "PressureDiff\n"
253              << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
254              << ", expected "
255              << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
256              << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
257              << ", expected "
258              << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
259       report_fatal_error("inaccurate register pressure calculation");
260     }
261 #endif
262   }
263 
264   unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
265   unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
266 
267   // If two instructions increase the pressure of different register sets
268   // by the same amount, the generic scheduler will prefer to schedule the
269   // instruction that increases the set with the least amount of registers,
270   // which in our case would be SGPRs.  This is rarely what we want, so
271   // when we report excess/critical register pressure, we do it either
272   // only for VGPRs or only for SGPRs.
273 
274   // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
275   const unsigned MaxVGPRPressureInc = 16;
276   bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
277   bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
278 
279   // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
280   // to increase the likelihood we don't go over the limits.  We should improve
281   // the analysis to look through dependencies to find the path with the least
282   // register pressure.
283 
284   // We only need to update the RPDelta for instructions that increase register
285   // pressure. Instructions that decrease or keep reg pressure the same will be
286   // marked as RegExcess in tryCandidate() when they are compared with
287   // instructions that increase the register pressure.
288   if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
289     HasHighPressure = true;
290     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
291     Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
292   }
293 
294   if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
295     HasHighPressure = true;
296     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
297     Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
298   }
299 
300   // Register pressure is considered 'CRITICAL' if it is approaching a value
301   // that would reduce the wave occupancy for the execution unit.  When
302   // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
303   // has the same cost, so we don't need to prefer one over the other.
304 
305   int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
306   int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
307 
308   if (SGPRDelta >= 0 || VGPRDelta >= 0) {
309     HasHighPressure = true;
310     if (SGPRDelta > VGPRDelta) {
311       Cand.RPDelta.CriticalMax =
312           PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
313       Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
314     } else {
315       Cand.RPDelta.CriticalMax =
316           PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
317       Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
318     }
319   }
320 }
321 
322 // This function is mostly cut and pasted from
323 // GenericScheduler::pickNodeFromQueue()
324 void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
325                                          const CandPolicy &ZonePolicy,
326                                          const RegPressureTracker &RPTracker,
327                                          SchedCandidate &Cand,
328                                          bool IsBottomUp) {
329   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
330   ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
331   unsigned SGPRPressure = 0;
332   unsigned VGPRPressure = 0;
333   if (DAG->isTrackingPressure()) {
334     if (!GCNTrackers) {
335       SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
336       VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
337     } else {
338       GCNRPTracker *T = IsBottomUp
339                             ? static_cast<GCNRPTracker *>(&UpwardTracker)
340                             : static_cast<GCNRPTracker *>(&DownwardTracker);
341       SGPRPressure = T->getPressure().getSGPRNum();
342       VGPRPressure = T->getPressure().getArchVGPRNum();
343     }
344   }
345   ReadyQueue &Q = Zone.Available;
346   for (SUnit *SU : Q) {
347 
348     SchedCandidate TryCand(ZonePolicy);
349     initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
350                   VGPRPressure, IsBottomUp);
351     // Pass SchedBoundary only when comparing nodes from the same boundary.
352     SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
353     tryCandidate(Cand, TryCand, ZoneArg);
354     if (TryCand.Reason != NoCand) {
355       // Initialize resource delta if needed in case future heuristics query it.
356       if (TryCand.ResDelta == SchedResourceDelta())
357         TryCand.initResourceDelta(Zone.DAG, SchedModel);
358       Cand.setBest(TryCand);
359       LLVM_DEBUG(traceCandidate(Cand));
360     }
361   }
362 }
363 
364 // This function is mostly cut and pasted from
365 // GenericScheduler::pickNodeBidirectional()
366 SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
367   // Schedule as far as possible in the direction of no choice. This is most
368   // efficient, but also provides the best heuristics for CriticalPSets.
369   if (SUnit *SU = Bot.pickOnlyChoice()) {
370     IsTopNode = false;
371     return SU;
372   }
373   if (SUnit *SU = Top.pickOnlyChoice()) {
374     IsTopNode = true;
375     return SU;
376   }
377   // Set the bottom-up policy based on the state of the current bottom zone and
378   // the instructions outside the zone, including the top zone.
379   CandPolicy BotPolicy;
380   setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
381   // Set the top-down policy based on the state of the current top zone and
382   // the instructions outside the zone, including the bottom zone.
383   CandPolicy TopPolicy;
384   setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
385 
386   // See if BotCand is still valid (because we previously scheduled from Top).
387   LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
388   if (!BotCand.isValid() || BotCand.SU->isScheduled ||
389       BotCand.Policy != BotPolicy) {
390     BotCand.reset(CandPolicy());
391     pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
392                       /*IsBottomUp=*/true);
393     assert(BotCand.Reason != NoCand && "failed to find the first candidate");
394   } else {
395     LLVM_DEBUG(traceCandidate(BotCand));
396 #ifndef NDEBUG
397     if (VerifyScheduling) {
398       SchedCandidate TCand;
399       TCand.reset(CandPolicy());
400       pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
401                         /*IsBottomUp=*/true);
402       assert(TCand.SU == BotCand.SU &&
403              "Last pick result should correspond to re-picking right now");
404     }
405 #endif
406   }
407 
408   // Check if the top Q has a better candidate.
409   LLVM_DEBUG(dbgs() << "Picking from Top:\n");
410   if (!TopCand.isValid() || TopCand.SU->isScheduled ||
411       TopCand.Policy != TopPolicy) {
412     TopCand.reset(CandPolicy());
413     pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
414                       /*IsBottomUp=*/false);
415     assert(TopCand.Reason != NoCand && "failed to find the first candidate");
416   } else {
417     LLVM_DEBUG(traceCandidate(TopCand));
418 #ifndef NDEBUG
419     if (VerifyScheduling) {
420       SchedCandidate TCand;
421       TCand.reset(CandPolicy());
422       pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
423                         /*IsBottomUp=*/false);
424       assert(TCand.SU == TopCand.SU &&
425              "Last pick result should correspond to re-picking right now");
426     }
427 #endif
428   }
429 
430   // Pick best from BotCand and TopCand.
431   LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
432              dbgs() << "Bot Cand: "; traceCandidate(BotCand););
433   SchedCandidate Cand = BotCand;
434   TopCand.Reason = NoCand;
435   tryCandidate(Cand, TopCand, nullptr);
436   if (TopCand.Reason != NoCand) {
437     Cand.setBest(TopCand);
438   }
439   LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
440 
441   IsTopNode = Cand.AtTop;
442   return Cand.SU;
443 }
444 
445 // This function is mostly cut and pasted from
446 // GenericScheduler::pickNode()
447 SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
448   if (DAG->top() == DAG->bottom()) {
449     assert(Top.Available.empty() && Top.Pending.empty() &&
450            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
451     return nullptr;
452   }
453   SUnit *SU;
454   do {
455     if (RegionPolicy.OnlyTopDown) {
456       SU = Top.pickOnlyChoice();
457       if (!SU) {
458         CandPolicy NoPolicy;
459         TopCand.reset(NoPolicy);
460         pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
461                           /*IsBottomUp=*/false);
462         assert(TopCand.Reason != NoCand && "failed to find a candidate");
463         SU = TopCand.SU;
464       }
465       IsTopNode = true;
466     } else if (RegionPolicy.OnlyBottomUp) {
467       SU = Bot.pickOnlyChoice();
468       if (!SU) {
469         CandPolicy NoPolicy;
470         BotCand.reset(NoPolicy);
471         pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
472                           /*IsBottomUp=*/true);
473         assert(BotCand.Reason != NoCand && "failed to find a candidate");
474         SU = BotCand.SU;
475       }
476       IsTopNode = false;
477     } else {
478       SU = pickNodeBidirectional(IsTopNode);
479     }
480   } while (SU->isScheduled);
481 
482   if (SU->isTopReady())
483     Top.removeReady(SU);
484   if (SU->isBottomReady())
485     Bot.removeReady(SU);
486 
487   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
488                     << *SU->getInstr());
489   return SU;
490 }
491 
492 void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
493   if (GCNTrackers) {
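    // Keep the AMDGPU-specific trackers in sync with the direction in which
    // the node was scheduled.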
494     MachineInstr *MI = SU->getInstr();
495     IsTopNode ? (void)DownwardTracker.advance(MI, false)
496               : UpwardTracker.recede(*MI);
497   }
498 
499   return GenericScheduler::schedNode(SU, IsTopNode);
500 }
501 
502 GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
503   assert(CurrentStage && CurrentStage != SchedStages.end());
504   return *CurrentStage;
505 }
506 
507 bool GCNSchedStrategy::advanceStage() {
508   assert(CurrentStage != SchedStages.end());
509   if (!CurrentStage)
510     CurrentStage = SchedStages.begin();
511   else
512     CurrentStage++;
513 
514   return CurrentStage != SchedStages.end();
515 }
516 
517 bool GCNSchedStrategy::hasNextStage() const {
518   assert(CurrentStage);
519   return std::next(CurrentStage) != SchedStages.end();
520 }
521 
522 GCNSchedStageID GCNSchedStrategy::getNextStage() const {
523   assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
524   return *std::next(CurrentStage);
525 }
526 
527 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
528     const MachineSchedContext *C, bool IsLegacyScheduler)
529     : GCNSchedStrategy(C) {
530   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
531   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
532   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
533   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
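  // The AMDGPU-specific trackers are disabled under the legacy scheduler even
  // if they were requested on the command line.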
534   GCNTrackers = GCNTrackers & !IsLegacyScheduler;
535 }
536 
537 GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
538     : GCNSchedStrategy(C) {
539   SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
540 }
541 
542 bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
543                                           SchedCandidate &TryCand,
544                                           SchedBoundary *Zone) const {
545   // Initialize the candidate if needed.
546   if (!Cand.isValid()) {
547     TryCand.Reason = NodeOrder;
548     return true;
549   }
550 
551   // Avoid spilling by exceeding the register limit.
552   if (DAG->isTrackingPressure() &&
553       tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
554                   RegExcess, TRI, DAG->MF))
555     return TryCand.Reason != NoCand;
556 
557   // Bias PhysReg defs and copies toward their uses and defs, respectively.
558   if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
559                  biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
560     return TryCand.Reason != NoCand;
561 
562   bool SameBoundary = Zone != nullptr;
563   if (SameBoundary) {
564     // Prioritize instructions that read unbuffered resources by stall cycles.
565     if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
566                 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
567       return TryCand.Reason != NoCand;
568 
569     // Avoid critical resource consumption and balance the schedule.
570     TryCand.initResourceDelta(DAG, SchedModel);
571     if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
572                 TryCand, Cand, ResourceReduce))
573       return TryCand.Reason != NoCand;
574     if (tryGreater(TryCand.ResDelta.DemandedResources,
575                    Cand.ResDelta.DemandedResources, TryCand, Cand,
576                    ResourceDemand))
577       return TryCand.Reason != NoCand;
578 
579     // Unconditionally try to reduce latency.
580     if (tryLatency(TryCand, Cand, *Zone))
581       return TryCand.Reason != NoCand;
582 
583     // Weak edges are for clustering and other constraints.
584     if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
585                 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
586       return TryCand.Reason != NoCand;
587   }
588 
589   // Keep clustered nodes together to encourage downstream peephole
590   // optimizations which may reduce resource requirements.
591   //
592   // This is a best effort to set things up for a post-RA pass. Optimizations
593   // like generating loads of multiple registers should ideally be done within
594   // the scheduler pass by combining the loads during DAG postprocessing.
595   const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
596   const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
597   if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
598                  CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
599                  Cluster))
600     return TryCand.Reason != NoCand;
601 
602   // Avoid increasing the max critical pressure in the scheduled region.
603   if (DAG->isTrackingPressure() &&
604       tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
605                   TryCand, Cand, RegCritical, TRI, DAG->MF))
606     return TryCand.Reason != NoCand;
607 
608   // Avoid increasing the max pressure of the entire region.
609   if (DAG->isTrackingPressure() &&
610       tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
611                   Cand, RegMax, TRI, DAG->MF))
612     return TryCand.Reason != NoCand;
613 
614   if (SameBoundary) {
615     // Fall through to original instruction order.
616     if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
617         (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
618       TryCand.Reason = NodeOrder;
619       return true;
620     }
621   }
622   return false;
623 }
624 
625 GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
626     const MachineSchedContext *C)
627     : GCNSchedStrategy(C) {
628   SchedStages.push_back(GCNSchedStageID::MemoryClauseInitialSchedule);
629 }
630 
631 /// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory instructions
632 /// as much as possible. This is achieved by:
633 ///  1. Prioritizing clustered operations before the stall latency heuristic.
634 ///  2. Prioritizing long-latency loads before the stall latency heuristic.
635 ///
636 /// \param Cand provides the policy and current best candidate.
637 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
638 /// \param Zone describes the scheduled zone that we are extending, or nullptr
639 ///             if Cand is from a different zone than TryCand.
640 /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
641 bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
642                                                    SchedCandidate &TryCand,
643                                                    SchedBoundary *Zone) const {
644   // Initialize the candidate if needed.
645   if (!Cand.isValid()) {
646     TryCand.Reason = NodeOrder;
647     return true;
648   }
649 
650   // Bias PhysReg defs and copies toward their uses and defs, respectively.
651   if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
652                  biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
653     return TryCand.Reason != NoCand;
654 
655   if (DAG->isTrackingPressure()) {
656     // Avoid exceeding the target's limit.
657     if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
658                     RegExcess, TRI, DAG->MF))
659       return TryCand.Reason != NoCand;
660 
661     // Avoid increasing the max critical pressure in the scheduled region.
662     if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
663                     TryCand, Cand, RegCritical, TRI, DAG->MF))
664       return TryCand.Reason != NoCand;
665   }
666 
667   // MaxMemoryClause-specific: We prioritize clustered instructions as we would
668   // get more benefit from clausing these memory instructions.
669   const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
670   const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
671   if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
672                  CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
673                  Cluster))
674     return TryCand.Reason != NoCand;
675 
676   // We only compare a subset of features when comparing nodes between
677   // Top and Bottom boundary. Some properties are simply incomparable; in many
678   // other instances we should only override the other boundary if something
679   // is a clearly good pick on one boundary. Skip heuristics that are more
680   // "tie-breaking" in nature.
681   bool SameBoundary = Zone != nullptr;
682   if (SameBoundary) {
683     // For loops that are acyclic path limited, aggressively schedule for
684     // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
685     // heuristics to take precedence.
686     if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
687         tryLatency(TryCand, Cand, *Zone))
688       return TryCand.Reason != NoCand;
689 
690     // MaxMemoryClause-specific: Prioritize long latency memory load
691     // instructions in top-bottom order to hide more latency. The mayLoad check
692     // is used to exclude store-like instructions, which we do not want to
693     // schedule too early.
694     bool TryMayLoad =
695         TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
696     bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
697 
698     if (TryMayLoad || CandMayLoad) {
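      // Treat a candidate as long-latency only when its latency exceeds the
      // other candidate's latency by more than a factor of 10.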
699       bool TryLongLatency =
700           TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
701       bool CandLongLatency =
702           10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
703 
704       if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
705                      Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
706                      Cand, Stall))
707         return TryCand.Reason != NoCand;
708     }
709     // Prioritize instructions that read unbuffered resources by stall cycles.
710     if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
711                 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
712       return TryCand.Reason != NoCand;
713   }
714 
715   if (SameBoundary) {
716     // Weak edges are for clustering and other constraints.
717     if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
718                 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
719       return TryCand.Reason != NoCand;
720   }
721 
722   // Avoid increasing the max pressure of the entire region.
723   if (DAG->isTrackingPressure() &&
724       tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
725                   Cand, RegMax, TRI, DAG->MF))
726     return TryCand.Reason != NoCand;
727 
728   if (SameBoundary) {
729     // Avoid critical resource consumption and balance the schedule.
730     TryCand.initResourceDelta(DAG, SchedModel);
731     if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
732                 TryCand, Cand, ResourceReduce))
733       return TryCand.Reason != NoCand;
734     if (tryGreater(TryCand.ResDelta.DemandedResources,
735                    Cand.ResDelta.DemandedResources, TryCand, Cand,
736                    ResourceDemand))
737       return TryCand.Reason != NoCand;
738 
739     // Avoid serializing long latency dependence chains.
740     // For acyclic path limited loops, latency was already checked above.
741     if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
742         !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
743       return TryCand.Reason != NoCand;
744 
745     // Fall through to original instruction order.
746     if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
747       assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
748       TryCand.Reason = NodeOrder;
749       return true;
750     }
751   }
752 
753   return false;
754 }
755 
756 GCNScheduleDAGMILive::GCNScheduleDAGMILive(
757     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
758     : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
759       MFI(*MF.getInfo<SIMachineFunctionInfo>()),
760       StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
761       RegionLiveOuts(this, /*IsLiveOut=*/true) {
762 
763   // We want regions with a single MI to be scheduled so that we can reason
764   // about them correctly during scheduling stages that move MIs between regions
765   // (e.g., rematerialization).
766   ScheduleSingleMIRegions = true;
767   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
768   if (RelaxedOcc) {
769     MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
770     if (MinOccupancy != StartingOccupancy)
771       LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
772                         << ".\n");
773   }
774 }
775 
776 std::unique_ptr<GCNSchedStage>
777 GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
778   switch (SchedStageID) {
779   case GCNSchedStageID::OccInitialSchedule:
780     return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
781   case GCNSchedStageID::UnclusteredHighRPReschedule:
782     return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
783   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
784     return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
785   case GCNSchedStageID::PreRARematerialize:
786     return std::make_unique<PreRARematStage>(SchedStageID, *this);
787   case GCNSchedStageID::ILPInitialSchedule:
788     return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
789   case GCNSchedStageID::MemoryClauseInitialSchedule:
790     return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
791                                                               *this);
792   }
793 
794   llvm_unreachable("Unknown SchedStageID.");
795 }
796 
797 void GCNScheduleDAGMILive::schedule() {
798   // Collect all scheduling regions. The actual scheduling is performed in
799   // GCNScheduleDAGMILive::finalizeSchedule.
800   Regions.push_back(std::pair(RegionBegin, RegionEnd));
801 }
802 
803 GCNRegPressure
804 GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
805   GCNDownwardRPTracker RPTracker(*LIS);
806   RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
807   return RPTracker.moveMaxPressure();
808 }
809 
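// Returns the instruction used to key a region's live-outs: the first
// non-debug instruction at or before RegionEnd, stepping back once when
// RegionEnd is the parent block's end iterator.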
810 static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
811                                         MachineBasicBlock::iterator RegionEnd) {
812   auto REnd = RegionEnd == RegionBegin->getParent()->end()
813                   ? std::prev(RegionEnd)
814                   : RegionEnd;
815   return &*skipDebugInstructionsBackward(REnd, RegionBegin);
816 }
817 
818 void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
819                                                 const MachineBasicBlock *MBB) {
820   GCNDownwardRPTracker RPTracker(*LIS);
821 
822   // If the block has only one successor, then the live-ins of that successor
823   // are the live-outs of the current block. We can reuse the calculated live
824   // set if the successor will be sent to scheduling after the current block.
825 
826   // However, due to a bug in LiveInterval analysis it may happen that two
827   // predecessors of the same successor block have different lane bitmasks for
828   // a live-out register. Work around that by sticking to a one-to-one
829   // relationship, i.e. one predecessor with one successor block.
830   const MachineBasicBlock *OnlySucc = nullptr;
831   if (MBB->succ_size() == 1) {
832     auto *Candidate = *MBB->succ_begin();
833     if (!Candidate->empty() && Candidate->pred_size() == 1) {
834       SlotIndexes *Ind = LIS->getSlotIndexes();
835       if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
836         OnlySucc = Candidate;
837     }
838   }
839 
840   // Scheduler sends regions from the end of the block upwards.
841   size_t CurRegion = RegionIdx;
842   for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
843     if (Regions[CurRegion].first->getParent() != MBB)
844       break;
845   --CurRegion;
846 
847   auto I = MBB->begin();
848   auto LiveInIt = MBBLiveIns.find(MBB);
849   auto &Rgn = Regions[CurRegion];
850   auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
851   if (LiveInIt != MBBLiveIns.end()) {
852     auto LiveIn = std::move(LiveInIt->second);
853     RPTracker.reset(*MBB->begin(), &LiveIn);
854     MBBLiveIns.erase(LiveInIt);
855   } else {
856     I = Rgn.first;
857     auto LRS = BBLiveInMap.lookup(NonDbgMI);
858 #ifdef EXPENSIVE_CHECKS
859     assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
860 #endif
861     RPTracker.reset(*I, &LRS);
862   }
863 
864   for (;;) {
865     I = RPTracker.getNext();
866 
867     if (Regions[CurRegion].first == I || NonDbgMI == I) {
868       LiveIns[CurRegion] = RPTracker.getLiveRegs();
869       RPTracker.clearMaxPressure();
870     }
871 
872     if (Regions[CurRegion].second == I) {
873       Pressure[CurRegion] = RPTracker.moveMaxPressure();
874       if (CurRegion-- == RegionIdx)
875         break;
876       auto &Rgn = Regions[CurRegion];
877       NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
878     }
879     RPTracker.advanceToNext();
880     RPTracker.advanceBeforeNext();
881   }
882 
883   if (OnlySucc) {
884     if (I != MBB->end()) {
885       RPTracker.advanceToNext();
886       RPTracker.advance(MBB->end());
887     }
888     RPTracker.advanceBeforeNext();
889     MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
890   }
891 }
892 
893 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
894 GCNScheduleDAGMILive::getRegionLiveInMap() const {
895   assert(!Regions.empty());
896   std::vector<MachineInstr *> RegionFirstMIs;
897   RegionFirstMIs.reserve(Regions.size());
898   auto I = Regions.rbegin(), E = Regions.rend();
899   do {
900     const MachineBasicBlock *MBB = I->first->getParent();
901     auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
902     RegionFirstMIs.push_back(MI);
903     do {
904       ++I;
905     } while (I != E && I->first->getParent() == MBB);
906   } while (I != E);
907   return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
908 }
909 
910 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
911 GCNScheduleDAGMILive::getRegionLiveOutMap() const {
912   assert(!Regions.empty());
913   std::vector<MachineInstr *> RegionLastMIs;
914   RegionLastMIs.reserve(Regions.size());
915   for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
916     RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
917 
918   return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
919 }
920 
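// Recomputes the per-region live register sets and records, for each region
// index, the instruction whose live-ins (or live-outs) key that set.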
921 void RegionPressureMap::buildLiveRegMap() {
922   IdxToInstruction.clear();
923 
924   RegionLiveRegMap =
925       IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
926   for (unsigned I = 0; I < DAG->Regions.size(); I++) {
927     MachineInstr *RegionKey =
928         IsLiveOut
929             ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
930             : &*DAG->Regions[I].first;
931     IdxToInstruction[I] = RegionKey;
932   }
933 }
934 
935 void GCNScheduleDAGMILive::finalizeSchedule() {
936   // Start actual scheduling here. This function is called by the base
937   // MachineScheduler after all regions have been recorded by
938   // GCNScheduleDAGMILive::schedule().
939   LiveIns.resize(Regions.size());
940   Pressure.resize(Regions.size());
941   RegionsWithHighRP.resize(Regions.size());
942   RegionsWithExcessRP.resize(Regions.size());
943   RegionsWithMinOcc.resize(Regions.size());
944   RegionsWithIGLPInstrs.resize(Regions.size());
945   RegionsWithHighRP.reset();
946   RegionsWithExcessRP.reset();
947   RegionsWithMinOcc.reset();
948   RegionsWithIGLPInstrs.reset();
949 
950   runSchedStages();
951 }
952 
953 void GCNScheduleDAGMILive::runSchedStages() {
954   LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
955 
956   if (!Regions.empty()) {
957     BBLiveInMap = getRegionLiveInMap();
958     if (GCNTrackers)
959       RegionLiveOuts.buildLiveRegMap();
960   }
961 
962   GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
963   while (S.advanceStage()) {
964     auto Stage = createSchedStage(S.getCurrentStage());
965     if (!Stage->initGCNSchedStage())
966       continue;
967 
968     for (auto Region : Regions) {
969       RegionBegin = Region.first;
970       RegionEnd = Region.second;
971       // Setup for scheduling the region and check whether it should be skipped.
972       if (!Stage->initGCNRegion()) {
973         Stage->advanceRegion();
974         exitRegion();
975         continue;
976       }
977 
978       if (GCNTrackers) {
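        // Reset the AMDGPU trackers to this region's boundary live sets before
        // scheduling the region.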
979         GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
980         GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
981         GCNRPTracker::LiveRegSet *RegionLiveIns =
982             &LiveIns[Stage->getRegionIdx()];
983 
984         reinterpret_cast<GCNRPTracker *>(DownwardTracker)
985             ->reset(MRI, *RegionLiveIns);
986         reinterpret_cast<GCNRPTracker *>(UpwardTracker)
987             ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
988                              Stage->getRegionIdx()));
989       }
990 
991       ScheduleDAGMILive::schedule();
992       Stage->finalizeGCNRegion();
993     }
994 
995     Stage->finalizeGCNSchedStage();
996   }
997 }
998 
999 #ifndef NDEBUG
1000 raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
1001   switch (StageID) {
1002   case GCNSchedStageID::OccInitialSchedule:
1003     OS << "Max Occupancy Initial Schedule";
1004     break;
1005   case GCNSchedStageID::UnclusteredHighRPReschedule:
1006     OS << "Unclustered High Register Pressure Reschedule";
1007     break;
1008   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
1009     OS << "Clustered Low Occupancy Reschedule";
1010     break;
1011   case GCNSchedStageID::PreRARematerialize:
1012     OS << "Pre-RA Rematerialize";
1013     break;
1014   case GCNSchedStageID::ILPInitialSchedule:
1015     OS << "Max ILP Initial Schedule";
1016     break;
1017   case GCNSchedStageID::MemoryClauseInitialSchedule:
1018     OS << "Max memory clause Initial Schedule";
1019     break;
1020   }
1021 
1022   return OS;
1023 }
1024 #endif
1025 
1026 GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
1027     : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
1028       MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
1029 
1030 bool GCNSchedStage::initGCNSchedStage() {
1031   if (!DAG.LIS)
1032     return false;
1033 
1034   LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1035   return true;
1036 }
1037 
1038 bool UnclusteredHighRPStage::initGCNSchedStage() {
1039   if (DisableUnclusterHighRP)
1040     return false;
1041 
1042   if (!GCNSchedStage::initGCNSchedStage())
1043     return false;
1044 
1045   if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1046     return false;
1047 
1048   SavedMutations.swap(DAG.Mutations);
1049   DAG.addMutation(
1050       createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
1051 
1052   InitialOccupancy = DAG.MinOccupancy;
1053   // Aggressively try to reduce register pressure in the unclustered high RP
1054   // stage. Temporarily increase occupancy target in the region.
1055   S.SGPRLimitBias = S.HighRPSGPRBias;
1056   S.VGPRLimitBias = S.HighRPVGPRBias;
1057   if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1058     MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
1059 
1060   LLVM_DEBUG(
1061       dbgs()
1062       << "Retrying function scheduling without clustering. "
1063          "Aggressivly try to reduce register pressure to achieve occupancy "
1064       << DAG.MinOccupancy << ".\n");
1065 
1066   return true;
1067 }
1068 
1069 bool ClusteredLowOccStage::initGCNSchedStage() {
1070   if (DisableClusteredLowOccupancy)
1071     return false;
1072 
1073   if (!GCNSchedStage::initGCNSchedStage())
1074     return false;
1075 
1076   // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1077   // been dropped. All regions will have already been scheduled with the ideal
1078   // occupancy targets.
1079   if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1080     return false;
1081 
1082   LLVM_DEBUG(
1083       dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1084              << DAG.MinOccupancy << ".\n");
1085   return true;
1086 }
1087 
1088 /// Allows to easily filter for this stage's debug output.
1089 #define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
1090 
1091 bool PreRARematStage::initGCNSchedStage() {
1092   // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
1093   // regions in between the defs and the region we sank the def to. This will
1094   // need to be fixed if there is another pass after this one.
1095   assert(!S.hasNextStage());
1096 
1097   if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() ||
1098       DAG.Regions.size() == 1)
1099     return false;
1100 
1101   // Before performing any IR modification record the parent region of each MI
1102   // and the parent MBB of each region.
1103   const unsigned NumRegions = DAG.Regions.size();
1104   RegionBB.reserve(NumRegions);
1105   for (unsigned I = 0; I < NumRegions; ++I) {
1106     RegionBoundaries Region = DAG.Regions[I];
1107     for (auto MI = Region.first; MI != Region.second; ++MI)
1108       MIRegion.insert({&*MI, I});
1109     RegionBB.push_back(Region.first->getParent());
1110   }
1111 
1112   if (!canIncreaseOccupancyOrReduceSpill())
1113     return false;
1114 
1115   // Rematerialize identified instructions and update scheduler's state.
1116   rematerialize();
1117   if (GCNTrackers)
1118     DAG.RegionLiveOuts.buildLiveRegMap();
1119   REMAT_DEBUG(
1120       dbgs() << "Retrying function scheduling with new min. occupancy of "
1121              << AchievedOcc << " from rematerializing (original was "
1122              << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
1123   if (AchievedOcc > DAG.MinOccupancy) {
1124     DAG.MinOccupancy = AchievedOcc;
1125     SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
1126     MFI.increaseOccupancy(MF, DAG.MinOccupancy);
1127   }
1128   return true;
1129 }
1130 
1131 void GCNSchedStage::finalizeGCNSchedStage() {
1132   DAG.finishBlock();
1133   LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1134 }
1135 
1136 void UnclusteredHighRPStage::finalizeGCNSchedStage() {
1137   SavedMutations.swap(DAG.Mutations);
1138   S.SGPRLimitBias = S.VGPRLimitBias = 0;
1139   if (DAG.MinOccupancy > InitialOccupancy) {
1140     for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
1141       DAG.RegionsWithMinOcc[IDX] =
1142           DAG.Pressure[IDX].getOccupancy(
1143               DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy;
1144 
1145     LLVM_DEBUG(dbgs() << StageID
1146                       << " stage successfully increased occupancy to "
1147                       << DAG.MinOccupancy << '\n');
1148   }
1149 
1150   GCNSchedStage::finalizeGCNSchedStage();
1151 }
1152 
1153 bool GCNSchedStage::initGCNRegion() {
1154   // Check whether this new region is also a new block.
1155   if (DAG.RegionBegin->getParent() != CurrentMBB)
1156     setupNewBlock();
1157 
1158   unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
1159   DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
1160 
1161   // Skip empty scheduling regions (0 or 1 schedulable instructions).
1162   if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
1163     return false;
1164 
1165   LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1166   LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1167                     << " " << CurrentMBB->getName()
1168                     << "\n  From: " << *DAG.begin() << "    To: ";
1169              if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1170              else dbgs() << "End";
1171              dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1172 
1173   // Save original instruction order before scheduling for possible revert.
1174   Unsched.clear();
1175   Unsched.reserve(DAG.NumRegionInstrs);
1176   if (StageID == GCNSchedStageID::OccInitialSchedule ||
1177       StageID == GCNSchedStageID::ILPInitialSchedule) {
1178     const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1179     for (auto &I : DAG) {
1180       Unsched.push_back(&I);
1181       if (SII->isIGLPMutationOnly(I.getOpcode()))
1182         DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1183     }
1184   } else {
1185     for (auto &I : DAG)
1186       Unsched.push_back(&I);
1187   }
1188 
1189   PressureBefore = DAG.Pressure[RegionIdx];
1190 
1191   LLVM_DEBUG(
1192       dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1193              << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1194              << "Region live-in pressure:  "
1195              << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1196              << "Region register pressure: " << print(PressureBefore));
1197 
1198   S.HasHighPressure = false;
1199   S.KnownExcessRP = isRegionWithExcessRP();
1200 
1201   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1202       StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
1203     SavedMutations.clear();
1204     SavedMutations.swap(DAG.Mutations);
1205     bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1206                           StageID == GCNSchedStageID::ILPInitialSchedule;
1207     DAG.addMutation(createIGroupLPDAGMutation(
1208         IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1209                        : AMDGPU::SchedulingPhase::PreRAReentry));
1210   }
1211 
1212   return true;
1213 }
1214 
1215 bool UnclusteredHighRPStage::initGCNRegion() {
1216   // Only reschedule regions with the minimum occupancy or regions that may have
1217   // spilling (excess register pressure).
1218   if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
1219        DAG.MinOccupancy <= InitialOccupancy) &&
1220       !DAG.RegionsWithExcessRP[RegionIdx])
1221     return false;
1222 
1223   return GCNSchedStage::initGCNRegion();
1224 }
1225 
1226 bool ClusteredLowOccStage::initGCNRegion() {
1227   // We may need to reschedule this region if it wasn't rescheduled in the last
1228   // stage, or if we found it was testing critical register pressure limits in
1229   // the unclustered reschedule stage. The latter is because we may not have been
1230   // able to raise the min occupancy in the previous stage so the region may be
1231   // overly constrained even if it was already rescheduled.
1232   if (!DAG.RegionsWithHighRP[RegionIdx])
1233     return false;
1234 
1235   return GCNSchedStage::initGCNRegion();
1236 }
1237 
1238 bool PreRARematStage::initGCNRegion() {
1239   return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1240 }
1241 
1242 void GCNSchedStage::setupNewBlock() {
1243   if (CurrentMBB)
1244     DAG.finishBlock();
1245 
1246   CurrentMBB = DAG.RegionBegin->getParent();
1247   DAG.startBlock(CurrentMBB);
1248   // Get real RP for the region if it hasn't been calculated before. After the
1249   // initial schedule stage real RP will be collected after scheduling.
1250   if (StageID == GCNSchedStageID::OccInitialSchedule ||
1251       StageID == GCNSchedStageID::ILPInitialSchedule ||
1252       StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
1253     DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1254 }
1255 
1256 void GCNSchedStage::finalizeGCNRegion() {
1257   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1258   if (S.HasHighPressure)
1259     DAG.RegionsWithHighRP[RegionIdx] = true;
1260 
1261   // Revert scheduling if we have dropped occupancy or there is some other
1262   // reason that the original schedule is better.
1263   checkScheduling();
1264 
1265   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1266       StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
1267     SavedMutations.swap(DAG.Mutations);
1268 
1269   DAG.exitRegion();
1270   advanceRegion();
1271 }
1272 
1273 void GCNSchedStage::checkScheduling() {
1274   // Check the results of scheduling.
1275   PressureAfter = DAG.getRealRegPressure(RegionIdx);
1276 
1277   LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1278   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1279 
1280   unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1281 
1282   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1283       PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1284     DAG.Pressure[RegionIdx] = PressureAfter;
1285     DAG.RegionsWithMinOcc[RegionIdx] =
1286         PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
1287         DAG.MinOccupancy;
1288 
1289     // Early out if we have achieved the occupancy target.
1290     LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1291     return;
1292   }
1293 
1294   unsigned TargetOccupancy = std::min(
1295       S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
1296   unsigned WavesAfter = std::min(
1297       TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1298   unsigned WavesBefore = std::min(
1299       TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1300   LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1301                     << ", after " << WavesAfter << ".\n");
1302 
1303   // We may not be able to keep the current target occupancy because of the
1304   // region we just scheduled. We might still revert the schedule if the
1305   // occupancy before was higher, or if the current schedule has register
1306   // pressure higher than the excess limits, which could lead to more spilling.
1307   unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1308 
1309   // Allow memory bound functions to drop to 4 waves if not limited by an
1310   // attribute.
1311   if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1312       WavesAfter >= MFI.getMinAllowedOccupancy()) {
1313     LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1314                       << MFI.getMinAllowedOccupancy() << " waves\n");
1315     NewOccupancy = WavesAfter;
1316   }
1317 
1318   if (NewOccupancy < DAG.MinOccupancy) {
1319     DAG.MinOccupancy = NewOccupancy;
1320     MFI.limitOccupancy(DAG.MinOccupancy);
1321     DAG.RegionsWithMinOcc.reset();
1322     LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1323                       << DAG.MinOccupancy << ".\n");
1324   }
1325   // The maximum number of arch VGPRs on a non-unified register file, or the
1326   // maximum number of VGPRs + AGPRs in the unified register file case.
1327   unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1328   // The maximum number of arch VGPRs for both unified and non-unified register
1329   // files.
1330   unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1331   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1332 
1333   if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1334       PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1335       PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1336       PressureAfter.getSGPRNum() > MaxSGPRs) {
1337     DAG.RegionsWithHighRP[RegionIdx] = true;
1338     DAG.RegionsWithExcessRP[RegionIdx] = true;
1339   }
1340 
1341   // Revert if this region's schedule would cause a drop in occupancy or
1342   // spilling.
1343   if (shouldRevertScheduling(WavesAfter)) {
1344     revertScheduling();
1345   } else {
1346     DAG.Pressure[RegionIdx] = PressureAfter;
1347     DAG.RegionsWithMinOcc[RegionIdx] =
1348         PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
1349         DAG.MinOccupancy;
1350   }
1351 }
1352 
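// Returns the earliest cycle at which SU can issue: the latest, over all of
// its register-dependence predecessors, of that predecessor's recorded ready
// cycle plus its modeled latency, and no earlier than CurrCycle. The result is
// also memoized in ReadyCycles for use by later SUnits.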
1353 unsigned
1354 GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1355                                       DenseMap<unsigned, unsigned> &ReadyCycles,
1356                                       const TargetSchedModel &SM) {
1357   unsigned ReadyCycle = CurrCycle;
1358   for (auto &D : SU.Preds) {
1359     if (D.isAssignedRegDep()) {
1360       MachineInstr *DefMI = D.getSUnit()->getInstr();
1361       unsigned Latency = SM.computeInstrLatency(DefMI);
1362       unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1363       ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1364     }
1365   }
1366   ReadyCycles[SU.NodeNum] = ReadyCycle;
1367   return ReadyCycle;
1368 }
1369 
1370 #ifndef NDEBUG
1371 struct EarlierIssuingCycle {
1372   bool operator()(std::pair<MachineInstr *, unsigned> A,
1373                   std::pair<MachineInstr *, unsigned> B) const {
1374     return A.second < B.second;
1375   }
1376 };
1377 
1378 static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1379                                         EarlierIssuingCycle> &ReadyCycles) {
1380   if (ReadyCycles.empty())
1381     return;
1382   unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1383   dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1384          << " ##################\n# Cycle #\t\t\tInstruction          "
1385             "             "
1386             "                            \n";
1387   unsigned IPrev = 1;
1388   for (auto &I : ReadyCycles) {
1389     if (I.second > IPrev + 1)
1390       dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1391              << " CYCLES DETECTED ******************************\n\n";
1392     dbgs() << "[ " << I.second << " ]  :  " << *I.first << "\n";
1393     IPrev = I.second;
1394   }
1395 }
1396 #endif
1397 
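// Walks InputSchedule in order, assigning each SUnit an issue cycle via
// computeSUnitReadyCycle and accumulating the stall ("bubble") cycles between
// consecutive instructions. The returned ScheduleMetrics pairs the total
// schedule length with the accumulated bubbles; the debug output prints the
// derived metric, SumBubbles * ScaleFactor / CurrCycle (or 1 with no bubbles).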
1398 ScheduleMetrics
1399 GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1400 #ifndef NDEBUG
1401   std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1402       ReadyCyclesSorted;
1403 #endif
1404   const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1405   unsigned SumBubbles = 0;
1406   DenseMap<unsigned, unsigned> ReadyCycles;
1407   unsigned CurrCycle = 0;
1408   for (auto &SU : InputSchedule) {
1409     unsigned ReadyCycle =
1410         computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1411     SumBubbles += ReadyCycle - CurrCycle;
1412 #ifndef NDEBUG
1413     ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1414 #endif
1415     CurrCycle = ++ReadyCycle;
1416   }
1417 #ifndef NDEBUG
1418   LLVM_DEBUG(
1419       printScheduleModel(ReadyCyclesSorted);
1420       dbgs() << "\n\t"
1421              << "Metric: "
1422              << (SumBubbles
1423                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1424                      : 1)
1425              << "\n\n");
1426 #endif
1427 
1428   return ScheduleMetrics(CurrCycle, SumBubbles);
1429 }
1430 
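// Same as above, but measures the schedule currently materialized in the DAG
// (i.e. the newly scheduled instruction order), walking instructions in
// program order and skipping any that have no corresponding SUnit.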
1431 ScheduleMetrics
1432 GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
1433 #ifndef NDEBUG
1434   std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1435       ReadyCyclesSorted;
1436 #endif
1437   const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1438   unsigned SumBubbles = 0;
1439   DenseMap<unsigned, unsigned> ReadyCycles;
1440   unsigned CurrCycle = 0;
1441   for (auto &MI : DAG) {
1442     SUnit *SU = DAG.getSUnit(&MI);
1443     if (!SU)
1444       continue;
1445     unsigned ReadyCycle =
1446         computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1447     SumBubbles += ReadyCycle - CurrCycle;
1448 #ifndef NDEBUG
1449     ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1450 #endif
1451     CurrCycle = ++ReadyCycle;
1452   }
1453 #ifndef NDEBUG
1454   LLVM_DEBUG(
1455       printScheduleModel(ReadyCyclesSorted);
1456       dbgs() << "\n\t"
1457              << "Metric: "
1458              << (SumBubbles
1459                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1460                      : 1)
1461              << "\n\n");
1462 #endif
1463 
1464   return ScheduleMetrics(CurrCycle, SumBubbles);
1465 }
1466 
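// Common revert criteria shared by all stages: revert if the just-scheduled
// region drops occupancy below the function's current minimum or, in dynamic
// VGPR mode, if the new schedule would require more VGPR blocks than before.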
1467 bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1468   if (WavesAfter < DAG.MinOccupancy)
1469     return true;
1470 
1471   // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
1472   if (DAG.MFI.isDynamicVGPREnabled()) {
1473     unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1474         &ST, DAG.MFI.getDynamicVGPRBlockSize(),
1475         PressureBefore.getVGPRNum(false));
1476     unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1477         &ST, DAG.MFI.getDynamicVGPRBlockSize(),
1478         PressureAfter.getVGPRNum(false));
1479     if (BlocksAfter > BlocksBefore)
1480       return true;
1481   }
1482 
1483   return false;
1484 }
1485 
1486 bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1487   if (PressureAfter == PressureBefore)
1488     return false;
1489 
1490   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1491     return true;
1492 
1493   if (mayCauseSpilling(WavesAfter))
1494     return true;
1495 
1496   return false;
1497 }
1498 
1499 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
1500   // If RP is not reduced in the unclustered reschedule stage, revert to the
1501   // old schedule.
1502   if ((WavesAfter <=
1503            PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
1504        mayCauseSpilling(WavesAfter)) ||
1505       GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
1506     LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1507     return true;
1508   }
1509 
1510   // Do not attempt to relax the schedule even more if we are already spilling.
1511   if (isRegionWithExcessRP())
1512     return false;
1513 
1514   LLVM_DEBUG(
1515       dbgs()
1516       << "\n\t      *** In shouldRevertScheduling ***\n"
1517       << "      *********** BEFORE UnclusteredHighRPStage ***********\n");
1518   ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
1519   LLVM_DEBUG(
1520       dbgs()
1521       << "\n      *********** AFTER UnclusteredHighRPStage ***********\n");
1522   ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1523   unsigned OldMetric = MBefore.getMetric();
1524   unsigned NewMetric = MAfter.getMetric();
1525   unsigned WavesBefore = std::min(
1526       S.getTargetOccupancy(),
1527       PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
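  // Profit is, roughly, the occupancy ratio (WavesAfter / WavesBefore)
  // multiplied by the bubble-metric ratio ((OldMetric + Bias) / NewMetric),
  // expressed relative to ScheduleMetrics::ScaleFactor. A value below the
  // scale factor means the occupancy gained does not pay for the added
  // latency, so the unclustered schedule is reverted.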
1528   unsigned Profit =
1529       ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1530        ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1531        NewMetric) /
1532       ScheduleMetrics::ScaleFactor;
1533   LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1534                     << MAfter << "Profit: " << Profit << "\n");
1535   return Profit < ScheduleMetrics::ScaleFactor;
1536 }
1537 
1538 bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
1539   if (PressureAfter == PressureBefore)
1540     return false;
1541 
1542   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1543     return true;
1544 
1545   if (mayCauseSpilling(WavesAfter))
1546     return true;
1547 
1548   return false;
1549 }
1550 
1551 bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
1552   return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
1553          mayCauseSpilling(WavesAfter) ||
1554          (IncreaseOccupancy && WavesAfter < TargetOcc);
1555 }
1556 
1557 bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1558   if (mayCauseSpilling(WavesAfter))
1559     return true;
1560 
1561   return false;
1562 }
1563 
1564 bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
1565     unsigned WavesAfter) {
1566   return mayCauseSpilling(WavesAfter);
1567 }
1568 
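// Heuristic spill check: the new schedule is considered spill-prone when
// occupancy is already at the minimum waves/EU, the region is marked as having
// excess register pressure, and the new pressure is not strictly lower than
// the pressure before scheduling.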
1569 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1570   if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
1571       !PressureAfter.less(MF, PressureBefore)) {
1572     LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1573     return true;
1574   }
1575 
1576   return false;
1577 }
1578 
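// Restores the region to the pre-scheduling instruction order recorded in
// Unsched: instructions are spliced back into place, LiveIntervals and
// lane-mask/dead-def flags are repaired, debug values are re-placed, and the
// region boundaries are reset.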
1579 void GCNSchedStage::revertScheduling() {
1580   DAG.RegionsWithMinOcc[RegionIdx] =
1581       PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) ==
1582       DAG.MinOccupancy;
1583   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1584   DAG.RegionEnd = DAG.RegionBegin;
1585   int SkippedDebugInstr = 0;
1586   for (MachineInstr *MI : Unsched) {
1587     if (MI->isDebugInstr()) {
1588       ++SkippedDebugInstr;
1589       continue;
1590     }
1591 
1592     if (MI->getIterator() != DAG.RegionEnd) {
1593       DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI);
1594       if (!MI->isDebugInstr())
1595         DAG.LIS->handleMove(*MI, true);
1596     }
1597 
1598     // Reset read-undef flags and update them later.
1599     for (auto &Op : MI->all_defs())
1600       Op.setIsUndef(false);
1601     RegisterOperands RegOpers;
1602     RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1603     if (!MI->isDebugInstr()) {
1604       if (DAG.ShouldTrackLaneMasks) {
1605         // Adjust liveness and add missing dead+read-undef flags.
1606         SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
1607         RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
1608       } else {
1609         // Adjust for missing dead-def flags.
1610         RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1611       }
1612     }
1613     DAG.RegionEnd = MI->getIterator();
1614     ++DAG.RegionEnd;
1615     LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1616   }
1617 
1618   // After reverting the schedule, debug instrs will now be at the end of the
1619   // block and RegionEnd will point to the first debug instr. Increment
1620   // RegionEnd past the debug instrs to the actual end of the scheduling region.
1621   while (SkippedDebugInstr-- > 0)
1622     ++DAG.RegionEnd;
1623 
1624   // If Unsched.front() instruction is a debug instruction, this will actually
1625   // shrink the region since we moved all debug instructions to the end of the
1626   // block. Find the first instruction that is not a debug instruction.
1627   DAG.RegionBegin = Unsched.front()->getIterator();
1628   if (DAG.RegionBegin->isDebugInstr()) {
1629     for (MachineInstr *MI : Unsched) {
1630       if (MI->isDebugInstr())
1631         continue;
1632       DAG.RegionBegin = MI->getIterator();
1633       break;
1634     }
1635   }
1636 
1637   // Then move the debug instructions back into their correct place and set
1638   // RegionBegin and RegionEnd if needed.
1639   DAG.placeDebugValues();
1640 
1641   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1642 }
1643 
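// Returns true if every register read by InstToRemat at OriginalIdx still
// holds the same value at RematIdx, i.e. the instruction can be rematerialized
// there without extending any live range. Physical register uses are only
// allowed when the register is constant or the use is ignorable.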
1644 bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
1645                                          SlotIndex OriginalIdx,
1646                                          SlotIndex RematIdx) const {
1647 
1648   LiveIntervals *LIS = DAG.LIS;
1649   MachineRegisterInfo &MRI = DAG.MRI;
1650   OriginalIdx = OriginalIdx.getRegSlot(true);
1651   RematIdx = std::max(RematIdx, RematIdx.getRegSlot(true));
1652   for (const MachineOperand &MO : InstToRemat->operands()) {
1653     if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
1654       continue;
1655 
1656     if (!MO.getReg().isVirtual()) {
1657       // Do not attempt to reason about PhysRegs.
1658       // TODO: better analysis of PhysReg liveness
1659       if (!DAG.MRI.isConstantPhysReg(MO.getReg()) &&
1660           !DAG.TII->isIgnorableUse(MO))
1661         return false;
1662 
1663       // Constant PhysRegs and IgnorableUses are okay
1664       continue;
1665     }
1666 
1667     LiveInterval &LI = LIS->getInterval(MO.getReg());
1668     const VNInfo *OVNI = LI.getVNInfoAt(OriginalIdx);
1669     assert(OVNI);
1670 
1671     // Don't allow rematerialization immediately after the original def.
1672     // It would be incorrect if InstToRemat redefines the register.
1673     // See PR14098.
1674     if (SlotIndex::isSameInstr(OriginalIdx, RematIdx))
1675       return false;
1676 
1677     if (OVNI != LI.getVNInfoAt(RematIdx))
1678       return false;
1679 
1680     // Check that subrange is live at RematIdx.
1681     if (LI.hasSubRanges()) {
1682       const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
1683       unsigned SubReg = MO.getSubReg();
1684       LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
1685                               : MRI.getMaxLaneMaskForVReg(MO.getReg());
1686       for (LiveInterval::SubRange &SR : LI.subranges()) {
1687         if ((SR.LaneMask & LM).none())
1688           continue;
1689         if (!SR.liveAt(RematIdx))
1690           return false;
1691 
1692         // Early exit if all used lanes are checked. No need to continue.
1693         LM &= ~SR.LaneMask;
1694         if (LM.none())
1695           break;
1696       }
1697     }
1698   }
1699   return true;
1700 }
1701 
1702 bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
1703   REMAT_DEBUG({
1704     dbgs() << "Collecting rematerializable instructions in ";
1705     MF.getFunction().printAsOperand(dbgs(), false);
1706     dbgs() << '\n';
1707   });
1708 
1709   // Maps optimizable regions (i.e., regions at minimum and register-limited
1710   // occupancy, or regions with spilling) to the target RP we would like to
1711   // reach.
1712   DenseMap<unsigned, GCNRPTarget> OptRegions;
1713   const Function &F = MF.getFunction();
1714   unsigned DynamicVGPRBlockSize =
1715       MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
1716 
1717   std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
1718   const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
1719   const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
1720   const unsigned MaxSGPRsIncOcc =
1721       ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
1722   const unsigned MaxVGPRsIncOcc =
1723       ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
1724   IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
1725 
1726   // Collect optimizable regions. If there is spilling in any region we will
1727   // just try to reduce spilling. Otherwise we will try to increase occupancy by
1728   // one in the whole function.
1729   for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1730     GCNRegPressure &RP = DAG.Pressure[I];
1731     // We allow ArchVGPR or AGPR savings to count as savings of the other kind
1732     // of VGPR only when trying to eliminate spilling. We cannot do this when
1733     // trying to increase occupancy since VGPR class swaps only occur later in
1734     // the register allocator i.e., the scheduler will not be able to reason
1735     // about these savings and will not report an increase in the achievable
1736     // occupancy, triggering rollbacks.
1737     GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP,
1738                        /*CombineVGPRSavings=*/true);
1739     if (!Target.satisfied() && IncreaseOccupancy) {
1740       // There is spilling in the region and we were so far trying to increase
1741       // occupancy. Stop trying that and focus on reducing spilling.
1742       IncreaseOccupancy = false;
1743       OptRegions.clear();
1744     } else if (IncreaseOccupancy) {
1745       // There is no spilling in the region, try to increase occupancy.
1746       Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP,
1747                            /*CombineVGPRSavings=*/false);
1748     }
1749     if (!Target.satisfied())
1750       OptRegions.insert({I, Target});
1751   }
1752   if (OptRegions.empty())
1753     return false;
1754 
1755 #ifndef NDEBUG
1756   if (IncreaseOccupancy) {
1757     REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy
1758                        << ") in regions:\n");
1759   } else {
1760     REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy ("
1761                        << WavesPerEU.first << ") in regions:\n");
1762   }
1763   for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1764     if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
1765       REMAT_DEBUG(dbgs() << "  [" << I << "] " << OptIt->getSecond() << '\n');
1766   }
1767 #endif
1768 
1769   // When we are reducing spilling, the target is the minimum target number of
1770   // waves/EU determined by the subtarget. In cases where either one of
1771   // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" is set on the function, the current
1772   // minimum region occupancy may be higher than the latter.
1773   TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1
1774                                 : std::max(DAG.MinOccupancy, WavesPerEU.first);
1775 
1776   // Accounts for a reduction in RP in an optimizable region. Returns whether we
1777   // estimate that we have identified enough rematerialization opportunities to
1778   // achieve our goal, and sets Progress to true when this particular reduction
1779   // in pressure was helpful toward that goal.
1780   auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
1781                               bool &Progress) -> bool {
1782     GCNRPTarget &Target = OptIt->getSecond();
1783     if (!Target.isSaveBeneficial(Reg, DAG.MRI))
1784       return false;
1785     Progress = true;
1786     Target.saveReg(Reg, Mask, DAG.MRI);
1787     if (Target.satisfied())
1788       OptRegions.erase(OptIt->getFirst());
1789     return OptRegions.empty();
1790   };
1791 
1792   // We need up-to-date live-out info to query live-out register masks in
1793   // regions containing rematerializable instructions.
1794   DAG.RegionLiveOuts.buildLiveRegMap();
1795 
1796   // Cache set of registers that are going to be rematerialized.
1797   DenseSet<unsigned> RematRegs;
1798 
1799   // Identify rematerializable instructions in the function.
1800   for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1801     auto Region = DAG.Regions[I];
1802     for (auto MI = Region.first; MI != Region.second; ++MI) {
1803       // The instruction must be trivially rematerializable.
1804       MachineInstr &DefMI = *MI;
1805       if (!isTriviallyReMaterializable(DefMI))
1806         continue;
1807 
1808       // We only support rematerializing virtual registers with one definition.
1809       Register Reg = DefMI.getOperand(0).getReg();
1810       if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
1811         continue;
1812 
1813       // We only care to rematerialize the instruction if it has a single
1814       // non-debug user in a different region. The using MI may not belong to a
1815       // region if it is a lone region terminator.
1816       MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
1817       if (!UseMI)
1818         continue;
1819       auto UseRegion = MIRegion.find(UseMI);
1820       if (UseRegion != MIRegion.end() && UseRegion->second == I)
1821         continue;
1822 
1823       // Do not rematerialize an instruction if it uses or is used by an
1824       // instruction that we have designated for rematerialization.
1825       // FIXME: Allow for rematerialization chains: this requires 1. updating
1826       // remat points to account for uses that are rematerialized, and 2. either
1827       // rematerializing the candidates in careful ordering, or deferring the
1828       // MBB RP walk until the entire chain has been rematerialized.
1829       if (Rematerializations.contains(UseMI) ||
1830           llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
1831             return MO.isReg() && RematRegs.contains(MO.getReg());
1832           }))
1833         continue;
1834 
1835       // Do not rematerialize an instruction if it uses registers that aren't
1836       // available at its use. This ensures that we are not extending any live
1837       // range while rematerializing.
1838       SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI);
1839       SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
1840       if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
1841         continue;
1842 
1843       REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
1844       RematInstruction &Remat =
1845           Rematerializations.try_emplace(&DefMI, UseMI).first->second;
1846 
1847       bool RematUseful = false;
1848       if (auto It = OptRegions.find(I); It != OptRegions.end()) {
1849         // Optimistically consider that moving the instruction out of its
1850         // defining region will reduce RP in the latter; this assumes that
1851         // maximum RP in the region is reached somewhere between the defining
1852         // instruction and the end of the region.
1853         REMAT_DEBUG(dbgs() << "  Defining region is optimizable\n");
1854         LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
1855         if (ReduceRPInRegion(It, Reg, Mask, RematUseful))
1856           return true;
1857       }
1858 
1859       for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
1860         // We are only collecting regions in which the register is a live-in
1861         // (and may be live-through).
1862         auto It = DAG.LiveIns[LIRegion].find(Reg);
1863         if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
1864           continue;
1865         Remat.LiveInRegions.insert(LIRegion);
1866 
1867         // Account for the reduction in RP due to the rematerialization in an
1868         // optimizable region in which the defined register is a live-in. This
1869         // is exact for live-through regions but optimistic in the using region,
1870         // where RP is actually reduced only if maximum RP is reached somewhere
1871         // between the beginning of the region and the rematerializable
1872         // instruction's use.
1873         if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
1874           REMAT_DEBUG(dbgs() << "  Live-in in region " << LIRegion << '\n');
1875           if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg],
1876                                RematUseful))
1877             return true;
1878         }
1879       }
1880 
1881       // If the instruction is not a live-in or live-out in any optimizable
1882       // region then there is no point in rematerializing it.
1883       if (!RematUseful) {
1884         Rematerializations.pop_back();
1885         REMAT_DEBUG(dbgs() << "  No impact, not rematerializing instruction\n");
1886       } else {
1887         RematRegs.insert(Reg);
1888       }
1889     }
1890   }
1891 
1892   if (IncreaseOccupancy) {
1893     // We were trying to increase occupancy but failed, abort the stage.
1894     REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
1895     Rematerializations.clear();
1896     return false;
1897   }
1898   REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
1899   return !Rematerializations.empty();
1900 }
1901 
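// Performs the rematerializations selected by canIncreaseOccupancyOrReduceSpill:
// each candidate is re-emitted immediately before its single use, the original
// definition is erased, and live-ins, live intervals and cached region
// pressures are updated; regions whose pressure cannot be updated
// incrementally are marked for full recomputation.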
1902 void PreRARematStage::rematerialize() {
1903   const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
1904 
1905   // Collect regions whose RP changes in an unpredictable way; we will have to
1906   // fully recompute their RP after all rematerializations.
1907   DenseSet<unsigned> RecomputeRP;
1908 
1909   // Rematerialize all instructions.
1910   for (auto &[DefMI, Remat] : Rematerializations) {
1911     MachineBasicBlock::iterator InsertPos(Remat.UseMI);
1912     Register Reg = DefMI->getOperand(0).getReg();
1913     unsigned SubReg = DefMI->getOperand(0).getSubReg();
1914     unsigned DefRegion = MIRegion.at(DefMI);
1915 
1916     // Rematerialize DefMI to its use block.
1917     TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI,
1918                        *DAG.TRI);
1919     Remat.RematMI = &*std::prev(InsertPos);
1920     Remat.RematMI->getOperand(0).setSubReg(SubReg);
1921     DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
1922 
1923     // Update region boundaries in regions we sank from (remove defining MI)
1924     // and to (insert MI rematerialized in use block). Only then can we erase
1925     // the original MI.
1926     DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
1927     auto UseRegion = MIRegion.find(Remat.UseMI);
1928     if (UseRegion != MIRegion.end()) {
1929       DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
1930                                  Remat.RematMI);
1931     }
1932     DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
1933     DefMI->eraseFromParent();
1934 
1935     // Collect all regions impacted by the rematerialization and update their
1936     // live-in/RP information.
1937     for (unsigned I : Remat.LiveInRegions) {
1938       ImpactedRegions.insert({I, DAG.Pressure[I]});
1939       GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
1940 
1941 #ifdef EXPENSIVE_CHECKS
1942       // All uses are known to be available / live at the remat point. Thus, the
1943       // uses should already be live-in to the region.
1944       for (MachineOperand &MO : DefMI->operands()) {
1945         if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
1946           continue;
1947 
1948         Register UseReg = MO.getReg();
1949         if (!UseReg.isVirtual())
1950           continue;
1951 
1952         LiveInterval &LI = DAG.LIS->getInterval(UseReg);
1953         LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
1954         if (LI.hasSubRanges() && MO.getSubReg())
1955           LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
1956 
1957         LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
1958         LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
1959         // If this register has lanes not covered by the LiveIns, be sure they
1960         // do not map to any subrange. ref:
1961         // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
1962         if (UncoveredLanes.any()) {
1963           assert(LI.hasSubRanges());
1964           for (LiveInterval::SubRange &SR : LI.subranges())
1965             assert((SR.LaneMask & UncoveredLanes).none());
1966         }
1967       }
1968 #endif
1969 
1970       // The register is no longer a live-in in all regions but the one that
1971       // contains the single use. In live-through regions, maximum register
1972       // pressure decreases predictably so we can directly update it. In the
1973       // using region, maximum RP may or may not decrease, so we will mark it
1974       // for re-computation after all materializations have taken place.
1975       LaneBitmask PrevMask = RegionLiveIns[Reg];
1976       RegionLiveIns.erase(Reg);
1977       RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
1978       if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
1979         DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
1980       else
1981         RecomputeRP.insert(I);
1982     }
1983     // RP in the region from which the instruction was rematerialized may or may
1984     // not decrease.
1985     ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
1986     RecomputeRP.insert(DefRegion);
1987 
1988     // Recompute live interval to reflect the register's rematerialization.
1989     Register RematReg = Remat.RematMI->getOperand(0).getReg();
1990     DAG.LIS->removeInterval(RematReg);
1991     DAG.LIS->createAndComputeVirtRegInterval(RematReg);
1992   }
1993 
1994   // All regions impacted by at least one rematerialization must be rescheduled.
1995   // Maximum pressure must also be recomputed for all regions where it changed
1996   // unpredictably and checked against the target occupancy.
1997   AchievedOcc = TargetOcc;
1998   for (auto &[I, OriginalRP] : ImpactedRegions) {
1999     bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
2000     RescheduleRegions[I] = !IsEmptyRegion;
2001     if (!RecomputeRP.contains(I))
2002       continue;
2003 
2004     GCNRegPressure RP;
2005     if (IsEmptyRegion) {
2006       RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
2007     } else {
2008       GCNDownwardRPTracker RPT(*DAG.LIS);
2009       auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
2010                                                       DAG.Regions[I].second);
2011       if (NonDbgMI == DAG.Regions[I].second) {
2012         // Region is non-empty but contains only debug instructions.
2013         RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
2014       } else {
2015         RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
2016         RPT.advance(DAG.Regions[I].second);
2017         RP = RPT.moveMaxPressure();
2018       }
2019     }
2020     DAG.Pressure[I] = RP;
2021     AchievedOcc = std::min(
2022         AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
2023                                              ->getDynamicVGPRBlockSize()));
2024   }
2025   REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
2026 }
2027 
2028 // Copied from MachineLICM
2029 bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
2030   if (!DAG.TII->isTriviallyReMaterializable(MI))
2031     return false;
2032 
2033   for (const MachineOperand &MO : MI.all_uses()) {
2034     // We can't remat physreg uses, unless the physreg is constant or the use
2035     // is ignorable (e.g. the implicit exec use on VALU instructions).
2036     if (MO.getReg().isPhysical()) {
2037       if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
2038         continue;
2039       return false;
2040     }
2041   }
2042 
2043   return true;
2044 }
2045 
2046 void PreRARematStage::finalizeGCNSchedStage() {
2047   // We consider that reducing spilling is always beneficial, so we never
2048   // roll back rematerializations in such cases. It's also possible that
2049   // rescheduling lowers occupancy below the one achieved just through remats,
2050   // in which case we do not want to roll back either (the rescheduling was
2051   // already reverted in PreRARematStage::shouldRevertScheduling in such cases).
2052   unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
2053   if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
2054     return;
2055 
2056   REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
2057   const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2058 
2059   // Rollback the rematerializations.
2060   for (const auto &[DefMI, Remat] : Rematerializations) {
2061     MachineInstr &RematMI = *Remat.RematMI;
2062     unsigned DefRegion = MIRegion.at(DefMI);
2063     MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
2064     MachineBasicBlock *MBB = RegionBB[DefRegion];
2065     Register Reg = RematMI.getOperand(0).getReg();
2066     unsigned SubReg = RematMI.getOperand(0).getSubReg();
2067 
2068     // Re-rematerialize MI at the end of its original region. Note that it may
2069     // not be rematerialized exactly in the same position as originally within
2070     // the region, but it should not matter much.
2071     TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
2072     MachineInstr *NewMI = &*std::prev(InsertPos);
2073     NewMI->getOperand(0).setSubReg(SubReg);
2074     DAG.LIS->InsertMachineInstrInMaps(*NewMI);
2075 
2076     auto UseRegion = MIRegion.find(Remat.UseMI);
2077     if (UseRegion != MIRegion.end()) {
2078       DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
2079                                  nullptr);
2080     }
2081     DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
2082 
2083     // Erase rematerialized MI.
2084     DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
2085     RematMI.eraseFromParent();
2086 
2087     // Recompute live interval for the re-rematerialized register
2088     DAG.LIS->removeInterval(Reg);
2089     DAG.LIS->createAndComputeVirtRegInterval(Reg);
2090 
2091     // Re-add the register as a live-in in all regions it used to be one in.
2092     for (unsigned LIRegion : Remat.LiveInRegions)
2093       DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
2094   }
2095 
2096   // Reset RP in all impacted regions.
2097   for (auto &[I, OriginalRP] : ImpactedRegions)
2098     DAG.Pressure[I] = OriginalRP;
2099 
2100   GCNSchedStage::finalizeGCNSchedStage();
2101 }
2102 
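// Keeps a region's [begin, end) boundaries consistent when the instruction at
// MI is removed (NewMI == nullptr) or when NewMI has just been inserted
// immediately before MI. Only the region's begin iterator may change since the
// end iterator is exclusive.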
updateRegionBoundaries(RegionBoundaries & RegionBounds,MachineBasicBlock::iterator MI,MachineInstr * NewMI)2103 void GCNScheduleDAGMILive::updateRegionBoundaries(
2104     RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
2105     MachineInstr *NewMI) {
2106   assert((!NewMI || NewMI != RegionBounds.second) &&
2107          "cannot remove at region end");
2108 
2109   if (RegionBounds.first == RegionBounds.second) {
2110     assert(NewMI && "cannot remove from an empty region");
2111     RegionBounds.first = NewMI;
2112     return;
2113   }
2114 
2115   // We only care for modifications at the beginning of a non-empty region since
2116   // the upper region boundary is exclusive.
2117   if (MI != RegionBounds.first)
2118     return;
2119   if (!NewMI)
2120     RegionBounds.first = std::next(MI); // Removal
2121   else
2122     RegionBounds.first = NewMI; // Insertion
2123 }
2124 
2125 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
2126   const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
2127   return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
2128     return SII->isIGLPMutationOnly(MI->getOpcode());
2129   });
2130 }
2131 
2132 GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
2133     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
2134     bool RemoveKillFlags)
2135     : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
2136 
2137 void GCNPostScheduleDAGMILive::schedule() {
2138   HasIGLPInstrs = hasIGLPInstrs(this);
2139   if (HasIGLPInstrs) {
2140     SavedMutations.clear();
2141     SavedMutations.swap(Mutations);
2142     addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
2143   }
2144 
2145   ScheduleDAGMI::schedule();
2146 }
2147 
2148 void GCNPostScheduleDAGMILive::finalizeSchedule() {
2149   if (HasIGLPInstrs)
2150     SavedMutations.swap(Mutations);
2151 
2152   ScheduleDAGMI::finalizeSchedule();
2153 }
2154