1 //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This contains a MachineSchedStrategy implementation for maximizing wave
11 /// occupancy on GCN hardware.
12 ///
13 /// This pass will apply multiple scheduling stages to the same function.
14 /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15 /// entry point for the scheduling of those regions is
16 /// GCNScheduleDAGMILive::runSchedStages.
17 
18 /// Generally, the reason for having multiple scheduling stages is to account
19 /// for the kernel-wide effect of register usage on occupancy.  Usually, only a
20 /// few scheduling regions will have register pressure high enough to limit
21 /// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22 /// other regions.
23 ///
24 //===----------------------------------------------------------------------===//
25 
26 #include "GCNSchedStrategy.h"
27 #include "AMDGPUIGroupLP.h"
28 #include "SIMachineFunctionInfo.h"
29 #include "llvm/CodeGen/RegisterClassInfo.h"
30 
31 #define DEBUG_TYPE "machine-scheduler"
32 
33 using namespace llvm;
34 
35 static cl::opt<bool> DisableUnclusterHighRP(
36     "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
37     cl::desc("Disable unclustered high register pressure "
38              "reduction scheduling stage."),
39     cl::init(false));
40 
41 static cl::opt<bool> DisableClusteredLowOccupancy(
42     "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
43     cl::desc("Disable clustered low occupancy "
44              "rescheduling for ILP scheduling stage."),
45     cl::init(false));
46 
47 static cl::opt<unsigned> ScheduleMetricBias(
48     "amdgpu-schedule-metric-bias", cl::Hidden,
49     cl::desc(
50         "Sets the bias which adds weight to occupancy vs latency. Set it to "
51         "100 to optimize for occupancy only."),
52     cl::init(10));
53 
54 static cl::opt<bool>
55     RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
56                cl::desc("Relax occupancy targets for kernels which are memory "
57                         "bound (amdgpu-membound-threshold), or "
58                         "Wave Limited (amdgpu-limit-wave-threshold)."),
59                cl::init(false));
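// The cl::opt flags above are hidden debugging/tuning knobs. An illustrative
// (not canonical) invocation:
//   llc -mtriple=amdgcn-amd-amdhsa -amdgpu-schedule-metric-bias=100 kernel.ll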
60 
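// Fixed-point scale for the schedule metric computed in getScheduleMetrics():
// the metric is (SumBubbles * ScaleFactor) / CurrCycle, so a ScaleFactor of
// 100 expresses the share of issue cycles lost to latency bubbles as a
// percentage.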
61 const unsigned ScheduleMetrics::ScaleFactor = 100;
62 
63 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
64     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
65       HasHighPressure(false) {}
66 
67 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
68   GenericScheduler::initialize(DAG);
69 
70   MF = &DAG->MF;
71 
72   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
73 
74   SGPRExcessLimit =
75       Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
76   VGPRExcessLimit =
77       Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
78 
79   SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
80   // Set the initial TargetOccupancy to the maximum occupancy that we can
81   // achieve for this function. This effectively sets a lower bound on the
82   // 'Critical' register limits in the scheduler.
83   // Allow for lower occupancy targets if the kernel is wave limited or memory
84   // bound and the relaxed occupancy feature is enabled.
85   TargetOccupancy =
86       RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
87   SGPRCriticalLimit =
88       std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
89 
90   if (!KnownExcessRP) {
91     VGPRCriticalLimit =
92         std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
93   } else {
94     // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result,
95     // except it returns a reasonably small number for targets with lots of
96     // VGPRs, such as GFX10 and GFX11.
97     LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
98                          "VGPRCriticalLimit calculation method.\n");
99 
100     unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
101     unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
102     unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
103     VGPRBudget = std::max(VGPRBudget, Granule);
104     VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
105   }
106 
107   // Subtract error margin and bias from register limits and avoid overflow.
108   SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
109   VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
110   SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
111   VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);
112 
113   LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
114                     << ", VGPRExcessLimit = " << VGPRExcessLimit
115                     << ", SGPRCriticalLimit = " << SGPRCriticalLimit
116                     << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
117 }
118 
119 /// Checks whether \p SU can use the cached DAG pressure diffs to compute the
120 /// current register pressure.
121 ///
122 /// This works for the common case, but it has a few exceptions that have been
123 /// observed through trial and error:
124 ///   - Explicit physical register operands
125 ///   - Subregister definitions
126 ///
127 /// In both of those cases, PressureDiff doesn't represent the actual pressure,
128 /// and querying LiveIntervals through the RegPressureTracker is needed to get
129 /// an accurate value.
130 ///
131 /// We should eventually only use PressureDiff for maximum performance, but this
132 /// already allows 80% of SUs to take the fast path without changing scheduling
133 /// at all. Further changes would either change scheduling, or require a lot
134 /// more logic to recover an accurate pressure estimate from the PressureDiffs.
135 static bool canUsePressureDiffs(const SUnit &SU) {
136   if (!SU.isInstr())
137     return false;
138 
139   // Cannot use pressure diffs for subregister defs or with physregs; it's
140   // imprecise in both cases.
141   for (const auto &Op : SU.getInstr()->operands()) {
142     if (!Op.isReg() || Op.isImplicit())
143       continue;
144     if (Op.getReg().isPhysical() ||
145         (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
146       return false;
147   }
148   return true;
149 }
150 
151 static void getRegisterPressures(bool AtTop,
152                                  const RegPressureTracker &RPTracker, SUnit *SU,
153                                  std::vector<unsigned> &Pressure,
154                                  std::vector<unsigned> &MaxPressure) {
155   // getDownwardPressure() and getUpwardPressure() make temporary changes to
156   // the tracker, so we need to pass those functions a non-const reference.
157   RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
158   if (AtTop)
159     TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
160   else
161     TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
162 }
163 
164 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
165                                      bool AtTop,
166                                      const RegPressureTracker &RPTracker,
167                                      const SIRegisterInfo *SRI,
168                                      unsigned SGPRPressure,
169                                      unsigned VGPRPressure, bool IsBottomUp) {
170   Cand.SU = SU;
171   Cand.AtTop = AtTop;
172 
173   if (!DAG->isTrackingPressure())
174     return;
175 
176   Pressure.clear();
177   MaxPressure.clear();
178 
179   // We try to use the cached PressureDiffs in the ScheduleDAG whenever
180   // possible over querying the RegPressureTracker.
181   //
182   // RegPressureTracker will make a lot of LIS queries which are very
183   // expensive; it is considered a slow function in this context.
184   //
185   // PressureDiffs are precomputed and cached, and getPressureDiff is just a
186   // trivial lookup into an array. It is pretty much free.
187   //
188   // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
189   // PressureDiffs.
190   if (AtTop || !canUsePressureDiffs(*SU)) {
191     getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure);
192   } else {
193     // Resize to hold the four tracked pressure sets, initialized to zero.
194     Pressure.resize(4, 0);
195     Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
196     Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
197 
198     for (const auto &Diff : DAG->getPressureDiff(SU)) {
199       if (!Diff.isValid())
200         continue;
201       // PressureDiffs is always bottom-up so if we're working top-down we need
202       // to invert its sign.
203       Pressure[Diff.getPSet()] +=
204           (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
205     }
206 
207 #ifdef EXPENSIVE_CHECKS
208     std::vector<unsigned> CheckPressure, CheckMaxPressure;
209     getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
210     if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
211             CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
212         Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
213             CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
214       errs() << "Register Pressure is inaccurate when calculated through "
215                 "PressureDiff\n"
216              << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
217              << ", expected "
218              << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
219              << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
220              << ", expected "
221              << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
222       report_fatal_error("inaccurate register pressure calculation");
223     }
224 #endif
225   }
226 
227   unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
228   unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
229 
230   // If two instructions increase the pressure of different register sets
231   // by the same amount, the generic scheduler will prefer to schedule the
232   // instruction that increases the set with the fewest registers,
233   // which in our case would be SGPRs.  This is rarely what we want, so
234   // when we report excess/critical register pressure, we do it either
235   // only for VGPRs or only for SGPRs.
236 
237   // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
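  // MaxVGPRPressureInc is heuristic headroom: assume a single candidate can
  // raise VGPR pressure by up to this many units, so start tracking VGPR
  // excess slightly before the hard limit is actually reached.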
238   const unsigned MaxVGPRPressureInc = 16;
239   bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
240   bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
241 
242   // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
243   // to increase the likelihood we don't go over the limits.  We should improve
244   // the analysis to look through dependencies to find the path with the least
245   // register pressure.
246 
247   // We only need to update the RPDelta for instructions that increase register
248   // pressure. Instructions that decrease or keep reg pressure the same will be
249   // marked as RegExcess in tryCandidate() when they are compared with
250   // instructions that increase the register pressure.
251   if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
252     HasHighPressure = true;
253     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
254     Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
255   }
256 
257   if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
258     HasHighPressure = true;
259     Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
260     Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
261   }
262 
263   // Register pressure is considered 'CRITICAL' if it is approaching a value
264   // that would reduce the wave occupancy for the execution unit.  When
265   // register pressure is 'CRITICAL', increases in SGPR and VGPR pressure
266   // have the same cost, so we don't need to prefer one over the other.
267 
268   int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
269   int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
270 
271   if (SGPRDelta >= 0 || VGPRDelta >= 0) {
272     HasHighPressure = true;
273     if (SGPRDelta > VGPRDelta) {
274       Cand.RPDelta.CriticalMax =
275         PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
276       Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
277     } else {
278       Cand.RPDelta.CriticalMax =
279         PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
280       Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
281     }
282   }
283 }
284 
285 // This function is mostly cut and pasted from
286 // GenericScheduler::pickNodeFromQueue()
287 void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
288                                          const CandPolicy &ZonePolicy,
289                                          const RegPressureTracker &RPTracker,
290                                          SchedCandidate &Cand,
291                                          bool IsBottomUp) {
292   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
293   ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
294   unsigned SGPRPressure = 0;
295   unsigned VGPRPressure = 0;
296   if (DAG->isTrackingPressure()) {
297     SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
298     VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
299   }
300   ReadyQueue &Q = Zone.Available;
301   for (SUnit *SU : Q) {
302 
303     SchedCandidate TryCand(ZonePolicy);
304     initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
305                   VGPRPressure, IsBottomUp);
306     // Pass SchedBoundary only when comparing nodes from the same boundary.
307     SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
308     tryCandidate(Cand, TryCand, ZoneArg);
309     if (TryCand.Reason != NoCand) {
310       // Initialize resource delta if needed in case future heuristics query it.
311       if (TryCand.ResDelta == SchedResourceDelta())
312         TryCand.initResourceDelta(Zone.DAG, SchedModel);
313       Cand.setBest(TryCand);
314       LLVM_DEBUG(traceCandidate(Cand));
315     }
316   }
317 }
318 
319 // This function is mostly cut and pasted from
320 // GenericScheduler::pickNodeBidirectional()
321 SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
322   // Schedule as far as possible in the direction of no choice. This is most
323   // efficient, but also provides the best heuristics for CriticalPSets.
324   if (SUnit *SU = Bot.pickOnlyChoice()) {
325     IsTopNode = false;
326     return SU;
327   }
328   if (SUnit *SU = Top.pickOnlyChoice()) {
329     IsTopNode = true;
330     return SU;
331   }
332   // Set the bottom-up policy based on the state of the current bottom zone and
333   // the instructions outside the zone, including the top zone.
334   CandPolicy BotPolicy;
335   setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
336   // Set the top-down policy based on the state of the current top zone and
337   // the instructions outside the zone, including the bottom zone.
338   CandPolicy TopPolicy;
339   setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
340 
341   // See if BotCand is still valid (because we previously scheduled from Top).
342   LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
343   if (!BotCand.isValid() || BotCand.SU->isScheduled ||
344       BotCand.Policy != BotPolicy) {
345     BotCand.reset(CandPolicy());
346     pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
347                       /*IsBottomUp=*/true);
348     assert(BotCand.Reason != NoCand && "failed to find the first candidate");
349   } else {
350     LLVM_DEBUG(traceCandidate(BotCand));
351 #ifndef NDEBUG
352     if (VerifyScheduling) {
353       SchedCandidate TCand;
354       TCand.reset(CandPolicy());
355       pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
356                         /*IsBottomUp=*/true);
357       assert(TCand.SU == BotCand.SU &&
358              "Last pick result should correspond to re-picking right now");
359     }
360 #endif
361   }
362 
363   // Check if the top Q has a better candidate.
364   LLVM_DEBUG(dbgs() << "Picking from Top:\n");
365   if (!TopCand.isValid() || TopCand.SU->isScheduled ||
366       TopCand.Policy != TopPolicy) {
367     TopCand.reset(CandPolicy());
368     pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
369                       /*IsBottomUp=*/false);
370     assert(TopCand.Reason != NoCand && "failed to find the first candidate");
371   } else {
372     LLVM_DEBUG(traceCandidate(TopCand));
373 #ifndef NDEBUG
374     if (VerifyScheduling) {
375       SchedCandidate TCand;
376       TCand.reset(CandPolicy());
377       pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
378                         /*IsBottomUp=*/false);
379       assert(TCand.SU == TopCand.SU &&
380            "Last pick result should correspond to re-picking right now");
381     }
382 #endif
383   }
384 
385   // Pick best from BotCand and TopCand.
386   LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
387              dbgs() << "Bot Cand: "; traceCandidate(BotCand););
388   SchedCandidate Cand = BotCand;
389   TopCand.Reason = NoCand;
390   tryCandidate(Cand, TopCand, nullptr);
391   if (TopCand.Reason != NoCand) {
392     Cand.setBest(TopCand);
393   }
394   LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
395 
396   IsTopNode = Cand.AtTop;
397   return Cand.SU;
398 }
399 
400 // This function is mostly cut and pasted from
401 // GenericScheduler::pickNode()
402 SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
403   if (DAG->top() == DAG->bottom()) {
404     assert(Top.Available.empty() && Top.Pending.empty() &&
405            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
406     return nullptr;
407   }
408   SUnit *SU;
409   do {
410     if (RegionPolicy.OnlyTopDown) {
411       SU = Top.pickOnlyChoice();
412       if (!SU) {
413         CandPolicy NoPolicy;
414         TopCand.reset(NoPolicy);
415         pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
416                           /*IsBottomUp=*/false);
417         assert(TopCand.Reason != NoCand && "failed to find a candidate");
418         SU = TopCand.SU;
419       }
420       IsTopNode = true;
421     } else if (RegionPolicy.OnlyBottomUp) {
422       SU = Bot.pickOnlyChoice();
423       if (!SU) {
424         CandPolicy NoPolicy;
425         BotCand.reset(NoPolicy);
426         pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
427                           /*IsBottomUp=*/true);
428         assert(BotCand.Reason != NoCand && "failed to find a candidate");
429         SU = BotCand.SU;
430       }
431       IsTopNode = false;
432     } else {
433       SU = pickNodeBidirectional(IsTopNode);
434     }
435   } while (SU->isScheduled);
436 
437   if (SU->isTopReady())
438     Top.removeReady(SU);
439   if (SU->isBottomReady())
440     Bot.removeReady(SU);
441 
442   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
443                     << *SU->getInstr());
444   return SU;
445 }
446 
447 GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
448   assert(CurrentStage && CurrentStage != SchedStages.end());
449   return *CurrentStage;
450 }
451 
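// Advance to the next stage in SchedStages (or to the first stage if none has
// run yet). Returns false once all stages have been exhausted.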
452 bool GCNSchedStrategy::advanceStage() {
453   assert(CurrentStage != SchedStages.end());
454   if (!CurrentStage)
455     CurrentStage = SchedStages.begin();
456   else
457     CurrentStage++;
458 
459   return CurrentStage != SchedStages.end();
460 }
461 
462 bool GCNSchedStrategy::hasNextStage() const {
463   assert(CurrentStage);
464   return std::next(CurrentStage) != SchedStages.end();
465 }
466 
467 GCNSchedStageID GCNSchedStrategy::getNextStage() const {
468   assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
469   return *std::next(CurrentStage);
470 }
471 
472 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
473     const MachineSchedContext *C)
474     : GCNSchedStrategy(C) {
475   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
476   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
477   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
478   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
479 }
480 
481 GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
482     : GCNSchedStrategy(C) {
483   SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
484 }
485 
486 bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
487                                           SchedCandidate &TryCand,
488                                           SchedBoundary *Zone) const {
489   // Initialize the candidate if needed.
490   if (!Cand.isValid()) {
491     TryCand.Reason = NodeOrder;
492     return true;
493   }
494 
495   // Avoid spilling by exceeding the register limit.
496   if (DAG->isTrackingPressure() &&
497       tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
498                   RegExcess, TRI, DAG->MF))
499     return TryCand.Reason != NoCand;
500 
501   // Bias PhysReg defs and copies toward their uses and defs, respectively.
502   if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
503                  biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
504     return TryCand.Reason != NoCand;
505 
506   bool SameBoundary = Zone != nullptr;
507   if (SameBoundary) {
508     // Prioritize instructions that read unbuffered resources by stall cycles.
509     if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
510                 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
511       return TryCand.Reason != NoCand;
512 
513     // Avoid critical resource consumption and balance the schedule.
514     TryCand.initResourceDelta(DAG, SchedModel);
515     if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
516                 TryCand, Cand, ResourceReduce))
517       return TryCand.Reason != NoCand;
518     if (tryGreater(TryCand.ResDelta.DemandedResources,
519                    Cand.ResDelta.DemandedResources, TryCand, Cand,
520                    ResourceDemand))
521       return TryCand.Reason != NoCand;
522 
523     // Unconditionally try to reduce latency.
524     if (tryLatency(TryCand, Cand, *Zone))
525       return TryCand.Reason != NoCand;
526 
527     // Weak edges are for clustering and other constraints.
528     if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
529                 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
530       return TryCand.Reason != NoCand;
531   }
532 
533   // Keep clustered nodes together to encourage downstream peephole
534   // optimizations which may reduce resource requirements.
535   //
536   // This is a best effort to set things up for a post-RA pass. Optimizations
537   // like generating loads of multiple registers should ideally be done within
538   // the scheduler pass by combining the loads during DAG postprocessing.
539   const SUnit *CandNextClusterSU =
540       Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
541   const SUnit *TryCandNextClusterSU =
542       TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
543   if (tryGreater(TryCand.SU == TryCandNextClusterSU,
544                  Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
545     return TryCand.Reason != NoCand;
546 
547   // Avoid increasing the max critical pressure in the scheduled region.
548   if (DAG->isTrackingPressure() &&
549       tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
550                   TryCand, Cand, RegCritical, TRI, DAG->MF))
551     return TryCand.Reason != NoCand;
552 
553   // Avoid increasing the max pressure of the entire region.
554   if (DAG->isTrackingPressure() &&
555       tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
556                   Cand, RegMax, TRI, DAG->MF))
557     return TryCand.Reason != NoCand;
558 
559   if (SameBoundary) {
560     // Fall through to original instruction order.
561     if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
562         (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
563       TryCand.Reason = NodeOrder;
564       return true;
565     }
566   }
567   return false;
568 }
569 
570 GCNScheduleDAGMILive::GCNScheduleDAGMILive(
571     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
572     : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
573       MFI(*MF.getInfo<SIMachineFunctionInfo>()),
574       StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {
575 
576   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
577   if (RelaxedOcc) {
578     MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
579     if (MinOccupancy != StartingOccupancy)
580       LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
581                         << ".\n");
582   }
583 }
584 
585 std::unique_ptr<GCNSchedStage>
586 GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
587   switch (SchedStageID) {
588   case GCNSchedStageID::OccInitialSchedule:
589     return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
590   case GCNSchedStageID::UnclusteredHighRPReschedule:
591     return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
592   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
593     return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
594   case GCNSchedStageID::PreRARematerialize:
595     return std::make_unique<PreRARematStage>(SchedStageID, *this);
596   case GCNSchedStageID::ILPInitialSchedule:
597     return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
598   }
599 
600   llvm_unreachable("Unknown SchedStageID.");
601 }
602 
603 void GCNScheduleDAGMILive::schedule() {
604   // Collect all scheduling regions. The actual scheduling is performed in
605   // GCNScheduleDAGMILive::finalizeSchedule.
606   Regions.push_back(std::pair(RegionBegin, RegionEnd));
607 }
608 
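// Recompute the maximum register pressure of region \p RegionIdx from its
// recorded live-ins by walking the region with a downward RP tracker.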
609 GCNRegPressure
610 GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
611   GCNDownwardRPTracker RPTracker(*LIS);
612   RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
613   return RPTracker.moveMaxPressure();
614 }
615 
616 void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
617                                                 const MachineBasicBlock *MBB) {
618   GCNDownwardRPTracker RPTracker(*LIS);
619 
620   // If the block has only one successor, then the live-ins of that successor
621   // are the live-outs of the current block. We can reuse the calculated live
622   // set if the successor will be sent to scheduling after the current block.
623 
624   // However, due to a bug in LiveInterval analysis, two predecessors of the
625   // same successor block may have different lane bitmasks for a live-out
626   // register. Work around that by sticking to a one-to-one relationship,
627   // i.e. one predecessor with one successor block.
628   const MachineBasicBlock *OnlySucc = nullptr;
629   if (MBB->succ_size() == 1) {
630     auto *Candidate = *MBB->succ_begin();
631     if (!Candidate->empty() && Candidate->pred_size() == 1) {
632       SlotIndexes *Ind = LIS->getSlotIndexes();
633       if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
634         OnlySucc = Candidate;
635     }
636   }
637 
638   // Scheduler sends regions from the end of the block upwards.
639   size_t CurRegion = RegionIdx;
640   for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
641     if (Regions[CurRegion].first->getParent() != MBB)
642       break;
643   --CurRegion;
644 
645   auto I = MBB->begin();
646   auto LiveInIt = MBBLiveIns.find(MBB);
647   auto &Rgn = Regions[CurRegion];
648   auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
649   if (LiveInIt != MBBLiveIns.end()) {
650     auto LiveIn = std::move(LiveInIt->second);
651     RPTracker.reset(*MBB->begin(), &LiveIn);
652     MBBLiveIns.erase(LiveInIt);
653   } else {
654     I = Rgn.first;
655     auto LRS = BBLiveInMap.lookup(NonDbgMI);
656 #ifdef EXPENSIVE_CHECKS
657     assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
658 #endif
659     RPTracker.reset(*I, &LRS);
660   }
661 
662   for (;;) {
663     I = RPTracker.getNext();
664 
665     if (Regions[CurRegion].first == I || NonDbgMI == I) {
666       LiveIns[CurRegion] = RPTracker.getLiveRegs();
667       RPTracker.clearMaxPressure();
668     }
669 
670     if (Regions[CurRegion].second == I) {
671       Pressure[CurRegion] = RPTracker.moveMaxPressure();
672       if (CurRegion-- == RegionIdx)
673         break;
674     }
675     RPTracker.advanceToNext();
676     RPTracker.advanceBeforeNext();
677   }
678 
679   if (OnlySucc) {
680     if (I != MBB->end()) {
681       RPTracker.advanceToNext();
682       RPTracker.advance(MBB->end());
683     }
684     RPTracker.advanceBeforeNext();
685     MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
686   }
687 }
688 
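// For each basic block containing scheduling regions, map the first non-debug
// instruction of the block's topmost region to the set of registers live
// before it. computeBlockPressure() uses this to seed per-block RP tracking.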
689 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
690 GCNScheduleDAGMILive::getBBLiveInMap() const {
691   assert(!Regions.empty());
692   std::vector<MachineInstr *> BBStarters;
693   BBStarters.reserve(Regions.size());
694   auto I = Regions.rbegin(), E = Regions.rend();
695   auto *BB = I->first->getParent();
696   do {
697     auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
698     BBStarters.push_back(MI);
699     do {
700       ++I;
701     } while (I != E && I->first->getParent() == BB);
702   } while (I != E);
703   return getLiveRegMap(BBStarters, false /*After*/, *LIS);
704 }
705 
706 void GCNScheduleDAGMILive::finalizeSchedule() {
707   // Start actual scheduling here. This function is called by the base
708   // MachineScheduler after all regions have been recorded by
709   // GCNScheduleDAGMILive::schedule().
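  // Per-region bookkeeping: every region starts out flagged for rescheduling;
  // the remaining bit vectors are populated as the stages below run.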
710   LiveIns.resize(Regions.size());
711   Pressure.resize(Regions.size());
712   RescheduleRegions.resize(Regions.size());
713   RegionsWithHighRP.resize(Regions.size());
714   RegionsWithExcessRP.resize(Regions.size());
715   RegionsWithMinOcc.resize(Regions.size());
716   RegionsWithIGLPInstrs.resize(Regions.size());
717   RescheduleRegions.set();
718   RegionsWithHighRP.reset();
719   RegionsWithExcessRP.reset();
720   RegionsWithMinOcc.reset();
721   RegionsWithIGLPInstrs.reset();
722 
723   runSchedStages();
724 }
725 
726 void GCNScheduleDAGMILive::runSchedStages() {
727   LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
728 
729   if (!Regions.empty())
730     BBLiveInMap = getBBLiveInMap();
731 
732   GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
733   while (S.advanceStage()) {
734     auto Stage = createSchedStage(S.getCurrentStage());
735     if (!Stage->initGCNSchedStage())
736       continue;
737 
738     for (auto Region : Regions) {
739       RegionBegin = Region.first;
740       RegionEnd = Region.second;
741       // Setup for scheduling the region and check whether it should be skipped.
742       if (!Stage->initGCNRegion()) {
743         Stage->advanceRegion();
744         exitRegion();
745         continue;
746       }
747 
748       ScheduleDAGMILive::schedule();
749       Stage->finalizeGCNRegion();
750     }
751 
752     Stage->finalizeGCNSchedStage();
753   }
754 }
755 
756 #ifndef NDEBUG
757 raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
758   switch (StageID) {
759   case GCNSchedStageID::OccInitialSchedule:
760     OS << "Max Occupancy Initial Schedule";
761     break;
762   case GCNSchedStageID::UnclusteredHighRPReschedule:
763     OS << "Unclustered High Register Pressure Reschedule";
764     break;
765   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
766     OS << "Clustered Low Occupancy Reschedule";
767     break;
768   case GCNSchedStageID::PreRARematerialize:
769     OS << "Pre-RA Rematerialize";
770     break;
771   case GCNSchedStageID::ILPInitialSchedule:
772     OS << "Max ILP Initial Schedule";
773     break;
774   }
775 
776   return OS;
777 }
778 #endif
779 
780 GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
781     : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
782       MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
783 
784 bool GCNSchedStage::initGCNSchedStage() {
785   if (!DAG.LIS)
786     return false;
787 
788   LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
789   return true;
790 }
791 
792 bool UnclusteredHighRPStage::initGCNSchedStage() {
793   if (DisableUnclusterHighRP)
794     return false;
795 
796   if (!GCNSchedStage::initGCNSchedStage())
797     return false;
798 
799   if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
800     return false;
801 
802   SavedMutations.swap(DAG.Mutations);
803   DAG.addMutation(
804       createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
805 
806   InitialOccupancy = DAG.MinOccupancy;
807   // Aggressively try to reduce register pressure in the unclustered high RP
808   // stage. Temporarily increase occupancy target in the region.
809   S.SGPRLimitBias = S.HighRPSGPRBias;
810   S.VGPRLimitBias = S.HighRPVGPRBias;
811   if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
812     MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
813 
814   LLVM_DEBUG(
815       dbgs()
816       << "Retrying function scheduling without clustering. "
817          "Aggressively try to reduce register pressure to achieve occupancy "
818       << DAG.MinOccupancy << ".\n");
819 
820   return true;
821 }
822 
823 bool ClusteredLowOccStage::initGCNSchedStage() {
824   if (DisableClusteredLowOccupancy)
825     return false;
826 
827   if (!GCNSchedStage::initGCNSchedStage())
828     return false;
829 
830   // Don't bother trying to improve ILP in lower RP regions if occupancy has not
831   // been dropped. All regions will have already been scheduled with the ideal
832   // occupancy targets.
833   if (DAG.StartingOccupancy <= DAG.MinOccupancy)
834     return false;
835 
836   LLVM_DEBUG(
837       dbgs() << "Retrying function scheduling with lowest recorded occupancy "
838              << DAG.MinOccupancy << ".\n");
839   return true;
840 }
841 
842 bool PreRARematStage::initGCNSchedStage() {
843   if (!GCNSchedStage::initGCNSchedStage())
844     return false;
845 
846   if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
847     return false;
848 
849   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
850   // Check maximum occupancy
851   if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
852       DAG.MinOccupancy)
853     return false;
854 
855   // FIXME: This pass will invalidate cached MBBLiveIns for regions
856   // in between the defs and the region we sink the def to. Cached pressure
857   // for regions a def is sunk from will also be invalidated. This will
858   // need to be fixed if there is another pass after this pass.
859   assert(!S.hasNextStage());
860 
861   collectRematerializableInstructions();
862   if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
863     return false;
864 
865   LLVM_DEBUG(
866       dbgs() << "Retrying function scheduling with improved occupancy of "
867              << DAG.MinOccupancy << " from rematerializing\n");
868   return true;
869 }
870 
871 void GCNSchedStage::finalizeGCNSchedStage() {
872   DAG.finishBlock();
873   LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
874 }
875 
876 void UnclusteredHighRPStage::finalizeGCNSchedStage() {
877   SavedMutations.swap(DAG.Mutations);
878   S.SGPRLimitBias = S.VGPRLimitBias = 0;
879   if (DAG.MinOccupancy > InitialOccupancy) {
880     for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
881       DAG.RegionsWithMinOcc[IDX] =
882           DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
883 
884     LLVM_DEBUG(dbgs() << StageID
885                       << " stage successfully increased occupancy to "
886                       << DAG.MinOccupancy << '\n');
887   }
888 
889   GCNSchedStage::finalizeGCNSchedStage();
890 }
891 
892 bool GCNSchedStage::initGCNRegion() {
893   // Check whether this new region is also a new block.
894   if (DAG.RegionBegin->getParent() != CurrentMBB)
895     setupNewBlock();
896 
897   unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
898   DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
899 
900   // Skip empty scheduling regions (0 or 1 schedulable instructions).
901   if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
902     return false;
903 
904   LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
905   LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
906                     << " " << CurrentMBB->getName()
907                     << "\n  From: " << *DAG.begin() << "    To: ";
908              if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
909              else dbgs() << "End";
910              dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
911 
912   // Save original instruction order before scheduling for possible revert.
913   Unsched.clear();
914   Unsched.reserve(DAG.NumRegionInstrs);
915   if (StageID == GCNSchedStageID::OccInitialSchedule ||
916       StageID == GCNSchedStageID::ILPInitialSchedule) {
917     for (auto &I : DAG) {
918       Unsched.push_back(&I);
919       if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
920           I.getOpcode() == AMDGPU::IGLP_OPT)
921         DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
922     }
923   } else {
924     for (auto &I : DAG)
925       Unsched.push_back(&I);
926   }
927 
928   PressureBefore = DAG.Pressure[RegionIdx];
929 
930   LLVM_DEBUG(
931       dbgs() << "Pressure before scheduling:\nRegion live-ins:"
932              << print(DAG.LiveIns[RegionIdx], DAG.MRI)
933              << "Region live-in pressure:  "
934              << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
935              << "Region register pressure: " << print(PressureBefore));
936 
937   S.HasHighPressure = false;
938   S.KnownExcessRP = isRegionWithExcessRP();
939 
940   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
941       StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
942     SavedMutations.clear();
943     SavedMutations.swap(DAG.Mutations);
944     bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
945                           StageID == GCNSchedStageID::ILPInitialSchedule;
946     DAG.addMutation(createIGroupLPDAGMutation(
947         IsInitialStage ? AMDGPU::SchedulingPhase::Initial
948                        : AMDGPU::SchedulingPhase::PreRAReentry));
949   }
950 
951   return true;
952 }
953 
954 bool UnclusteredHighRPStage::initGCNRegion() {
955   // Only reschedule regions with the minimum occupancy or regions that may have
956   // spilling (excess register pressure).
957   if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
958        DAG.MinOccupancy <= InitialOccupancy) &&
959       !DAG.RegionsWithExcessRP[RegionIdx])
960     return false;
961 
962   return GCNSchedStage::initGCNRegion();
963 }
964 
965 bool ClusteredLowOccStage::initGCNRegion() {
966   // We may need to reschedule this region if it wasn't rescheduled in the last
967   // stage, or if we found it was testing critical register pressure limits in
968   // the unclustered reschedule stage. The latter is because we may not have been
969   // able to raise the min occupancy in the previous stage so the region may be
970   // overly constrained even if it was already rescheduled.
971   if (!DAG.RegionsWithHighRP[RegionIdx])
972     return false;
973 
974   return GCNSchedStage::initGCNRegion();
975 }
976 
977 bool PreRARematStage::initGCNRegion() {
978   if (!DAG.RescheduleRegions[RegionIdx])
979     return false;
980 
981   return GCNSchedStage::initGCNRegion();
982 }
983 
984 void GCNSchedStage::setupNewBlock() {
985   if (CurrentMBB)
986     DAG.finishBlock();
987 
988   CurrentMBB = DAG.RegionBegin->getParent();
989   DAG.startBlock(CurrentMBB);
990   // Get real RP for the region if it hasn't been calculated before. After the
991   // initial schedule stage, real RP will be collected after scheduling.
992   if (StageID == GCNSchedStageID::OccInitialSchedule ||
993       StageID == GCNSchedStageID::ILPInitialSchedule)
994     DAG.computeBlockPressure(RegionIdx, CurrentMBB);
995 }
996 
997 void GCNSchedStage::finalizeGCNRegion() {
998   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
999   DAG.RescheduleRegions[RegionIdx] = false;
1000   if (S.HasHighPressure)
1001     DAG.RegionsWithHighRP[RegionIdx] = true;
1002 
1003   // Revert scheduling if we have dropped occupancy or there is some other
1004   // reason that the original schedule is better.
1005   checkScheduling();
1006 
1007   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1008       StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
1009     SavedMutations.swap(DAG.Mutations);
1010 
1011   DAG.exitRegion();
1012   RegionIdx++;
1013 }
1014 
1015 void GCNSchedStage::checkScheduling() {
1016   // Check the results of scheduling.
1017   PressureAfter = DAG.getRealRegPressure(RegionIdx);
1018   LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1019   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1020 
1021   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1022       PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1023     DAG.Pressure[RegionIdx] = PressureAfter;
1024     DAG.RegionsWithMinOcc[RegionIdx] =
1025         PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
1026 
1027     // Early out if we have achieved the occupancy target.
1028     LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1029     return;
1030   }
1031 
1032   unsigned TargetOccupancy =
1033       std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
1034   unsigned WavesAfter =
1035       std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
1036   unsigned WavesBefore =
1037       std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
1038   LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1039                     << ", after " << WavesAfter << ".\n");
1040 
1041   // We may not be able to keep the current target occupancy because of the just
1042   // scheduled region. We might still be able to revert scheduling if the
1043   // occupancy before was higher, or if the current schedule has register
1044   // pressure higher than the excess limits, which could lead to more spilling.
1045   unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1046 
1047   // Allow memory bound functions to drop to 4 waves if not limited by an
1048   // attribute.
1049   if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1050       WavesAfter >= MFI.getMinAllowedOccupancy()) {
1051     LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1052                       << MFI.getMinAllowedOccupancy() << " waves\n");
1053     NewOccupancy = WavesAfter;
1054   }
1055 
1056   if (NewOccupancy < DAG.MinOccupancy) {
1057     DAG.MinOccupancy = NewOccupancy;
1058     MFI.limitOccupancy(DAG.MinOccupancy);
1059     DAG.RegionsWithMinOcc.reset();
1060     LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1061                       << DAG.MinOccupancy << ".\n");
1062   }
1063   // The maximum number of arch VGPRs on a non-unified register file, or the
1064   // maximum number of VGPRs + AGPRs in the unified register file case.
1065   unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1066   // The maximum number of arch VGPRs for both unified and non-unified
1067   // register files.
1068   unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1069   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1070 
1071   if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1072       PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
1073       PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1074       PressureAfter.getSGPRNum() > MaxSGPRs) {
1075     DAG.RescheduleRegions[RegionIdx] = true;
1076     DAG.RegionsWithHighRP[RegionIdx] = true;
1077     DAG.RegionsWithExcessRP[RegionIdx] = true;
1078   }
1079 
1080   // Revert if this region's schedule would cause a drop in occupancy or
1081   // spilling.
1082   if (shouldRevertScheduling(WavesAfter)) {
1083     revertScheduling();
1084   } else {
1085     DAG.Pressure[RegionIdx] = PressureAfter;
1086     DAG.RegionsWithMinOcc[RegionIdx] =
1087         PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
1088   }
1089 }
1090 
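// Return the earliest cycle at which \p SU could issue, given the ready cycles
// already computed for its register-dependence predecessors plus their
// latencies. The result is also recorded in \p ReadyCycles.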
1091 unsigned
1092 GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1093                                       DenseMap<unsigned, unsigned> &ReadyCycles,
1094                                       const TargetSchedModel &SM) {
1095   unsigned ReadyCycle = CurrCycle;
1096   for (auto &D : SU.Preds) {
1097     if (D.isAssignedRegDep()) {
1098       MachineInstr *DefMI = D.getSUnit()->getInstr();
1099       unsigned Latency = SM.computeInstrLatency(DefMI);
1100       unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1101       ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1102     }
1103   }
1104   ReadyCycles[SU.NodeNum] = ReadyCycle;
1105   return ReadyCycle;
1106 }
1107 
1108 #ifndef NDEBUG
1109 struct EarlierIssuingCycle {
1110   bool operator()(std::pair<MachineInstr *, unsigned> A,
1111                   std::pair<MachineInstr *, unsigned> B) const {
1112     return A.second < B.second;
1113   }
1114 };
1115 
1116 static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1117                                         EarlierIssuingCycle> &ReadyCycles) {
1118   if (ReadyCycles.empty())
1119     return;
1120   unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1121   dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1122          << " ##################\n# Cycle #\t\t\tInstruction          "
1123             "             "
1124             "                            \n";
1125   unsigned IPrev = 1;
1126   for (auto &I : ReadyCycles) {
1127     if (I.second > IPrev + 1)
1128       dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1129              << " CYCLES DETECTED ******************************\n\n";
1130     dbgs() << "[ " << I.second << " ]  :  " << *I.first << "\n";
1131     IPrev = I.second;
1132   }
1133 }
1134 #endif
1135 
1136 ScheduleMetrics
1137 GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1138 #ifndef NDEBUG
1139   std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1140       ReadyCyclesSorted;
1141 #endif
1142   const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1143   unsigned SumBubbles = 0;
1144   DenseMap<unsigned, unsigned> ReadyCycles;
1145   unsigned CurrCycle = 0;
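  // Walk the schedule in order, assuming one instruction issues per cycle; any
  // gap between the current cycle and an instruction's ready cycle counts as a
  // latency bubble.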
1146   for (auto &SU : InputSchedule) {
1147     unsigned ReadyCycle =
1148         computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1149     SumBubbles += ReadyCycle - CurrCycle;
1150 #ifndef NDEBUG
1151     ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1152 #endif
1153     CurrCycle = ++ReadyCycle;
1154   }
1155 #ifndef NDEBUG
1156   LLVM_DEBUG(
1157       printScheduleModel(ReadyCyclesSorted);
1158       dbgs() << "\n\t"
1159              << "Metric: "
1160              << (SumBubbles
1161                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1162                      : 1)
1163              << "\n\n");
1164 #endif
1165 
1166   return ScheduleMetrics(CurrCycle, SumBubbles);
1167 }
1168 
1169 ScheduleMetrics
1170 GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
1171 #ifndef NDEBUG
1172   std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1173       ReadyCyclesSorted;
1174 #endif
1175   const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1176   unsigned SumBubbles = 0;
1177   DenseMap<unsigned, unsigned> ReadyCycles;
1178   unsigned CurrCycle = 0;
1179   for (auto &MI : DAG) {
1180     SUnit *SU = DAG.getSUnit(&MI);
1181     if (!SU)
1182       continue;
1183     unsigned ReadyCycle =
1184         computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1185     SumBubbles += ReadyCycle - CurrCycle;
1186 #ifndef NDEBUG
1187     ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1188 #endif
1189     CurrCycle = ++ReadyCycle;
1190   }
1191 #ifndef NDEBUG
1192   LLVM_DEBUG(
1193       printScheduleModel(ReadyCyclesSorted);
1194       dbgs() << "\n\t"
1195              << "Metric: "
1196              << (SumBubbles
1197                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1198                      : 1)
1199              << "\n\n");
1200 #endif
1201 
1202   return ScheduleMetrics(CurrCycle, SumBubbles);
1203 }
1204 
1205 bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1206   if (WavesAfter < DAG.MinOccupancy)
1207     return true;
1208 
1209   return false;
1210 }
1211 
1212 bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1213   if (PressureAfter == PressureBefore)
1214     return false;
1215 
1216   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1217     return true;
1218 
1219   if (mayCauseSpilling(WavesAfter))
1220     return true;
1221 
1222   return false;
1223 }
1224 
1225 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
1226   // If RP is not reduced in the unclustered reschedule stage, revert to the
1227   // old schedule.
1228   if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
1229        mayCauseSpilling(WavesAfter)) ||
1230       GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
1231     LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1232     return true;
1233   }
1234 
1235   // Do not attempt to relax schedule even more if we are already spilling.
1236   if (isRegionWithExcessRP())
1237     return false;
1238 
1239   LLVM_DEBUG(
1240       dbgs()
1241       << "\n\t      *** In shouldRevertScheduling ***\n"
1242       << "      *********** BEFORE UnclusteredHighRPStage ***********\n");
1243   ScheduleMetrics MBefore =
1244       getScheduleMetrics(DAG.SUnits);
1245   LLVM_DEBUG(
1246       dbgs()
1247       << "\n      *********** AFTER UnclusteredHighRPStage ***********\n");
1248   ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1249   unsigned OldMetric = MBefore.getMetric();
1250   unsigned NewMetric = MAfter.getMetric();
1251   unsigned WavesBefore =
1252       std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
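  // Profit folds together the relative occupancy change and the relative
  // latency-metric change, in ScaleFactor fixed point:
  //   Profit ~= (WavesAfter / WavesBefore) *
  //             ((OldMetric + ScheduleMetricBias) / NewMetric) * ScaleFactor
  // Illustrative numbers (not from a real run): WavesAfter = WavesBefore = 4,
  // OldMetric = 20, NewMetric = 40, Bias = 10 gives Profit = 75 < 100, so the
  // unclustered schedule is reverted.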
1253   unsigned Profit =
1254       ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1255        ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1256        NewMetric) /
1257       ScheduleMetrics::ScaleFactor;
1258   LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1259                     << MAfter << "Profit: " << Profit << "\n");
1260   return Profit < ScheduleMetrics::ScaleFactor;
1261 }
1262 
1263 bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
1264   if (PressureAfter == PressureBefore)
1265     return false;
1266 
1267   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1268     return true;
1269 
1270   if (mayCauseSpilling(WavesAfter))
1271     return true;
1272 
1273   return false;
1274 }
1275 
1276 bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
1277   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1278     return true;
1279 
1280   if (mayCauseSpilling(WavesAfter))
1281     return true;
1282 
1283   return false;
1284 }
1285 
1286 bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1287   if (mayCauseSpilling(WavesAfter))
1288     return true;
1289 
1290   return false;
1291 }
1292 
1293 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1294   if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
1295       !PressureAfter.less(MF, PressureBefore)) {
1296     LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1297     return true;
1298   }
1299 
1300   return false;
1301 }
1302 
1303 void GCNSchedStage::revertScheduling() {
1304   DAG.RegionsWithMinOcc[RegionIdx] =
1305       PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
1306   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1307   DAG.RescheduleRegions[RegionIdx] =
1308       S.hasNextStage() &&
1309       S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
1310   DAG.RegionEnd = DAG.RegionBegin;
1311   int SkippedDebugInstr = 0;
1312   for (MachineInstr *MI : Unsched) {
1313     if (MI->isDebugInstr()) {
1314       ++SkippedDebugInstr;
1315       continue;
1316     }
1317 
1318     if (MI->getIterator() != DAG.RegionEnd) {
1319       DAG.BB->remove(MI);
1320       DAG.BB->insert(DAG.RegionEnd, MI);
1321       if (!MI->isDebugInstr())
1322         DAG.LIS->handleMove(*MI, true);
1323     }
1324 
1325     // Reset read-undef flags and update them later.
1326     for (auto &Op : MI->all_defs())
1327       Op.setIsUndef(false);
1328     RegisterOperands RegOpers;
1329     RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1330     if (!MI->isDebugInstr()) {
1331       if (DAG.ShouldTrackLaneMasks) {
1332         // Adjust liveness and add missing dead+read-undef flags.
1333         SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
1334         RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
1335       } else {
1336         // Adjust for missing dead-def flags.
1337         RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1338       }
1339     }
1340     DAG.RegionEnd = MI->getIterator();
1341     ++DAG.RegionEnd;
1342     LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1343   }
1344 
1345   // After reverting the schedule, debug instrs will now be at the end of the
1346   // block and RegionEnd will point to the first debug instr. Increment RegionEnd
1347   // past the debug instrs to the actual end of the scheduling region.
1348   while (SkippedDebugInstr-- > 0)
1349     ++DAG.RegionEnd;
1350 
1351   // If the Unsched.front() instruction is a debug instruction, this would
1352   // actually shrink the region since we moved all debug instructions to the
1353   // end of the block. Find the first instruction that is not a debug instruction.
1354   DAG.RegionBegin = Unsched.front()->getIterator();
1355   if (DAG.RegionBegin->isDebugInstr()) {
1356     for (MachineInstr *MI : Unsched) {
1357       if (MI->isDebugInstr())
1358         continue;
1359       DAG.RegionBegin = MI->getIterator();
1360       break;
1361     }
1362   }
1363 
1364   // Then move the debug instructions back into their correct place and set
1365   // RegionBegin and RegionEnd if needed.
1366   DAG.placeDebugValues();
1367 
1368   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1369 }
1370 
1371 void PreRARematStage::collectRematerializableInstructions() {
1372   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
1373   for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
1374     Register Reg = Register::index2VirtReg(I);
1375     if (!DAG.LIS->hasInterval(Reg))
1376       continue;
1377 
1378     // TODO: Handle AGPR and SGPR rematerialization
1379     if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
1380         !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
1381       continue;
1382 
1383     MachineOperand *Op = DAG.MRI.getOneDef(Reg);
1384     MachineInstr *Def = Op->getParent();
1385     if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
1386       continue;
1387 
1388     MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
1389     if (Def->getParent() == UseI->getParent())
1390       continue;
1391 
1392     // We are only collecting defs that are defined in another block and are
1393     // live-through or used inside regions at MinOccupancy. This means that the
1394     // register must be in the live-in set for the region.
1395     bool AddedToRematList = false;
1396     for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1397       auto It = DAG.LiveIns[I].find(Reg);
1398       if (It != DAG.LiveIns[I].end() && !It->second.none()) {
1399         if (DAG.RegionsWithMinOcc[I]) {
1400           RematerializableInsts[I][Def] = UseI;
1401           AddedToRematList = true;
1402         }
1403 
1404         // Collect regions with rematerializable reg as live-in to avoid
1405         // searching later when updating RP.
1406         RematDefToLiveInRegions[Def].push_back(I);
1407       }
1408     }
1409     if (!AddedToRematList)
1410       RematDefToLiveInRegions.erase(Def);
1411   }
1412 }
1413 
1414 bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
1415                                               const TargetInstrInfo *TII) {
1416   // Temporary copies of cached variables we will be modifying and replacing if
1417   // sinking succeeds.
1418   SmallVector<
1419       std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
1420       NewRegions;
1421   DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
1422   DenseMap<unsigned, GCNRegPressure> NewPressure;
1423   BitVector NewRescheduleRegions;
1424   LiveIntervals *LIS = DAG.LIS;
1425 
1426   NewRegions.resize(DAG.Regions.size());
1427   NewRescheduleRegions.resize(DAG.Regions.size());
1428 
1429   // Collect only regions that have a rematerializable def as a live-in.
1430   SmallSet<unsigned, 16> ImpactedRegions;
1431   for (const auto &It : RematDefToLiveInRegions)
1432     ImpactedRegions.insert(It.second.begin(), It.second.end());
1433 
1434   // Make copies of register pressure and live-ins cache that will be updated
1435   // as we rematerialize.
1436   for (auto Idx : ImpactedRegions) {
1437     NewPressure[Idx] = DAG.Pressure[Idx];
1438     NewLiveIns[Idx] = DAG.LiveIns[Idx];
1439   }
1440   NewRegions = DAG.Regions;
1441   NewRescheduleRegions.reset();
1442 
1443   DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
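       // Map each newly rematerialized instruction to the original def it replaces
       // so the transformation can later be finalized or fully undone.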
1444   bool Improved = false;
1445   for (auto I : ImpactedRegions) {
1446     if (!DAG.RegionsWithMinOcc[I])
1447       continue;
1448 
1449     Improved = false;
1450     int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
1451     int SGPRUsage = NewPressure[I].getSGPRNum();
1452 
1453     // TODO: Handle occupancy drop due to AGPR and SGPR.
1454     // Check that the occupancy drop is due to VGPR usage and not SGPRs: if
         // SGPRs alone already limit occupancy, sinking VGPR defs cannot help.
1455     if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
1456       break;
1457 
1458     // The occupancy of this region could have been improved by a previous
1459     // iteration's sinking of defs.
1460     if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
1461       NewRescheduleRegions[I] = true;
1462       Improved = true;
1463       continue;
1464     }
1465 
1466     // First check if we have enough trivially rematerializable instructions to
1467     // improve occupancy. Optimistically assume every instruction we are able to
1468     // sink decreases RP.
1469     int TotalSinkableRegs = 0;
1470     for (const auto &It : RematerializableInsts[I]) {
1471       MachineInstr *Def = It.first;
1472       Register DefReg = Def->getOperand(0).getReg();
1473       TotalSinkableRegs +=
1474           SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
1475     }
1476     int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
1477     unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
1478     // If, even in the most optimistic scenario, we cannot improve occupancy,
1479     // do not attempt to sink any instructions.
1480     if (OptimisticOccupancy <= DAG.MinOccupancy)
1481       break;
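         // For example (illustrative numbers, assuming gfx9-like VGPR limits where
         // 64 VGPRs allow 4 waves but 72 allow only 3): a region using 72 VGPRs
         // with 12 registers' worth of sinkable live-in defs gets an optimistic
         // estimate of 60 VGPRs and sinking is attempted, whereas with only 4
         // sinkable registers the estimate of 68 VGPRs cannot raise occupancy and
         // we stop early.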
1482 
1483     unsigned ImproveOccupancy = 0;
1484     SmallVector<MachineInstr *, 4> SinkedDefs;
1485     for (auto &It : RematerializableInsts[I]) {
1486       MachineInstr *Def = It.first;
1487       MachineBasicBlock::iterator InsertPos =
1488           MachineBasicBlock::iterator(It.second);
1489       Register Reg = Def->getOperand(0).getReg();
1490       // Rematerialize MI to its use block. Since we are only rematerializing
1491       // instructions that do not have any virtual reg uses, we do not need to
1492       // call LiveRangeEdit::allUsesAvailableAt() and
1493       // LiveRangeEdit::canRematerializeAt().
1494       TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
1495                          Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
1496       MachineInstr *NewMI = &*std::prev(InsertPos);
1497       LIS->InsertMachineInstrInMaps(*NewMI);
1498       LIS->removeInterval(Reg);
1499       LIS->createAndComputeVirtRegInterval(Reg);
1500       InsertedMIToOldDef[NewMI] = Def;
1501 
1502       // Update region boundaries in the scheduling region we sank from, since
1503       // we may sink an instruction that was at the beginning or end of its region.
1504       DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
1505                                  /*Removing =*/true);
1506 
1507       // Update region boundaries in the region we sank to.
1508       DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
1509 
1510       LaneBitmask PrevMask = NewLiveIns[I][Reg];
1511       // FIXME: Also update cached pressure for where the def was sunk from.
1512       // Update RP for all regions that have this reg as a live-in and remove
1513       // the reg from all regions as a live-in.
1514       for (auto Idx : RematDefToLiveInRegions[Def]) {
1515         NewLiveIns[Idx].erase(Reg);
1516         if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
1517           // Def is live-through and not used in this block.
1518           NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
1519         } else {
1520           // Def is used and rematerialized into this block.
1521           GCNDownwardRPTracker RPT(*LIS);
1522           auto *NonDbgMI = &*skipDebugInstructionsForward(
1523               NewRegions[Idx].first, NewRegions[Idx].second);
1524           RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
1525           RPT.advance(NewRegions[Idx].second);
1526           NewPressure[Idx] = RPT.moveMaxPressure();
1527         }
1528       }
1529 
1530       SinkedDefs.push_back(Def);
1531       ImproveOccupancy = NewPressure[I].getOccupancy(ST);
1532       if (ImproveOccupancy > DAG.MinOccupancy)
1533         break;
1534     }
1535 
1536     // Remove the defs we just sank from every region's list of sinkable defs.
1537     for (auto &Def : SinkedDefs)
1538       for (auto TrackedIdx : RematDefToLiveInRegions[Def])
1539         RematerializableInsts[TrackedIdx].erase(Def);
1540 
1541     if (ImproveOccupancy <= DAG.MinOccupancy)
1542       break;
1543 
1544     NewRescheduleRegions[I] = true;
1545     Improved = true;
1546   }
1547 
1548   if (!Improved) {
1549     // Occupancy was not improved for all regions that were at MinOccupancy.
1550     // Undo sinking and remove newly rematerialized instructions.
1551     for (auto &Entry : InsertedMIToOldDef) {
1552       MachineInstr *MI = Entry.first;
1553       MachineInstr *OldMI = Entry.second;
1554       Register Reg = MI->getOperand(0).getReg();
1555       LIS->RemoveMachineInstrFromMaps(*MI);
1556       MI->eraseFromParent();
1557       OldMI->clearRegisterDeads(Reg);
1558       LIS->removeInterval(Reg);
1559       LIS->createAndComputeVirtRegInterval(Reg);
1560     }
1561     return false;
1562   }
1563 
1564   // Occupancy was improved for all regions.
1565   for (auto &Entry : InsertedMIToOldDef) {
1566     MachineInstr *MI = Entry.first;
1567     MachineInstr *OldMI = Entry.second;
1568 
1569     // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
1570     DAG.BBLiveInMap.erase(OldMI);
1571 
1572     // Remove OldMI and update LIS
1573     Register Reg = MI->getOperand(0).getReg();
1574     LIS->RemoveMachineInstrFromMaps(*OldMI);
1575     OldMI->eraseFromParent();
1576     LIS->removeInterval(Reg);
1577     LIS->createAndComputeVirtRegInterval(Reg);
1578   }
1579 
1580   // Update live-ins, register pressure, and regions caches.
1581   for (auto Idx : ImpactedRegions) {
1582     DAG.LiveIns[Idx] = NewLiveIns[Idx];
1583     DAG.Pressure[Idx] = NewPressure[Idx];
1584     DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
1585   }
1586   DAG.Regions = NewRegions;
1587   DAG.RescheduleRegions = NewRescheduleRegions;
1588 
1589   SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
1590   MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
1591 
1592   return true;
1593 }
1594 
1595 // Copied from MachineLICM
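     // For instance, a V_MOV_B32 that materializes an immediate has no virtual
     // register uses, so it can simply be re-created next to its single user
     // without extending any other live range.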
1596 bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
1597   if (!DAG.TII->isTriviallyReMaterializable(MI))
1598     return false;
1599 
1600   for (const MachineOperand &MO : MI.all_uses())
1601     if (MO.getReg().isVirtual())
1602       return false;
1603 
1604   return true;
1605 }
1606 
1607 // When removing, we have to check both the beginning and the end of the region.
1608 // When inserting, we only have to check whether we are inserting NewMI in front
1609 // of a scheduling region; there is no need to check the end since we will only
1610 // ever be inserting before an already existing MI.
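     // For example, removing a region's only instruction empties the region;
     // removing the instruction at RegionBegin advances RegionBegin by one;
     // inserting NewMI directly in front of RegionBegin makes NewMI the new
     // RegionBegin; and removing the instruction at RegionEnd moves RegionEnd
     // back by one.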
1611 void GCNScheduleDAGMILive::updateRegionBoundaries(
1612     SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
1613                               MachineBasicBlock::iterator>> &RegionBoundaries,
1614     MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
1615   unsigned I = 0, E = RegionBoundaries.size();
1616   // Search for the first region of the block where MI is located.
1617   while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
1618     ++I;
1619 
1620   for (; I != E; ++I) {
1621     if (MI->getParent() != RegionBoundaries[I].first->getParent())
1622       return;
1623 
1624     if (Removing && MI == RegionBoundaries[I].first &&
1625         MI == RegionBoundaries[I].second) {
1626       // MI is in a region of size 1; after removal the region will have size 0.
1627       // Set RegionBegin and RegionEnd to the block's past-the-end iterator.
1628       RegionBoundaries[I] =
1629           std::pair(MI->getParent()->end(), MI->getParent()->end());
1630       return;
1631     }
1632     if (MI == RegionBoundaries[I].first) {
1633       if (Removing)
1634         RegionBoundaries[I] =
1635             std::pair(std::next(MI), RegionBoundaries[I].second);
1636       else
1637         // NewMI was inserted in front of the region; make it the new RegionBegin.
1638         RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
1639                                         RegionBoundaries[I].second);
1640       return;
1641     }
1642     if (Removing && MI == RegionBoundaries[I].second) {
1643       RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
1644       return;
1645     }
1646   }
1647 }
1648 
1649 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
1650   return std::any_of(
1651       DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) {
1652         unsigned Opc = MI->getOpcode();
1653         return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
1654       });
1655 }
1656 
1657 GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
1658     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
1659     bool RemoveKillFlags)
1660     : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
1661 
1662 void GCNPostScheduleDAGMILive::schedule() {
1663   HasIGLPInstrs = hasIGLPInstrs(this);
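       // For regions containing IGLP intrinsics, temporarily swap in the IGroupLP
       // DAG mutation for the post-RA phase; finalizeSchedule() restores the saved
       // mutations.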
1664   if (HasIGLPInstrs) {
1665     SavedMutations.clear();
1666     SavedMutations.swap(Mutations);
1667     addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
1668   }
1669 
1670   ScheduleDAGMI::schedule();
1671 }
1672 
1673 void GCNPostScheduleDAGMILive::finalizeSchedule() {
1674   if (HasIGLPInstrs)
1675     SavedMutations.swap(Mutations);
1676 
1677   ScheduleDAGMI::finalizeSchedule();
1678 }
1679