//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
///
/// This pass will apply multiple scheduling stages to the same function.
/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
/// entry point for the scheduling of those regions is
/// GCNScheduleDAGMILive::runSchedStages.
///
/// Generally, the reason for having multiple scheduling stages is to account
/// for the kernel-wide effect of register usage on occupancy. Usually, only a
/// few scheduling regions will have register pressure high enough to limit
/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
/// other regions.
///
//===----------------------------------------------------------------------===//

#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"

#define DEBUG_TYPE "machine-scheduler"

using namespace llvm;

static cl::opt<bool> DisableUnclusterHighRP(
    "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
    cl::desc("Disable unclustered high register pressure "
             "reduction scheduling stage."),
    cl::init(false));

static cl::opt<bool> DisableClusteredLowOccupancy(
    "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
    cl::desc("Disable clustered low occupancy "
             "rescheduling for ILP scheduling stage."),
    cl::init(false));

static cl::opt<unsigned> ScheduleMetricBias(
    "amdgpu-schedule-metric-bias", cl::Hidden,
    cl::desc(
        "Sets the bias which adds weight to occupancy vs latency. Set it to "
        "100 to chase the occupancy only."),
    cl::init(10));

static cl::opt<bool>
    RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
               cl::desc("Relax occupancy targets for kernels which are memory "
                        "bound (amdgpu-membound-threshold), or "
                        "Wave Limited (amdgpu-limit-wave-threshold)."),
               cl::init(false));

const unsigned ScheduleMetrics::ScaleFactor = 100;

GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
      HasHighPressure(false) {}

void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
  GenericScheduler::initialize(DAG);

  MF = &DAG->MF;

  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  SGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
  VGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);

  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  // Allow for lower occupancy targets if the kernel is wave limited or memory
  // bound, and the relaxed occupancy feature is enabled.
  TargetOccupancy =
      RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
  SGPRCriticalLimit =
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);

  if (!KnownExcessRP) {
    VGPRCriticalLimit =
        std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
  } else {
    // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result except
    // that it returns a reasonably small number for targets with lots of
    // VGPRs, such as GFX10 and GFX11.
    LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
                         "VGPRCriticalLimit calculation method.\n");

    unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
    unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
    unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
    VGPRBudget = std::max(VGPRBudget, Granule);
    VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
  }

  // Subtract error margin and bias from register limits and avoid overflow.
  SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
  VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
  SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
  VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);

  LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
                    << ", VGPRExcessLimit = " << VGPRExcessLimit
                    << ", SGPRCriticalLimit = " << SGPRCriticalLimit
                    << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
}

/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
/// current register pressure.
///
/// This works for the common case, but it has a few exceptions that have been
/// observed through trial and error:
///   - Explicit physical register operands
///   - Subregister definitions
///
/// In both of those cases, PressureDiff doesn't represent the actual pressure,
/// and querying LiveIntervals through the RegPressureTracker is needed to get
/// an accurate value.
///
/// We should eventually only use PressureDiff for maximum performance, but this
/// already allows 80% of SUs to take the fast path without changing scheduling
/// at all. Further changes would either change scheduling, or require a lot
/// more logic to recover an accurate pressure estimate from the PressureDiffs.
static bool canUsePressureDiffs(const SUnit &SU) {
  if (!SU.isInstr())
    return false;

  // Cannot use pressure diffs for subregister defs or with physregs; they are
  // imprecise in both cases.
  for (const auto &Op : SU.getInstr()->operands()) {
    if (!Op.isReg() || Op.isImplicit())
      continue;
    if (Op.getReg().isPhysical() ||
        (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
      return false;
  }
  return true;
}

static void getRegisterPressures(bool AtTop,
                                 const RegPressureTracker &RPTracker, SUnit *SU,
                                 std::vector<unsigned> &Pressure,
                                 std::vector<unsigned> &MaxPressure) {
  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const copy.
  RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
  if (AtTop)
    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
  else
    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
}

void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                     bool AtTop,
                                     const RegPressureTracker &RPTracker,
                                     const SIRegisterInfo *SRI,
                                     unsigned SGPRPressure,
                                     unsigned VGPRPressure, bool IsBottomUp) {
  Cand.SU = SU;
  Cand.AtTop = AtTop;

  if (!DAG->isTrackingPressure())
    return;

  Pressure.clear();
  MaxPressure.clear();

  // We try to use the cached PressureDiffs in the ScheduleDAG whenever
  // possible over querying the RegPressureTracker.
  //
  // RegPressureTracker will make a lot of LIS queries which are very
  // expensive; it is considered a slow function in this context.
  //
  // PressureDiffs are precomputed and cached, and getPressureDiff is just a
  // trivial lookup into an array. It is pretty much free.
  //
  // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
  // PressureDiffs.
  if (AtTop || !canUsePressureDiffs(*SU)) {
    getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure);
  } else {
    // Reserve 4 slots.
    Pressure.resize(4, 0);
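    // Seed the two pressure sets that are read back below (SReg_32 and
    // VGPR_32) with the current values, then apply the cached diffs on top.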
    Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
    Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;

    for (const auto &Diff : DAG->getPressureDiff(SU)) {
      if (!Diff.isValid())
        continue;
      // PressureDiffs is always bottom-up so if we're working top-down we need
      // to invert its sign.
      Pressure[Diff.getPSet()] +=
          (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
    }

#ifdef EXPENSIVE_CHECKS
    std::vector<unsigned> CheckPressure, CheckMaxPressure;
    getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
    if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
            CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
        Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
            CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
      errs() << "Register Pressure is inaccurate when calculated through "
                "PressureDiff\n"
             << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
             << ", expected "
             << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
             << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
             << ", expected "
             << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
      report_fatal_error("inaccurate register pressure calculation");
    }
#endif
  }

  unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];

  // If two instructions increase the pressure of different register sets
  // by the same amount, the generic scheduler will prefer to schedule the
  // instruction that increases the set with the least amount of registers,
  // which in our case would be SGPRs. This is rarely what we want, so
  // when we report excess/critical register pressure, we do it either
  // only for VGPRs or only for SGPRs.

  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
  const unsigned MaxVGPRPressureInc = 16;
  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;

  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
  // to increase the likelihood we don't go over the limits. We should improve
  // the analysis to look through dependencies to find the path with the least
  // register pressure.

  // We only need to update the RPDelta for instructions that increase register
  // pressure. Instructions that decrease or keep reg pressure the same will be
  // marked as RegExcess in tryCandidate() when they are compared with
  // instructions that increase the register pressure.
  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
  }

  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
  }

  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
  // have the same cost, so we don't need to prefer one over the other.

  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;

  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
    HasHighPressure = true;
    if (SGPRDelta > VGPRDelta) {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
    } else {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                         const CandPolicy &ZonePolicy,
                                         const RegPressureTracker &RPTracker,
                                         SchedCandidate &Cand,
                                         bool IsBottomUp) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = 0;
  unsigned VGPRPressure = 0;
  if (DAG->isTrackingPressure()) {
    SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
    VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
  }
  ReadyQueue &Q = Zone.Available;
  for (SUnit *SU : Q) {

    SchedCandidate TryCand(ZonePolicy);
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                  VGPRPressure, IsBottomUp);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    tryCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
      Cand.setBest(TryCand);
      LLVM_DEBUG(traceCandidate(Cand));
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
  // Schedule as far as possible in the direction of no choice. This is most
  // efficient, but also provides the best heuristics for CriticalPSets.
  if (SUnit *SU = Bot.pickOnlyChoice()) {
    IsTopNode = false;
    return SU;
  }
  if (SUnit *SU = Top.pickOnlyChoice()) {
    IsTopNode = true;
    return SU;
  }
  // Set the bottom-up policy based on the state of the current bottom zone and
  // the instructions outside the zone, including the top zone.
  CandPolicy BotPolicy;
  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
  // Set the top-down policy based on the state of the current top zone and
  // the instructions outside the zone, including the bottom zone.
  CandPolicy TopPolicy;
  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);

  // See if BotCand is still valid (because we previously scheduled from Top).
  LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
  if (!BotCand.isValid() || BotCand.SU->isScheduled ||
      BotCand.Policy != BotPolicy) {
    BotCand.reset(CandPolicy());
    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
                      /*IsBottomUp=*/true);
    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(BotCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
                        /*IsBottomUp=*/true);
      assert(TCand.SU == BotCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Check if the top Q has a better candidate.
  LLVM_DEBUG(dbgs() << "Picking from Top:\n");
  if (!TopCand.isValid() || TopCand.SU->isScheduled ||
      TopCand.Policy != TopPolicy) {
    TopCand.reset(CandPolicy());
    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
                      /*IsBottomUp=*/false);
    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(TopCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
                        /*IsBottomUp=*/false);
      assert(TCand.SU == TopCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Pick best from BotCand and TopCand.
  LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
             dbgs() << "Bot Cand: "; traceCandidate(BotCand););
  SchedCandidate Cand = BotCand;
  TopCand.Reason = NoCand;
  tryCandidate(Cand, TopCand, nullptr);
  if (TopCand.Reason != NoCand) {
    Cand.setBest(TopCand);
  }
  LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););

  IsTopNode = Cand.AtTop;
  return Cand.SU;
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  SUnit *SU;
  do {
    if (RegionPolicy.OnlyTopDown) {
      SU = Top.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        TopCand.reset(NoPolicy);
        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
                          /*IsBottomUp=*/false);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
        SU = TopCand.SU;
      }
      IsTopNode = true;
    } else if (RegionPolicy.OnlyBottomUp) {
      SU = Bot.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        BotCand.reset(NoPolicy);
        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
                          /*IsBottomUp=*/true);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
        SU = BotCand.SU;
      }
      IsTopNode = false;
    } else {
      SU = pickNodeBidirectional(IsTopNode);
    }
  } while (SU->isScheduled);

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  return SU;
}

GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
  assert(CurrentStage && CurrentStage != SchedStages.end());
  return *CurrentStage;
}

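// Move to the first stage on the first call, and to the next stage on
// subsequent calls; returns false once all stages have been run.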
bool GCNSchedStrategy::advanceStage() {
  assert(CurrentStage != SchedStages.end());
  if (!CurrentStage)
    CurrentStage = SchedStages.begin();
  else
    CurrentStage++;

  return CurrentStage != SchedStages.end();
}

bool GCNSchedStrategy::hasNextStage() const {
  assert(CurrentStage);
  return std::next(CurrentStage) != SchedStages.end();
}

GCNSchedStageID GCNSchedStrategy::getNextStage() const {
  assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
  return *std::next(CurrentStage);
}

GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
    const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
  SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
  SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
}

GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
}

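// Candidate comparison for the max-ILP strategy. Unlike the generic
// scheduler's ordering, latency is considered before clustering and critical
// register pressure, trading register pressure for more ILP.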
bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
                                          SchedCandidate &TryCand,
                                          SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Avoid spilling by exceeding the register limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                  RegExcess, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Bias PhysReg defs and copies towards their uses and definitions,
  // respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;

  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // Prioritize instructions that read unbuffered resources by stall cycles.
    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
      return TryCand.Reason != NoCand;

    // Avoid critical resource consumption and balance the schedule.
    TryCand.initResourceDelta(DAG, SchedModel);
    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
                TryCand, Cand, ResourceReduce))
      return TryCand.Reason != NoCand;
    if (tryGreater(TryCand.ResDelta.DemandedResources,
                   Cand.ResDelta.DemandedResources, TryCand, Cand,
                   ResourceDemand))
      return TryCand.Reason != NoCand;

    // Unconditionally try to reduce latency.
    if (tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;

    // Weak edges are for clustering and other constraints.
    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
      return TryCand.Reason != NoCand;
  }

  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  const SUnit *CandNextClusterSU =
      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  const SUnit *TryCandNextClusterSU =
      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max critical pressure in the scheduled region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
                  TryCand, Cand, RegCritical, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Fall through to original instruction order.
    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
      TryCand.Reason = NodeOrder;
      return true;
    }
  }
  return false;
}

GCNScheduleDAGMILive::GCNScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
    : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {

  LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
  if (RelaxedOcc) {
    MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
    if (MinOccupancy != StartingOccupancy)
      LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
                        << ".\n");
  }
}

std::unique_ptr<GCNSchedStage>
GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
  switch (SchedStageID) {
  case GCNSchedStageID::OccInitialSchedule:
    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
  case GCNSchedStageID::PreRARematerialize:
    return std::make_unique<PreRARematStage>(SchedStageID, *this);
  case GCNSchedStageID::ILPInitialSchedule:
    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
  }

  llvm_unreachable("Unknown SchedStageID.");
}

void GCNScheduleDAGMILive::schedule() {
  // Collect all scheduling regions. The actual scheduling is performed in
  // GCNScheduleDAGMILive::finalizeSchedule.
  Regions.push_back(std::pair(RegionBegin, RegionEnd));
}

GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
  GCNDownwardRPTracker RPTracker(*LIS);
  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
  return RPTracker.moveMaxPressure();
}

void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
                                                const MachineBasicBlock *MBB) {
  GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has only one successor then live-ins of that successor are
  // live-outs of the current block. We can reuse the calculated live set if
  // the successor will be sent to scheduling past the current block.

  // However, due to a bug in LiveInterval analysis it may happen that two
  // predecessors of the same successor block have different lane bitmasks for
  // a live-out register. Work around that by sticking to a one-to-one
  // relationship, i.e. one predecessor with one successor block.
  const MachineBasicBlock *OnlySucc = nullptr;
  if (MBB->succ_size() == 1) {
    auto *Candidate = *MBB->succ_begin();
    if (!Candidate->empty() && Candidate->pred_size() == 1) {
      SlotIndexes *Ind = LIS->getSlotIndexes();
      if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
        OnlySucc = Candidate;
    }
  }

  // The scheduler sends regions from the end of the block upwards.
  size_t CurRegion = RegionIdx;
  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
    if (Regions[CurRegion].first->getParent() != MBB)
      break;
  --CurRegion;
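  // CurRegion now refers to the topmost region of MBB: regions within a block
  // are recorded bottom-up, so it has the highest index for this block.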

  auto I = MBB->begin();
  auto LiveInIt = MBBLiveIns.find(MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
  if (LiveInIt != MBBLiveIns.end()) {
    auto LiveIn = std::move(LiveInIt->second);
    RPTracker.reset(*MBB->begin(), &LiveIn);
    MBBLiveIns.erase(LiveInIt);
  } else {
    I = Rgn.first;
    auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(*I, &LRS);
  }

  for (;;) {
    I = RPTracker.getNext();

    if (Regions[CurRegion].first == I || NonDbgMI == I) {
      LiveIns[CurRegion] = RPTracker.getLiveRegs();
      RPTracker.clearMaxPressure();
    }

    if (Regions[CurRegion].second == I) {
      Pressure[CurRegion] = RPTracker.moveMaxPressure();
      if (CurRegion-- == RegionIdx)
        break;
    }
    RPTracker.advanceToNext();
    RPTracker.advanceBeforeNext();
  }

  if (OnlySucc) {
    if (I != MBB->end()) {
      RPTracker.advanceToNext();
      RPTracker.advance(MBB->end());
    }
    RPTracker.advanceBeforeNext();
    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
  }
}

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getBBLiveInMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> BBStarters;
  BBStarters.reserve(Regions.size());
  auto I = Regions.rbegin(), E = Regions.rend();
  auto *BB = I->first->getParent();
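  // Collect the starting (first non-debug) instructions of region groups;
  // getLiveRegMap computes the live registers before these instructions,
  // which later seed the RP trackers in computeBlockPressure.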
  do {
    auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
    BBStarters.push_back(MI);
    do {
      ++I;
    } while (I != E && I->first->getParent() == BB);
  } while (I != E);
  return getLiveRegMap(BBStarters, false /*After*/, *LIS);
}

void GCNScheduleDAGMILive::finalizeSchedule() {
  // Start actual scheduling here. This function is called by the base
  // MachineScheduler after all regions have been recorded by
  // GCNScheduleDAGMILive::schedule().
  LiveIns.resize(Regions.size());
  Pressure.resize(Regions.size());
  RescheduleRegions.resize(Regions.size());
  RegionsWithHighRP.resize(Regions.size());
  RegionsWithExcessRP.resize(Regions.size());
  RegionsWithMinOcc.resize(Regions.size());
  RegionsWithIGLPInstrs.resize(Regions.size());
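  // Every region starts out as a reschedule candidate; the other tracking bit
  // vectors stay cleared until scheduling proves otherwise.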
  RescheduleRegions.set();
  RegionsWithHighRP.reset();
  RegionsWithExcessRP.reset();
  RegionsWithMinOcc.reset();
  RegionsWithIGLPInstrs.reset();

  runSchedStages();
}

void GCNScheduleDAGMILive::runSchedStages() {
  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

  if (!Regions.empty())
    BBLiveInMap = getBBLiveInMap();

  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
  while (S.advanceStage()) {
    auto Stage = createSchedStage(S.getCurrentStage());
    if (!Stage->initGCNSchedStage())
      continue;

    for (auto Region : Regions) {
      RegionBegin = Region.first;
      RegionEnd = Region.second;
      // Setup for scheduling the region and check whether it should be
      // skipped.
      if (!Stage->initGCNRegion()) {
        Stage->advanceRegion();
        exitRegion();
        continue;
      }

      ScheduleDAGMILive::schedule();
      Stage->finalizeGCNRegion();
    }

    Stage->finalizeGCNSchedStage();
  }
}

#ifndef NDEBUG
raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
  switch (StageID) {
  case GCNSchedStageID::OccInitialSchedule:
    OS << "Max Occupancy Initial Schedule";
    break;
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    OS << "Unclustered High Register Pressure Reschedule";
    break;
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    OS << "Clustered Low Occupancy Reschedule";
    break;
  case GCNSchedStageID::PreRARematerialize:
    OS << "Pre-RA Rematerialize";
    break;
  case GCNSchedStageID::ILPInitialSchedule:
    OS << "Max ILP Initial Schedule";
    break;
  }

  return OS;
}
#endif

GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
    : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
      MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}

bool GCNSchedStage::initGCNSchedStage() {
  if (!DAG.LIS)
    return false;

  LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
  return true;
}

bool UnclusteredHighRPStage::initGCNSchedStage() {
  if (DisableUnclusterHighRP)
    return false;

  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
    return false;

  SavedMutations.swap(DAG.Mutations);
  DAG.addMutation(
      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));

  InitialOccupancy = DAG.MinOccupancy;
  // Aggressively try to reduce register pressure in the unclustered high RP
  // stage. Temporarily increase the occupancy target in the region.
  S.SGPRLimitBias = S.HighRPSGPRBias;
  S.VGPRLimitBias = S.HighRPVGPRBias;
  if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
    MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  LLVM_DEBUG(
      dbgs()
      << "Retrying function scheduling without clustering. "
         "Aggressively try to reduce register pressure to achieve occupancy "
      << DAG.MinOccupancy << ".\n");

  return true;
}

bool ClusteredLowOccStage::initGCNSchedStage() {
  if (DisableClusteredLowOccupancy)
    return false;

  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  // Don't bother trying to improve ILP in lower RP regions if occupancy has
  // not been dropped. All regions will have already been scheduled with the
  // ideal occupancy targets.
  if (DAG.StartingOccupancy <= DAG.MinOccupancy)
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with lowest recorded occupancy "
             << DAG.MinOccupancy << ".\n");
  return true;
}

bool PreRARematStage::initGCNSchedStage() {
  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
    return false;

  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
  // Check maximum occupancy.
  if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
      DAG.MinOccupancy)
    return false;

  // FIXME: This pass will invalidate cached MBBLiveIns for regions
  // in between the defs and the region we sank the def to. Cached pressure
  // for regions where a def is sunk from will also be invalidated. This will
  // need to be fixed if there is another pass after this pass.
  assert(!S.hasNextStage());

  collectRematerializableInstructions();
  if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with improved occupancy of "
             << DAG.MinOccupancy << " from rematerializing\n");
  return true;
}

void GCNSchedStage::finalizeGCNSchedStage() {
  DAG.finishBlock();
  LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}

void UnclusteredHighRPStage::finalizeGCNSchedStage() {
  SavedMutations.swap(DAG.Mutations);
  S.SGPRLimitBias = S.VGPRLimitBias = 0;
  if (DAG.MinOccupancy > InitialOccupancy) {
    for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
      DAG.RegionsWithMinOcc[IDX] =
          DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;

    LLVM_DEBUG(dbgs() << StageID
                      << " stage successfully increased occupancy to "
                      << DAG.MinOccupancy << '\n');
  }

  GCNSchedStage::finalizeGCNSchedStage();
}

bool GCNSchedStage::initGCNRegion() {
  // Check whether this new region is also a new block.
  if (DAG.RegionBegin->getParent() != CurrentMBB)
    setupNewBlock();

  unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
  DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);

  // Skip empty scheduling regions (0 or 1 schedulable instructions).
  if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
    return false;

  LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
  LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
                    << " " << CurrentMBB->getName()
                    << "\n From: " << *DAG.begin() << " To: ";
             if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
             else dbgs() << "End";
             dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

  // Save original instruction order before scheduling for possible revert.
  Unsched.clear();
  Unsched.reserve(DAG.NumRegionInstrs);
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule) {
    for (auto &I : DAG) {
      Unsched.push_back(&I);
      if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
          I.getOpcode() == AMDGPU::IGLP_OPT)
        DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
    }
  } else {
    for (auto &I : DAG)
      Unsched.push_back(&I);
  }

  PressureBefore = DAG.Pressure[RegionIdx];

  LLVM_DEBUG(
      dbgs() << "Pressure before scheduling:\nRegion live-ins:"
             << print(DAG.LiveIns[RegionIdx], DAG.MRI)
             << "Region live-in pressure: "
             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
             << "Region register pressure: " << print(PressureBefore));

  S.HasHighPressure = false;
  S.KnownExcessRP = isRegionWithExcessRP();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
    SavedMutations.clear();
    SavedMutations.swap(DAG.Mutations);
    bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
                          StageID == GCNSchedStageID::ILPInitialSchedule;
    DAG.addMutation(createIGroupLPDAGMutation(
        IsInitialStage ? AMDGPU::SchedulingPhase::Initial
                       : AMDGPU::SchedulingPhase::PreRAReentry));
  }

  return true;
}

bool UnclusteredHighRPStage::initGCNRegion() {
  // Only reschedule regions with the minimum occupancy or regions that may
  // have spilling (excess register pressure).
  if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
       DAG.MinOccupancy <= InitialOccupancy) &&
      !DAG.RegionsWithExcessRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool ClusteredLowOccStage::initGCNRegion() {
  // We may need to reschedule this region if it wasn't rescheduled in the last
  // stage, or if we found it was testing critical register pressure limits in
  // the unclustered reschedule stage. The latter is because we may not have
  // been able to raise the min occupancy in the previous stage, so the region
  // may be overly constrained even if it was already rescheduled.
  if (!DAG.RegionsWithHighRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool PreRARematStage::initGCNRegion() {
  if (!DAG.RescheduleRegions[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

void GCNSchedStage::setupNewBlock() {
  if (CurrentMBB)
    DAG.finishBlock();

  CurrentMBB = DAG.RegionBegin->getParent();
  DAG.startBlock(CurrentMBB);
  // Get the real RP for the region if it hasn't been calculated before. After
  // the initial schedule stage, real RP will be collected after scheduling.
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule)
    DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}

void GCNSchedStage::finalizeGCNRegion() {
  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
  DAG.RescheduleRegions[RegionIdx] = false;
  if (S.HasHighPressure)
    DAG.RegionsWithHighRP[RegionIdx] = true;

  // Revert scheduling if we have dropped occupancy or there is some other
  // reason that the original schedule is better.
  checkScheduling();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
    SavedMutations.swap(DAG.Mutations);

  DAG.exitRegion();
  RegionIdx++;
}

void GCNSchedStage::checkScheduling() {
  // Check the results of scheduling.
  PressureAfter = DAG.getRealRegPressure(RegionIdx);
  LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");

  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;

    // Early out if we have achieved the occupancy target.
    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
    return;
  }

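  // Cap the occupancy target by what the kernel's LDS (local memory) usage
  // allows, in addition to the strategy's own target.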
  unsigned TargetOccupancy =
      std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
  unsigned WavesAfter =
      std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
  unsigned WavesBefore =
      std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");

  // We may not be able to keep the current target occupancy because of the
  // just scheduled region. We might still be able to revert scheduling if the
  // occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits, which could lead to more spilling.
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

  // Allow memory bound functions to drop to 4 waves if not limited by an
  // attribute.
  if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
      WavesAfter >= MFI.getMinAllowedOccupancy()) {
    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
                      << MFI.getMinAllowedOccupancy() << " waves\n");
    NewOccupancy = WavesAfter;
  }

  if (NewOccupancy < DAG.MinOccupancy) {
    DAG.MinOccupancy = NewOccupancy;
    MFI.limitOccupancy(DAG.MinOccupancy);
    DAG.RegionsWithMinOcc.reset();
    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                      << DAG.MinOccupancy << ".\n");
  }
  // The maximum number of arch VGPRs on a non-unified register file, or the
  // maximum VGPR + AGPR in the unified register file case.
  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  // The maximum number of arch VGPRs for both unified and non-unified register
  // files.
  unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);

  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
      PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
      PressureAfter.getAGPRNum() > MaxArchVGPRs ||
      PressureAfter.getSGPRNum() > MaxSGPRs) {
    DAG.RescheduleRegions[RegionIdx] = true;
    DAG.RegionsWithHighRP[RegionIdx] = true;
    DAG.RegionsWithExcessRP[RegionIdx] = true;
  }

  // Revert if this region's schedule would cause a drop in occupancy or
  // spilling.
  if (shouldRevertScheduling(WavesAfter)) {
    revertScheduling();
  } else {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
  }
}

unsigned
GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
                                      DenseMap<unsigned, unsigned> &ReadyCycles,
                                      const TargetSchedModel &SM) {
  unsigned ReadyCycle = CurrCycle;
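  // An SU can issue no earlier than the latest of (def's ready cycle + def's
  // latency) over all of its register-dependence predecessors.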
  for (auto &D : SU.Preds) {
    if (D.isAssignedRegDep()) {
      MachineInstr *DefMI = D.getSUnit()->getInstr();
      unsigned Latency = SM.computeInstrLatency(DefMI);
      unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
      ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
    }
  }
  ReadyCycles[SU.NodeNum] = ReadyCycle;
  return ReadyCycle;
}

#ifndef NDEBUG
struct EarlierIssuingCycle {
  bool operator()(std::pair<MachineInstr *, unsigned> A,
                  std::pair<MachineInstr *, unsigned> B) const {
    return A.second < B.second;
  }
};

static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
                                        EarlierIssuingCycle> &ReadyCycles) {
  if (ReadyCycles.empty())
    return;
  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
         << " ##################\n# Cycle #\t\t\tInstruction          "
            "                                      \n";
  unsigned IPrev = 1;
  for (auto &I : ReadyCycles) {
    if (I.second > IPrev + 1)
      dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
             << " CYCLES DETECTED ******************************\n\n";
    dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
    IPrev = I.second;
  }
}
#endif

ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
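  // Walk the schedule in order; whenever an instruction's ready cycle is past
  // the current cycle, the gap is counted as bubbles (stall cycles).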
  for (auto &SU : InputSchedule) {
    unsigned ReadyCycle =
        computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
#endif
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}

ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  for (auto &MI : DAG) {
    SUnit *SU = DAG.getSUnit(&MI);
    if (!SU)
      continue;
    unsigned ReadyCycle =
        computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
#endif
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}

bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (WavesAfter < DAG.MinOccupancy)
    return true;

  return false;
}

bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
  // If RP is not reduced in the unclustered reschedule stage, revert to the
  // old schedule.
  if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
       mayCauseSpilling(WavesAfter)) ||
      GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
    return true;
  }

  // Do not attempt to relax the schedule even more if we are already spilling.
  if (isRegionWithExcessRP())
    return false;

  LLVM_DEBUG(
      dbgs()
      << "\n\t      *** In shouldRevertScheduling ***\n"
      << "      *********** BEFORE UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
  LLVM_DEBUG(
      dbgs()
      << "\n      *********** AFTER UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MAfter = getScheduleMetrics(DAG);
  unsigned OldMetric = MBefore.getMetric();
  unsigned NewMetric = MAfter.getMetric();
  unsigned WavesBefore =
      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
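  // Profit ~= (WavesAfter / WavesBefore) * ((OldMetric + Bias) / NewMetric),
  // kept in fixed point via ScaleFactor. A value below ScaleFactor (i.e. a
  // combined ratio below 1.0) means the new schedule is not worth keeping.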
  unsigned Profit =
      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
       NewMetric) /
      ScheduleMetrics::ScaleFactor;
  LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
                    << MAfter << "Profit: " << Profit << "\n");
  return Profit < ScheduleMetrics::ScaleFactor;
}

bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
  if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
      !PressureAfter.less(MF, PressureBefore)) {
    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
    return true;
  }

  return false;
}

void GCNSchedStage::revertScheduling() {
  DAG.RegionsWithMinOcc[RegionIdx] =
      PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
  DAG.RescheduleRegions[RegionIdx] =
      S.hasNextStage() &&
      S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
  DAG.RegionEnd = DAG.RegionBegin;
  int SkippedDebugInstr = 0;
  for (MachineInstr *MI : Unsched) {
    if (MI->isDebugInstr()) {
      ++SkippedDebugInstr;
      continue;
    }

    if (MI->getIterator() != DAG.RegionEnd) {
      DAG.BB->remove(MI);
      DAG.BB->insert(DAG.RegionEnd, MI);
      if (!MI->isDebugInstr())
        DAG.LIS->handleMove(*MI, true);
    }

    // Reset read-undef flags and update them later.
    for (auto &Op : MI->all_defs())
      Op.setIsUndef(false);
    RegisterOperands RegOpers;
    RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
    if (!MI->isDebugInstr()) {
      if (DAG.ShouldTrackLaneMasks) {
        // Adjust liveness and add missing dead+read-undef flags.
        SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
        RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
      } else {
        // Adjust for missing dead-def flags.
        RegOpers.detectDeadDefs(*MI, *DAG.LIS);
      }
    }
    DAG.RegionEnd = MI->getIterator();
    ++DAG.RegionEnd;
    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
  }

  // After reverting the schedule, debug instrs will now be at the end of the
  // block and RegionEnd will point to the first debug instr. Increment
  // RegionEnd past the debug instrs to the actual end of the scheduling
  // region.
  while (SkippedDebugInstr-- > 0)
    ++DAG.RegionEnd;

  // If the Unsched.front() instruction is a debug instruction, this will
  // actually shrink the region since we moved all debug instructions to the
  // end of the block. Find the first instruction that is not a debug
  // instruction.
  DAG.RegionBegin = Unsched.front()->getIterator();
  if (DAG.RegionBegin->isDebugInstr()) {
    for (MachineInstr *MI : Unsched) {
      if (MI->isDebugInstr())
        continue;
      DAG.RegionBegin = MI->getIterator();
      break;
    }
  }

  // Then move the debug instructions back into their correct place and set
  // RegionBegin and RegionEnd if needed.
  DAG.placeDebugValues();

  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}

void PreRARematStage::collectRematerializableInstructions() {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
    Register Reg = Register::index2VirtReg(I);
    if (!DAG.LIS->hasInterval(Reg))
      continue;

    // TODO: Handle AGPR and SGPR rematerialization
    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
      continue;

    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
    MachineInstr *Def = Op->getParent();
    if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
      continue;

    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
    if (Def->getParent() == UseI->getParent())
      continue;

    // We are only collecting defs that are defined in another block and are
    // live-through or used inside regions at MinOccupancy. This means that the
    // register must be in the live-in set for the region.
    bool AddedToRematList = false;
    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
      auto It = DAG.LiveIns[I].find(Reg);
      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
        if (DAG.RegionsWithMinOcc[I]) {
          RematerializableInsts[I][Def] = UseI;
          AddedToRematList = true;
        }

        // Collect regions with the rematerializable reg as a live-in to avoid
        // searching later when updating RP.
        RematDefToLiveInRegions[Def].push_back(I);
      }
    }
    if (!AddedToRematList)
      RematDefToLiveInRegions.erase(Def);
  }
}

bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
                                              const TargetInstrInfo *TII) {
  // Temporary copies of cached variables we will be modifying and replacing if
  // sinking succeeds.
  SmallVector<
      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
      NewRegions;
  DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
  DenseMap<unsigned, GCNRegPressure> NewPressure;
  BitVector NewRescheduleRegions;
  LiveIntervals *LIS = DAG.LIS;

  NewRegions.resize(DAG.Regions.size());
  NewRescheduleRegions.resize(DAG.Regions.size());

  // Collect only regions that have a rematerializable def as a live-in.
  SmallSet<unsigned, 16> ImpactedRegions;
  for (const auto &It : RematDefToLiveInRegions)
    ImpactedRegions.insert(It.second.begin(), It.second.end());

  // Make copies of the register pressure and live-ins cache that will be
  // updated as we rematerialize.
  for (auto Idx : ImpactedRegions) {
    NewPressure[Idx] = DAG.Pressure[Idx];
    NewLiveIns[Idx] = DAG.LiveIns[Idx];
  }
  NewRegions = DAG.Regions;
  NewRescheduleRegions.reset();

  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
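  // Map each newly rematerialized instruction to the def it replaces so the
  // change can later be committed or rolled back.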
1444 bool Improved = false;
1445 for (auto I : ImpactedRegions) {
1446 if (!DAG.RegionsWithMinOcc[I])
1447 continue;
1448
1449 Improved = false;
1450 int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
1451 int SGPRUsage = NewPressure[I].getSGPRNum();
1452
1453 // TODO: Handle occupancy drop due to AGPR and SGPR.
1454 // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
1455 if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
1456 break;
1457
1458 // The occupancy of this region could have been improved by a previous
1459 // iteration's sinking of defs.
1460 if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
1461 NewRescheduleRegions[I] = true;
1462 Improved = true;
1463 continue;
1464 }
1465
1466 // First check if we have enough trivially rematerializable instructions to
1467 // improve occupancy. Optimistically assume all instructions we are able to
1468 // sink decreased RP.
    int TotalSinkableRegs = 0;
    for (const auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      Register DefReg = Def->getOperand(0).getReg();
      TotalSinkableRegs +=
          SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
    }
    int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
    unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
    // If even the most optimistic scenario cannot improve occupancy, do not
    // attempt to sink any instructions.
    if (OptimisticOccupancy <= DAG.MinOccupancy)
      break;

    unsigned ImproveOccupancy = 0;
    SmallVector<MachineInstr *, 4> SinkedDefs;
    for (auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      MachineBasicBlock::iterator InsertPos =
          MachineBasicBlock::iterator(It.second);
      Register Reg = Def->getOperand(0).getReg();
      // Rematerialize the def into its single use's block. Since we are only
      // rematerializing instructions that do not have any virtual reg uses, we
      // do not need to call LiveRangeEdit::allUsesAvailableAt() and
      // LiveRangeEdit::canRematerializeAt().
      TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
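      // reMaterialize() inserts the clone immediately before InsertPos, so the
      // new instruction is the one just before InsertPos.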
      MachineInstr *NewMI = &*std::prev(InsertPos);
      LIS->InsertMachineInstrInMaps(*NewMI);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
      InsertedMIToOldDef[NewMI] = Def;

      // Update region boundaries in the scheduling region we sank from, since
      // we may sink an instruction that was at the beginning or end of its
      // region.
      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
                                 /*Removing =*/true);

      // Update region boundaries in the region we sank to.
      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);

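      // Remember which lanes of Reg were live-in before clearing it from the
      // cached live-in sets; this is what gets subtracted from the pressure of
      // live-through regions below.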
      LaneBitmask PrevMask = NewLiveIns[I][Reg];
      // FIXME: Also update cached pressure for where the def was sunk from.
      // Update RP for all regions that have this reg as a live-in and remove
      // the reg from all regions as a live-in.
      for (auto Idx : RematDefToLiveInRegions[Def]) {
        NewLiveIns[Idx].erase(Reg);
        if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
          // Def is live-through and not used in this block.
          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
        } else {
          // Def is used and rematerialized into this block.
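          // Recompute this region's max pressure from scratch with a downward
          // tracker, since the def now lives inside the region instead of
          // being a live-in.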
          GCNDownwardRPTracker RPT(*LIS);
          auto *NonDbgMI = &*skipDebugInstructionsForward(
              NewRegions[Idx].first, NewRegions[Idx].second);
          RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
          RPT.advance(NewRegions[Idx].second);
          NewPressure[Idx] = RPT.moveMaxPressure();
        }
      }

      SinkedDefs.push_back(Def);
      ImproveOccupancy = NewPressure[I].getOccupancy(ST);
      if (ImproveOccupancy > DAG.MinOccupancy)
        break;
    }

    // Remove the defs we just sank from all regions' lists of sinkable defs.
    for (auto &Def : SinkedDefs)
      for (auto TrackedIdx : RematDefToLiveInRegions[Def])
        RematerializableInsts[TrackedIdx].erase(Def);

    if (ImproveOccupancy <= DAG.MinOccupancy)
      break;

    NewRescheduleRegions[I] = true;
    Improved = true;
  }

  if (!Improved) {
    // Occupancy was not improved for all regions that were at MinOccupancy.
    // Undo sinking and remove newly rematerialized instructions.
    for (auto &Entry : InsertedMIToOldDef) {
      MachineInstr *MI = Entry.first;
      MachineInstr *OldMI = Entry.second;
      Register Reg = MI->getOperand(0).getReg();
      LIS->RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
      OldMI->clearRegisterDeads(Reg);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
    return false;
  }

  // Occupancy was improved for all regions.
  for (auto &Entry : InsertedMIToOldDef) {
    MachineInstr *MI = Entry.first;
    MachineInstr *OldMI = Entry.second;

    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
    DAG.BBLiveInMap.erase(OldMI);

    // Remove OldMI and update LIS
    Register Reg = MI->getOperand(0).getReg();
    LIS->RemoveMachineInstrFromMaps(*OldMI);
    OldMI->eraseFromParent();
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);
  }

  // Update live-ins, register pressure, and regions caches.
  for (auto Idx : ImpactedRegions) {
    DAG.LiveIns[Idx] = NewLiveIns[Idx];
    DAG.Pressure[Idx] = NewPressure[Idx];
    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
  }
  DAG.Regions = NewRegions;
  DAG.RescheduleRegions = NewRescheduleRegions;

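  // Every region that was at MinOccupancy now fits at a higher occupancy, so
  // bump the function's occupancy by one wave.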
  SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  return true;
}

// Copied from MachineLICM
bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
  if (!DAG.TII->isTriviallyReMaterializable(MI))
    return false;

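  // Even when TII reports the instruction as trivially rematerializable,
  // reject it if it reads any virtual register: we do not verify that those
  // values would still be available at the new insertion point.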
  for (const MachineOperand &MO : MI.all_uses())
    if (MO.getReg().isVirtual())
      return false;

  return true;
}

// When removing, we will have to check both the beginning and the end of the
// region. When inserting, we only have to check whether we are inserting NewMI
// in front of a scheduling region; there is no need to check the end since we
// will only ever be inserting before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
    SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
                              MachineBasicBlock::iterator>> &RegionBoundaries,
    MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
  unsigned I = 0, E = RegionBoundaries.size();
  // Search for the first region of the block where MI is located.
  while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
    ++I;

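  // Walk the regions of MI's block and adjust the first boundary affected by
  // the removal or insertion, if any, then stop.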
  for (; I != E; ++I) {
    if (MI->getParent() != RegionBoundaries[I].first->getParent())
      return;

    if (Removing && MI == RegionBoundaries[I].first &&
        MI == RegionBoundaries[I].second) {
      // MI is in a region of size 1; after removal the region will be empty,
      // so set RegionBegin and RegionEnd to the past-the-end iterator of the
      // block.
      RegionBoundaries[I] =
          std::pair(MI->getParent()->end(), MI->getParent()->end());
      return;
    }
    if (MI == RegionBoundaries[I].first) {
      if (Removing)
        RegionBoundaries[I] =
            std::pair(std::next(MI), RegionBoundaries[I].second);
      else
        // NewMI was inserted in front of the region; set the new RegionBegin
        // to NewMI.
        RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
                                        RegionBoundaries[I].second);
      return;
    }
    if (Removing && MI == RegionBoundaries[I].second) {
      RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
      return;
    }
  }
}

static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
  return std::any_of(
      DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) {
        unsigned Opc = MI->getOpcode();
        return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
      });
}

GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
    bool RemoveKillFlags)
    : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}

void GCNPostScheduleDAGMILive::schedule() {
  HasIGLPInstrs = hasIGLPInstrs(this);
  if (HasIGLPInstrs) {
    SavedMutations.clear();
    SavedMutations.swap(Mutations);
    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
  }

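  // Run the base post-RA scheduler; when IGLP instructions are present, the
  // usual mutations have been swapped out in favor of the IGLP mutation.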
  ScheduleDAGMI::schedule();
}

void GCNPostScheduleDAGMILive::finalizeSchedule() {
  if (HasIGLPInstrs)
    SavedMutations.swap(Mutations);

  ScheduleDAGMI::finalizeSchedule();
}
