//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
///
/// This pass will apply multiple scheduling stages to the same function.
/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
/// entry point for the scheduling of those regions is
/// GCNScheduleDAGMILive::runSchedStages.
///
/// Generally, the reason for having multiple scheduling stages is to account
/// for the kernel-wide effect of register usage on occupancy. Usually, only a
/// few scheduling regions will have register pressure high enough to limit
/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
/// other regions.
///
//===----------------------------------------------------------------------===//

#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"

#define DEBUG_TYPE "machine-scheduler"

using namespace llvm;

static cl::opt<bool> DisableUnclusterHighRP(
    "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
    cl::desc("Disable unclustered high register pressure "
             "reduction scheduling stage."),
    cl::init(false));

static cl::opt<bool> DisableClusteredLowOccupancy(
    "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
    cl::desc("Disable clustered low occupancy "
             "rescheduling for ILP scheduling stage."),
    cl::init(false));

static cl::opt<unsigned> ScheduleMetricBias(
    "amdgpu-schedule-metric-bias", cl::Hidden,
    cl::desc(
        "Sets the bias which adds weight to occupancy vs latency. Set it to "
        "100 to chase the occupancy only."),
    cl::init(10));

static cl::opt<bool>
    RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
               cl::desc("Relax occupancy targets for kernels which are memory "
                        "bound (amdgpu-membound-threshold), or "
                        "Wave Limited (amdgpu-limit-wave-threshold)."),
               cl::init(false));

const unsigned ScheduleMetrics::ScaleFactor = 100;

GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
      HasHighPressure(false) {}

void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
  GenericScheduler::initialize(DAG);

  MF = &DAG->MF;

  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  SGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
  VGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);

  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  // Allow for lower occupancy targets if the kernel is wave limited or memory
  // bound and the relaxed occupancy feature is enabled.
  TargetOccupancy =
      RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
  SGPRCriticalLimit =
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);

  if (!KnownExcessRP) {
    VGPRCriticalLimit =
        std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
  } else {
    // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
    // it returns a reasonably small number for targets with lots of VGPRs,
    // such as GFX10 and GFX11.
    LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
                         "VGPRCriticalLimit calculation method.\n");

    unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
    unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
    unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
    VGPRBudget = std::max(VGPRBudget, Granule);
    VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
  }

  // Subtract error margin and bias from register limits and avoid overflow.
  SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
  VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
  SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
  VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);

  LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
                    << ", VGPRExcessLimit = " << VGPRExcessLimit
                    << ", SGPRCriticalLimit = " << SGPRCriticalLimit
                    << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
}

/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
/// current register pressure.
///
/// This works for the common case, but it has a few exceptions that have been
/// observed through trial and error:
///   - Explicit physical register operands
///   - Subregister definitions
///
/// In both of those cases, PressureDiff doesn't represent the actual pressure,
/// and querying LiveIntervals through the RegPressureTracker is needed to get
/// an accurate value.
///
/// We should eventually only use PressureDiff for maximum performance, but
/// this already allows 80% of SUs to take the fast path without changing
/// scheduling at all. Further changes would either change scheduling, or
/// require a lot more logic to recover an accurate pressure estimate from the
/// PressureDiffs.
static bool canUsePressureDiffs(const SUnit &SU) {
  if (!SU.isInstr())
    return false;

  // Cannot use pressure diffs for subregister defs or with physregs; it is
  // imprecise in both cases.
  for (const auto &Op : SU.getInstr()->operands()) {
    if (!Op.isReg() || Op.isImplicit())
      continue;
    if (Op.getReg().isPhysical() ||
        (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
      return false;
  }
  return true;
}

static void getRegisterPressures(bool AtTop,
                                 const RegPressureTracker &RPTracker, SUnit *SU,
                                 std::vector<unsigned> &Pressure,
                                 std::vector<unsigned> &MaxPressure) {
  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const copy.
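  // Both calls report pressure into the output vectors, indexed by
  // register-pressure-set ID (e.g. AMDGPU::RegisterPressureSets::SReg_32 and
  // AMDGPU::RegisterPressureSets::VGPR_32, which are the sets used below).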
  RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
  if (AtTop)
    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
  else
    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
}

void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                     bool AtTop,
                                     const RegPressureTracker &RPTracker,
                                     const SIRegisterInfo *SRI,
                                     unsigned SGPRPressure,
                                     unsigned VGPRPressure, bool IsBottomUp) {
  Cand.SU = SU;
  Cand.AtTop = AtTop;

  if (!DAG->isTrackingPressure())
    return;

  Pressure.clear();
  MaxPressure.clear();

  // We try to use the cached PressureDiffs in the ScheduleDAG whenever
  // possible over querying the RegPressureTracker.
  //
  // RegPressureTracker will make a lot of LIS queries, which are very
  // expensive; it is considered a slow function in this context.
  //
  // PressureDiffs are precomputed and cached, and getPressureDiff is just a
  // trivial lookup into an array. It is pretty much free.
  //
  // In EXPENSIVE_CHECKS, we always query the RPTracker to verify the results
  // of PressureDiffs.
  if (AtTop || !canUsePressureDiffs(*SU)) {
    getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure);
  } else {
    // Reserve 4 slots.
    Pressure.resize(4, 0);
    Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
    Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;

    for (const auto &Diff : DAG->getPressureDiff(SU)) {
      if (!Diff.isValid())
        continue;
      // PressureDiffs is always bottom-up, so if we're working top-down we
      // need to invert its sign.
      Pressure[Diff.getPSet()] +=
          (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
    }

#ifdef EXPENSIVE_CHECKS
    std::vector<unsigned> CheckPressure, CheckMaxPressure;
    getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
    if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
            CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
        Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
            CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
      errs() << "Register Pressure is inaccurate when calculated through "
                "PressureDiff\n"
             << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
             << ", expected "
             << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
             << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
             << ", expected "
             << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
      report_fatal_error("inaccurate register pressure calculation");
    }
#endif
  }

  unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];

  // If two instructions increase the pressure of different register sets
  // by the same amount, the generic scheduler will prefer to schedule the
  // instruction that increases the set with the least amount of registers,
  // which in our case would be SGPRs. This is rarely what we want, so
  // when we report excess/critical register pressure, we do it either
  // only for VGPRs or only for SGPRs.

  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
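  // For example, with MaxVGPRPressureInc == 16 a candidate is treated as
  // VGPR-limited once the current VGPR pressure is within 16 units of
  // VGPRExcessLimit; SGPR excess is only tracked when VGPRs are not.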
  const unsigned MaxVGPRPressureInc = 16;
  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;

  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
  // to increase the likelihood we don't go over the limits. We should improve
  // the analysis to look through dependencies to find the path with the least
  // register pressure.

  // We only need to update the RPDelta for instructions that increase register
  // pressure. Instructions that decrease or keep reg pressure the same will be
  // marked as RegExcess in tryCandidate() when they are compared with
  // instructions that increase the register pressure.
  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
  }

  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
  }

  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
  // have the same cost, so we don't need to prefer one over the other.

  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;

  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
    HasHighPressure = true;
    if (SGPRDelta > VGPRDelta) {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
    } else {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                         const CandPolicy &ZonePolicy,
                                         const RegPressureTracker &RPTracker,
                                         SchedCandidate &Cand,
                                         bool IsBottomUp) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = 0;
  unsigned VGPRPressure = 0;
  if (DAG->isTrackingPressure()) {
    SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
    VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
  }
  ReadyQueue &Q = Zone.Available;
  for (SUnit *SU : Q) {

    SchedCandidate TryCand(ZonePolicy);
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                  VGPRPressure, IsBottomUp);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    tryCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query
      // it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
      Cand.setBest(TryCand);
      LLVM_DEBUG(traceCandidate(Cand));
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
  // Schedule as far as possible in the direction of no choice. This is most
  // efficient, but also provides the best heuristics for CriticalPSets.
  if (SUnit *SU = Bot.pickOnlyChoice()) {
    IsTopNode = false;
    return SU;
  }
  if (SUnit *SU = Top.pickOnlyChoice()) {
    IsTopNode = true;
    return SU;
  }
  // Set the bottom-up policy based on the state of the current bottom zone and
  // the instructions outside the zone, including the top zone.
  CandPolicy BotPolicy;
  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
  // Set the top-down policy based on the state of the current top zone and
  // the instructions outside the zone, including the bottom zone.
  CandPolicy TopPolicy;
  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);

  // See if BotCand is still valid (because we previously scheduled from Top).
  LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
  if (!BotCand.isValid() || BotCand.SU->isScheduled ||
      BotCand.Policy != BotPolicy) {
    BotCand.reset(CandPolicy());
    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
                      /*IsBottomUp=*/true);
    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(BotCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
                        /*IsBottomUp=*/true);
      assert(TCand.SU == BotCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Check if the top Q has a better candidate.
  LLVM_DEBUG(dbgs() << "Picking from Top:\n");
  if (!TopCand.isValid() || TopCand.SU->isScheduled ||
      TopCand.Policy != TopPolicy) {
    TopCand.reset(CandPolicy());
    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
                      /*IsBottomUp=*/false);
    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(TopCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
                        /*IsBottomUp=*/false);
      assert(TCand.SU == TopCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Pick best from BotCand and TopCand.
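  // Ties favor the bottom-up candidate: TopCand only replaces BotCand if
  // tryCandidate() assigns it a winning reason.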
  LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
             dbgs() << "Bot Cand: "; traceCandidate(BotCand););
  SchedCandidate Cand = BotCand;
  TopCand.Reason = NoCand;
  tryCandidate(Cand, TopCand, nullptr);
  if (TopCand.Reason != NoCand) {
    Cand.setBest(TopCand);
  }
  LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););

  IsTopNode = Cand.AtTop;
  return Cand.SU;
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  SUnit *SU;
  do {
    if (RegionPolicy.OnlyTopDown) {
      SU = Top.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        TopCand.reset(NoPolicy);
        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
                          /*IsBottomUp=*/false);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
        SU = TopCand.SU;
      }
      IsTopNode = true;
    } else if (RegionPolicy.OnlyBottomUp) {
      SU = Bot.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        BotCand.reset(NoPolicy);
        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
                          /*IsBottomUp=*/true);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
        SU = BotCand.SU;
      }
      IsTopNode = false;
    } else {
      SU = pickNodeBidirectional(IsTopNode);
    }
  } while (SU->isScheduled);

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  return SU;
}

GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
  assert(CurrentStage && CurrentStage != SchedStages.end());
  return *CurrentStage;
}

bool GCNSchedStrategy::advanceStage() {
  assert(CurrentStage != SchedStages.end());
  if (!CurrentStage)
    CurrentStage = SchedStages.begin();
  else
    CurrentStage++;

  return CurrentStage != SchedStages.end();
}

bool GCNSchedStrategy::hasNextStage() const {
  assert(CurrentStage);
  return std::next(CurrentStage) != SchedStages.end();
}

GCNSchedStageID GCNSchedStrategy::getNextStage() const {
  assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
  return *std::next(CurrentStage);
}

GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
    const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
  SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
  SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
}

GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
}

bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
                                          SchedCandidate &TryCand,
                                          SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }
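  // The heuristic order below differs from GenericScheduler::tryCandidate:
  // latency is considered before the critical/max register-pressure checks,
  // trading occupancy for ILP.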
  // Avoid spilling by exceeding the register limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                  RegExcess, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Bias PhysReg defs and copies to their uses and definitions, respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;

  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // Prioritize instructions that read unbuffered resources by stall cycles.
    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
      return TryCand.Reason != NoCand;

    // Avoid critical resource consumption and balance the schedule.
    TryCand.initResourceDelta(DAG, SchedModel);
    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
                TryCand, Cand, ResourceReduce))
      return TryCand.Reason != NoCand;
    if (tryGreater(TryCand.ResDelta.DemandedResources,
                   Cand.ResDelta.DemandedResources, TryCand, Cand,
                   ResourceDemand))
      return TryCand.Reason != NoCand;

    // Unconditionally try to reduce latency.
    if (tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;

    // Weak edges are for clustering and other constraints.
    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
      return TryCand.Reason != NoCand;
  }

  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  const SUnit *CandNextClusterSU =
      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  const SUnit *TryCandNextClusterSU =
      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max critical pressure in the scheduled region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
                  TryCand, Cand, RegCritical, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Fall through to original instruction order.
    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
      TryCand.Reason = NodeOrder;
      return true;
    }
  }
  return false;
}

GCNScheduleDAGMILive::GCNScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
    : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {

  LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
  if (RelaxedOcc) {
    MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
    if (MinOccupancy != StartingOccupancy)
      LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
                        << ".\n");
  }
}

std::unique_ptr<GCNSchedStage>
GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
  switch (SchedStageID) {
  case GCNSchedStageID::OccInitialSchedule:
    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
  case GCNSchedStageID::PreRARematerialize:
    return std::make_unique<PreRARematStage>(SchedStageID, *this);
  case GCNSchedStageID::ILPInitialSchedule:
    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
  }

  llvm_unreachable("Unknown SchedStageID.");
}

void GCNScheduleDAGMILive::schedule() {
  // Collect all scheduling regions. The actual scheduling is performed in
  // GCNScheduleDAGMILive::finalizeSchedule.
  Regions.push_back(std::pair(RegionBegin, RegionEnd));
}

GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
  GCNDownwardRPTracker RPTracker(*LIS);
  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
  return RPTracker.moveMaxPressure();
}

void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
                                                const MachineBasicBlock *MBB) {
  GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has only one successor, then the live-ins of that successor
  // are the live-outs of the current block. We can reuse the calculated live
  // set if the successor will be scheduled after the current block.

  // However, due to a bug in LiveInterval analysis, it may happen that two
  // predecessors of the same successor block have different lane bitmasks for
  // a live-out register. Work around that by sticking to a one-to-one
  // relationship, i.e. one predecessor with one successor block.
  const MachineBasicBlock *OnlySucc = nullptr;
  if (MBB->succ_size() == 1) {
    auto *Candidate = *MBB->succ_begin();
    if (!Candidate->empty() && Candidate->pred_size() == 1) {
      SlotIndexes *Ind = LIS->getSlotIndexes();
      if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
        OnlySucc = Candidate;
    }
  }

  // Scheduler sends regions from the end of the block upwards.
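  // CurRegion is therefore set to the topmost region of this block; the
  // tracker then walks forward from the block entry, recording each region's
  // live-ins and max pressure until RegionIdx itself has been processed.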
  size_t CurRegion = RegionIdx;
  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
    if (Regions[CurRegion].first->getParent() != MBB)
      break;
  --CurRegion;

  auto I = MBB->begin();
  auto LiveInIt = MBBLiveIns.find(MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
  if (LiveInIt != MBBLiveIns.end()) {
    auto LiveIn = std::move(LiveInIt->second);
    RPTracker.reset(*MBB->begin(), &LiveIn);
    MBBLiveIns.erase(LiveInIt);
  } else {
    I = Rgn.first;
    auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(*I, &LRS);
  }

  for (;;) {
    I = RPTracker.getNext();

    if (Regions[CurRegion].first == I || NonDbgMI == I) {
      LiveIns[CurRegion] = RPTracker.getLiveRegs();
      RPTracker.clearMaxPressure();
    }

    if (Regions[CurRegion].second == I) {
      Pressure[CurRegion] = RPTracker.moveMaxPressure();
      if (CurRegion-- == RegionIdx)
        break;
    }
    RPTracker.advanceToNext();
    RPTracker.advanceBeforeNext();
  }

  if (OnlySucc) {
    if (I != MBB->end()) {
      RPTracker.advanceToNext();
      RPTracker.advance(MBB->end());
    }
    RPTracker.advanceBeforeNext();
    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
  }
}

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getBBLiveInMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> BBStarters;
  BBStarters.reserve(Regions.size());
  auto I = Regions.rbegin(), E = Regions.rend();
  auto *BB = I->first->getParent();
  do {
    auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
    BBStarters.push_back(MI);
    do {
      ++I;
    } while (I != E && I->first->getParent() == BB);
  } while (I != E);
  return getLiveRegMap(BBStarters, false /*After*/, *LIS);
}

void GCNScheduleDAGMILive::finalizeSchedule() {
  // Start actual scheduling here. This function is called by the base
  // MachineScheduler after all regions have been recorded by
  // GCNScheduleDAGMILive::schedule().
  LiveIns.resize(Regions.size());
  Pressure.resize(Regions.size());
  RescheduleRegions.resize(Regions.size());
  RegionsWithHighRP.resize(Regions.size());
  RegionsWithExcessRP.resize(Regions.size());
  RegionsWithMinOcc.resize(Regions.size());
  RegionsWithIGLPInstrs.resize(Regions.size());
  RescheduleRegions.set();
  RegionsWithHighRP.reset();
  RegionsWithExcessRP.reset();
  RegionsWithMinOcc.reset();
  RegionsWithIGLPInstrs.reset();

  runSchedStages();
}

void GCNScheduleDAGMILive::runSchedStages() {
  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

  if (!Regions.empty())
    BBLiveInMap = getBBLiveInMap();

  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
  while (S.advanceStage()) {
    auto Stage = createSchedStage(S.getCurrentStage());
    if (!Stage->initGCNSchedStage())
      continue;

    for (auto Region : Regions) {
      RegionBegin = Region.first;
      RegionEnd = Region.second;
      // Setup for scheduling the region and check whether it should be
      // skipped.
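      // (Stages may decline regions they cannot improve, e.g. regions without
      // excess register pressure; see the initGCNRegion overrides.)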
      if (!Stage->initGCNRegion()) {
        Stage->advanceRegion();
        exitRegion();
        continue;
      }

      ScheduleDAGMILive::schedule();
      Stage->finalizeGCNRegion();
    }

    Stage->finalizeGCNSchedStage();
  }
}

#ifndef NDEBUG
raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
  switch (StageID) {
  case GCNSchedStageID::OccInitialSchedule:
    OS << "Max Occupancy Initial Schedule";
    break;
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    OS << "Unclustered High Register Pressure Reschedule";
    break;
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    OS << "Clustered Low Occupancy Reschedule";
    break;
  case GCNSchedStageID::PreRARematerialize:
    OS << "Pre-RA Rematerialize";
    break;
  case GCNSchedStageID::ILPInitialSchedule:
    OS << "Max ILP Initial Schedule";
    break;
  }

  return OS;
}
#endif

GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
    : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
      MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}

bool GCNSchedStage::initGCNSchedStage() {
  if (!DAG.LIS)
    return false;

  LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
  return true;
}

bool UnclusteredHighRPStage::initGCNSchedStage() {
  if (DisableUnclusterHighRP)
    return false;

  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
    return false;

  SavedMutations.swap(DAG.Mutations);
  DAG.addMutation(
      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));

  InitialOccupancy = DAG.MinOccupancy;
  // Aggressively try to reduce register pressure in the unclustered high RP
  // stage. Temporarily increase the occupancy target in the region.
  S.SGPRLimitBias = S.HighRPSGPRBias;
  S.VGPRLimitBias = S.HighRPVGPRBias;
  if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
    MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  LLVM_DEBUG(
      dbgs()
      << "Retrying function scheduling without clustering. "
         "Aggressively try to reduce register pressure to achieve occupancy "
      << DAG.MinOccupancy << ".\n");

  return true;
}

bool ClusteredLowOccStage::initGCNSchedStage() {
  if (DisableClusteredLowOccupancy)
    return false;

  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  // Don't bother trying to improve ILP in lower RP regions if occupancy has
  // not been dropped. All regions will have already been scheduled with the
  // ideal occupancy targets.
  if (DAG.StartingOccupancy <= DAG.MinOccupancy)
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with lowest recorded occupancy "
             << DAG.MinOccupancy << ".\n");
  return true;
}

bool PreRARematStage::initGCNSchedStage() {
  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
    return false;

  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
  // Check maximum occupancy.
  if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
      DAG.MinOccupancy)
    return false;

  // FIXME: This pass will invalidate cached MBBLiveIns for regions
  // in between the defs and the regions we sink the defs to. Cached pressure
  // for regions where a def is sunk from will also be invalidated. This will
  // need to be fixed if there is another pass after this pass.
  assert(!S.hasNextStage());

  collectRematerializableInstructions();
  if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with improved occupancy of "
             << DAG.MinOccupancy << " from rematerializing\n");
  return true;
}

void GCNSchedStage::finalizeGCNSchedStage() {
  DAG.finishBlock();
  LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}

void UnclusteredHighRPStage::finalizeGCNSchedStage() {
  SavedMutations.swap(DAG.Mutations);
  S.SGPRLimitBias = S.VGPRLimitBias = 0;
  if (DAG.MinOccupancy > InitialOccupancy) {
    for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
      DAG.RegionsWithMinOcc[IDX] =
          DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;

    LLVM_DEBUG(dbgs() << StageID
                      << " stage successfully increased occupancy to "
                      << DAG.MinOccupancy << '\n');
  }

  GCNSchedStage::finalizeGCNSchedStage();
}

bool GCNSchedStage::initGCNRegion() {
  // Check whether this new region is also a new block.
  if (DAG.RegionBegin->getParent() != CurrentMBB)
    setupNewBlock();

  unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
  DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);

  // Skip empty scheduling regions (0 or 1 schedulable instructions).
  if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
    return false;

  LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
  LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
                    << " " << CurrentMBB->getName()
                    << "\n From: " << *DAG.begin() << " To: ";
             if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
             else dbgs() << "End";
             dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

  // Save original instruction order before scheduling for possible revert.
  Unsched.clear();
  Unsched.reserve(DAG.NumRegionInstrs);
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule) {
    for (auto &I : DAG) {
      Unsched.push_back(&I);
      if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
          I.getOpcode() == AMDGPU::IGLP_OPT)
        DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
    }
  } else {
    for (auto &I : DAG)
      Unsched.push_back(&I);
  }

  PressureBefore = DAG.Pressure[RegionIdx];

  LLVM_DEBUG(
      dbgs() << "Pressure before scheduling:\nRegion live-ins:"
             << print(DAG.LiveIns[RegionIdx], DAG.MRI)
             << "Region live-in pressure: "
             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
             << "Region register pressure: " << print(PressureBefore));

  S.HasHighPressure = false;
  S.KnownExcessRP = isRegionWithExcessRP();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
    SavedMutations.clear();
    SavedMutations.swap(DAG.Mutations);
    bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
                          StageID == GCNSchedStageID::ILPInitialSchedule;
    DAG.addMutation(createIGroupLPDAGMutation(
        IsInitialStage ? AMDGPU::SchedulingPhase::Initial
                       : AMDGPU::SchedulingPhase::PreRAReentry));
  }

  return true;
}

bool UnclusteredHighRPStage::initGCNRegion() {
  // Only reschedule regions with the minimum occupancy or regions that may
  // have spilling (excess register pressure).
  if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
       DAG.MinOccupancy <= InitialOccupancy) &&
      !DAG.RegionsWithExcessRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool ClusteredLowOccStage::initGCNRegion() {
  // We may need to reschedule this region if it wasn't rescheduled in the last
  // stage, or if we found it was testing critical register pressure limits in
  // the unclustered reschedule stage. The latter is because we may not have
  // been able to raise the min occupancy in the previous stage, so the region
  // may be overly constrained even if it was already rescheduled.
  if (!DAG.RegionsWithHighRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool PreRARematStage::initGCNRegion() {
  if (!DAG.RescheduleRegions[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

void GCNSchedStage::setupNewBlock() {
  if (CurrentMBB)
    DAG.finishBlock();

  CurrentMBB = DAG.RegionBegin->getParent();
  DAG.startBlock(CurrentMBB);
  // Get the real RP for the region if it hasn't been calculated before. After
  // the initial schedule stage, real RP will be collected after scheduling.
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule)
    DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}

void GCNSchedStage::finalizeGCNRegion() {
  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
  DAG.RescheduleRegions[RegionIdx] = false;
  if (S.HasHighPressure)
    DAG.RegionsWithHighRP[RegionIdx] = true;

  // Revert scheduling if we have dropped occupancy or there is some other
  // reason that the original schedule is better.
  checkScheduling();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
    SavedMutations.swap(DAG.Mutations);

  DAG.exitRegion();
  RegionIdx++;
}

void GCNSchedStage::checkScheduling() {
  // Check the results of scheduling.
  PressureAfter = DAG.getRealRegPressure(RegionIdx);
  LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");

  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;

    // Early out if we have achieved the occupancy target.
    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
    return;
  }

  unsigned TargetOccupancy =
      std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
  unsigned WavesAfter =
      std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
  unsigned WavesBefore =
      std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");

  // We may not be able to keep the current target occupancy because of the
  // just-scheduled region. We might still be able to revert scheduling if the
  // occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits, which could lead to more spilling.
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

  // Allow memory bound functions to drop to the minimum allowed occupancy if
  // not limited by an attribute.
  if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
      WavesAfter >= MFI.getMinAllowedOccupancy()) {
    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
                      << MFI.getMinAllowedOccupancy() << " waves\n");
    NewOccupancy = WavesAfter;
  }

  if (NewOccupancy < DAG.MinOccupancy) {
    DAG.MinOccupancy = NewOccupancy;
    MFI.limitOccupancy(DAG.MinOccupancy);
    DAG.RegionsWithMinOcc.reset();
    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                      << DAG.MinOccupancy << ".\n");
  }
  // The maximum number of arch VGPRs on a non-unified register file, or the
  // maximum VGPRs + AGPRs in the unified register file case.
  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  // The maximum number of arch VGPRs for both unified and non-unified register
  // files.
  unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);

  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
      PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
      PressureAfter.getAGPRNum() > MaxArchVGPRs ||
      PressureAfter.getSGPRNum() > MaxSGPRs) {
    DAG.RescheduleRegions[RegionIdx] = true;
    DAG.RegionsWithHighRP[RegionIdx] = true;
    DAG.RegionsWithExcessRP[RegionIdx] = true;
  }

  // Revert if this region's schedule would cause a drop in occupancy or
  // spilling.
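  // shouldRevertScheduling() is overridden per stage; the stage-specific
  // criteria are implemented further down in this file.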
  if (shouldRevertScheduling(WavesAfter)) {
    revertScheduling();
  } else {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
  }
}

unsigned
GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
                                      DenseMap<unsigned, unsigned> &ReadyCycles,
                                      const TargetSchedModel &SM) {
  unsigned ReadyCycle = CurrCycle;
  for (auto &D : SU.Preds) {
    if (D.isAssignedRegDep()) {
      MachineInstr *DefMI = D.getSUnit()->getInstr();
      unsigned Latency = SM.computeInstrLatency(DefMI);
      unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
      ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
    }
  }
  ReadyCycles[SU.NodeNum] = ReadyCycle;
  return ReadyCycle;
}

#ifndef NDEBUG
struct EarlierIssuingCycle {
  bool operator()(std::pair<MachineInstr *, unsigned> A,
                  std::pair<MachineInstr *, unsigned> B) const {
    return A.second < B.second;
  }
};

static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
                                        EarlierIssuingCycle> &ReadyCycles) {
  if (ReadyCycles.empty())
    return;
  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
         << " ##################\n# Cycle #\t\t\tInstruction "
            "            "
            "                            \n";
  unsigned IPrev = 1;
  for (auto &I : ReadyCycles) {
    if (I.second > IPrev + 1)
      dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
             << " CYCLES DETECTED ******************************\n\n";
    dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
    IPrev = I.second;
  }
}
#endif

ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  for (auto &SU : InputSchedule) {
    unsigned ReadyCycle =
        computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
#endif
    CurrCycle = ++ReadyCycle;
  }
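  // Debug-only: dump the modeled schedule and a stall metric (SumBubbles
  // scaled by ScheduleMetrics::ScaleFactor and divided by the total cycle
  // count, or 1 when there are no bubbles).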
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}

ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  for (auto &MI : DAG) {
    SUnit *SU = DAG.getSUnit(&MI);
    if (!SU)
      continue;
    unsigned ReadyCycle =
        computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
#endif
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}

bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (WavesAfter < DAG.MinOccupancy)
    return true;

  return false;
}

bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
  // If RP is not reduced in the unclustered reschedule stage, revert to the
  // old schedule.
  if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
       mayCauseSpilling(WavesAfter)) ||
      GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
    return true;
  }

  // Do not attempt to relax the schedule even more if we are already spilling.
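  // Otherwise compare the latency models before and after: keep the new
  // schedule only if the estimated occupancy gain outweighs the increase in
  // modeled stall cycles (the Profit computation below).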
  if (isRegionWithExcessRP())
    return false;

  LLVM_DEBUG(
      dbgs()
      << "\n\t *** In shouldRevertScheduling ***\n"
      << " *********** BEFORE UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
  LLVM_DEBUG(
      dbgs()
      << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MAfter = getScheduleMetrics(DAG);
  unsigned OldMetric = MBefore.getMetric();
  unsigned NewMetric = MAfter.getMetric();
  unsigned WavesBefore =
      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
  unsigned Profit =
      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
       NewMetric) /
      ScheduleMetrics::ScaleFactor;
  LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
                    << MAfter << "Profit: " << Profit << "\n");
  return Profit < ScheduleMetrics::ScaleFactor;
}

bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
  if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
      !PressureAfter.less(MF, PressureBefore)) {
    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
    return true;
  }

  return false;
}

void GCNSchedStage::revertScheduling() {
  DAG.RegionsWithMinOcc[RegionIdx] =
      PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
  DAG.RescheduleRegions[RegionIdx] =
      S.hasNextStage() &&
      S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
  DAG.RegionEnd = DAG.RegionBegin;
  int SkippedDebugInstr = 0;
  for (MachineInstr *MI : Unsched) {
    if (MI->isDebugInstr()) {
      ++SkippedDebugInstr;
      continue;
    }

    if (MI->getIterator() != DAG.RegionEnd) {
      DAG.BB->remove(MI);
      DAG.BB->insert(DAG.RegionEnd, MI);
      if (!MI->isDebugInstr())
        DAG.LIS->handleMove(*MI, true);
    }

    // Reset read-undef flags and update them later.
    for (auto &Op : MI->all_defs())
      Op.setIsUndef(false);
    RegisterOperands RegOpers;
    RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
    if (!MI->isDebugInstr()) {
      if (DAG.ShouldTrackLaneMasks) {
        // Adjust liveness and add missing dead+read-undef flags.
        SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
        RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
      } else {
        // Adjust for missing dead-def flags.
        RegOpers.detectDeadDefs(*MI, *DAG.LIS);
      }
    }
    DAG.RegionEnd = MI->getIterator();
    ++DAG.RegionEnd;
    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
  }

  // After reverting the schedule, debug instrs will now be at the end of the
  // block and RegionEnd will point to the first debug instr. Increment
  // RegionEnd past the debug instrs to the actual end of the scheduling
  // region.
  while (SkippedDebugInstr-- > 0)
    ++DAG.RegionEnd;

  // If the Unsched.front() instruction is a debug instruction, this will
  // actually shrink the region since we moved all debug instructions to the
  // end of the block. Find the first instruction that is not a debug
  // instruction.
  DAG.RegionBegin = Unsched.front()->getIterator();
  if (DAG.RegionBegin->isDebugInstr()) {
    for (MachineInstr *MI : Unsched) {
      if (MI->isDebugInstr())
        continue;
      DAG.RegionBegin = MI->getIterator();
      break;
    }
  }

  // Then move the debug instructions back into their correct place and set
  // RegionBegin and RegionEnd if needed.
  DAG.placeDebugValues();

  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}

void PreRARematStage::collectRematerializableInstructions() {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
    Register Reg = Register::index2VirtReg(I);
    if (!DAG.LIS->hasInterval(Reg))
      continue;

    // TODO: Handle AGPR and SGPR rematerialization.
    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
      continue;

    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
    MachineInstr *Def = Op->getParent();
    if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
      continue;

    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
    if (Def->getParent() == UseI->getParent())
      continue;

    // We are only collecting defs that are defined in another block and are
    // live-through or used inside regions at MinOccupancy. This means that the
    // register must be in the live-in set for the region.
    bool AddedToRematList = false;
    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
      auto It = DAG.LiveIns[I].find(Reg);
      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
        if (DAG.RegionsWithMinOcc[I]) {
          RematerializableInsts[I][Def] = UseI;
          AddedToRematList = true;
        }

        // Collect regions with the rematerializable reg as a live-in to avoid
        // searching later when updating RP.
        RematDefToLiveInRegions[Def].push_back(I);
      }
    }
    if (!AddedToRematList)
      RematDefToLiveInRegions.erase(Def);
  }
}

bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
                                              const TargetInstrInfo *TII) {
  // Temporary copies of cached variables we will be modifying and replacing if
  // sinking succeeds.
  SmallVector<
      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
      NewRegions;
  DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
  DenseMap<unsigned, GCNRegPressure> NewPressure;
  BitVector NewRescheduleRegions;
  LiveIntervals *LIS = DAG.LIS;

  NewRegions.resize(DAG.Regions.size());
  NewRescheduleRegions.resize(DAG.Regions.size());

  // Collect only regions that have a rematerializable def as a live-in.
  SmallSet<unsigned, 16> ImpactedRegions;
  for (const auto &It : RematDefToLiveInRegions)
    ImpactedRegions.insert(It.second.begin(), It.second.end());

  // Make copies of the register pressure and live-ins caches that will be
  // updated as we rematerialize.
  for (auto Idx : ImpactedRegions) {
    NewPressure[Idx] = DAG.Pressure[Idx];
    NewLiveIns[Idx] = DAG.LiveIns[Idx];
  }
  NewRegions = DAG.Regions;
  NewRescheduleRegions.reset();

  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
  bool Improved = false;
  for (auto I : ImpactedRegions) {
    if (!DAG.RegionsWithMinOcc[I])
      continue;

    Improved = false;
    int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
    int SGPRUsage = NewPressure[I].getSGPRNum();

    // TODO: Handle occupancy drop due to AGPR and SGPR.
    // Check if the cause of the occupancy drop is VGPR usage and not SGPR.
    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
      break;

    // The occupancy of this region could have been improved by a previous
    // iteration's sinking of defs.
    if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
      NewRescheduleRegions[I] = true;
      Improved = true;
      continue;
    }

    // First check if we have enough trivially rematerializable instructions to
    // improve occupancy. Optimistically assume all instructions we are able to
    // sink decrease RP.
    int TotalSinkableRegs = 0;
    for (const auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      Register DefReg = Def->getOperand(0).getReg();
      TotalSinkableRegs +=
          SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
    }
    int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
    unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
    // If in the most optimistic scenario we cannot improve occupancy, then do
    // not attempt to sink any instructions.
    if (OptimisticOccupancy <= DAG.MinOccupancy)
      break;

    unsigned ImproveOccupancy = 0;
    SmallVector<MachineInstr *, 4> SinkedDefs;
    for (auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      MachineBasicBlock::iterator InsertPos =
          MachineBasicBlock::iterator(It.second);
      Register Reg = Def->getOperand(0).getReg();
      // Rematerialize MI to its use block. Since we are only rematerializing
      // instructions that do not have any virtual reg uses, we do not need to
      // call LiveRangeEdit::allUsesAvailableAt() and
      // LiveRangeEdit::canRematerializeAt().
      TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
      MachineInstr *NewMI = &*std::prev(InsertPos);
      LIS->InsertMachineInstrInMaps(*NewMI);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
      InsertedMIToOldDef[NewMI] = Def;

      // Update region boundaries in the scheduling region we sank from, since
      // we may sink an instruction that was at the beginning or end of its
      // region.
      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
                                 /*Removing =*/true);

      // Update region boundaries in the region we sank to.
      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);

      LaneBitmask PrevMask = NewLiveIns[I][Reg];
      // FIXME: Also update cached pressure for where the def was sunk from.
      // Update RP for all regions that have this reg as a live-in and remove
      // the reg from all regions as a live-in.
      for (auto Idx : RematDefToLiveInRegions[Def]) {
        NewLiveIns[Idx].erase(Reg);
        if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
          // Def is live-through and not used in this block.
          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
        } else {
          // Def is used and rematerialized into this block.
          GCNDownwardRPTracker RPT(*LIS);
          auto *NonDbgMI = &*skipDebugInstructionsForward(
              NewRegions[Idx].first, NewRegions[Idx].second);
          RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
          RPT.advance(NewRegions[Idx].second);
          NewPressure[Idx] = RPT.moveMaxPressure();
        }
      }

      SinkedDefs.push_back(Def);
      ImproveOccupancy = NewPressure[I].getOccupancy(ST);
      if (ImproveOccupancy > DAG.MinOccupancy)
        break;
    }

    // Remove defs we just sank from all regions' lists of sinkable defs.
    for (auto &Def : SinkedDefs)
      for (auto TrackedIdx : RematDefToLiveInRegions[Def])
        RematerializableInsts[TrackedIdx].erase(Def);

    if (ImproveOccupancy <= DAG.MinOccupancy)
      break;

    NewRescheduleRegions[I] = true;
    Improved = true;
  }

  if (!Improved) {
    // Occupancy was not improved for all regions that were at MinOccupancy.
    // Undo sinking and remove newly rematerialized instructions.
    for (auto &Entry : InsertedMIToOldDef) {
      MachineInstr *MI = Entry.first;
      MachineInstr *OldMI = Entry.second;
      Register Reg = MI->getOperand(0).getReg();
      LIS->RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
      OldMI->clearRegisterDeads(Reg);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
    return false;
  }

  // Occupancy was improved for all regions.
  for (auto &Entry : InsertedMIToOldDef) {
    MachineInstr *MI = Entry.first;
    MachineInstr *OldMI = Entry.second;

    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
    DAG.BBLiveInMap.erase(OldMI);

    // Remove OldMI and update LIS.
    Register Reg = MI->getOperand(0).getReg();
    LIS->RemoveMachineInstrFromMaps(*OldMI);
    OldMI->eraseFromParent();
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);
  }

  // Update live-ins, register pressure, and regions caches.
  for (auto Idx : ImpactedRegions) {
    DAG.LiveIns[Idx] = NewLiveIns[Idx];
    DAG.Pressure[Idx] = NewPressure[Idx];
    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
  }
  DAG.Regions = NewRegions;
  DAG.RescheduleRegions = NewRescheduleRegions;

  SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  return true;
}

// Copied from MachineLICM.
bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
  if (!DAG.TII->isTriviallyReMaterializable(MI))
    return false;

  for (const MachineOperand &MO : MI.all_uses())
    if (MO.getReg().isVirtual())
      return false;

  return true;
}

// When removing, we will have to check both the beginning and the end of the
// region.
// When inserting, we will only have to check whether we are inserting NewMI in
// front of a scheduling region and do not need to check the end, since we will
// only ever be inserting before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
    SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
                              MachineBasicBlock::iterator>> &RegionBoundaries,
    MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
  unsigned I = 0, E = RegionBoundaries.size();
  // Search for the first region of the block where MI is located.
  while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
    ++I;

  for (; I != E; ++I) {
    if (MI->getParent() != RegionBoundaries[I].first->getParent())
      return;

    if (Removing && MI == RegionBoundaries[I].first &&
        MI == RegionBoundaries[I].second) {
      // MI is in a region of size 1. After removing, the region will have
      // size 0, so set RegionBegin and RegionEnd to the end-of-block iterator.
      RegionBoundaries[I] =
          std::pair(MI->getParent()->end(), MI->getParent()->end());
      return;
    }
    if (MI == RegionBoundaries[I].first) {
      if (Removing)
        RegionBoundaries[I] =
            std::pair(std::next(MI), RegionBoundaries[I].second);
      else
        // Inserted NewMI in front of the region; set the new RegionBegin to
        // NewMI.
        RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
                                        RegionBoundaries[I].second);
      return;
    }
    if (Removing && MI == RegionBoundaries[I].second) {
      RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
      return;
    }
  }
}

static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
  return std::any_of(
      DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) {
        unsigned Opc = MI->getOpcode();
        return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
      });
}

GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
    bool RemoveKillFlags)
    : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}

void GCNPostScheduleDAGMILive::schedule() {
  HasIGLPInstrs = hasIGLPInstrs(this);
  if (HasIGLPInstrs) {
    SavedMutations.clear();
    SavedMutations.swap(Mutations);
    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
  }

  ScheduleDAGMI::schedule();
}

void GCNPostScheduleDAGMILive::finalizeSchedule() {
  if (HasIGLPInstrs)
    SavedMutations.swap(Mutations);

  ScheduleDAGMI::finalizeSchedule();
}