1 //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// This contains a MachineSchedStrategy implementation for maximizing wave 11 /// occupancy on GCN hardware. 12 /// 13 /// This pass will apply multiple scheduling stages to the same function. 14 /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual 15 /// entry point for the scheduling of those regions is 16 /// GCNScheduleDAGMILive::runSchedStages. 17 18 /// Generally, the reason for having multiple scheduling stages is to account 19 /// for the kernel-wide effect of register usage on occupancy. Usually, only a 20 /// few scheduling regions will have register pressure high enough to limit 21 /// occupancy for the kernel, so constraints can be relaxed to improve ILP in 22 /// other regions. 23 /// 24 //===----------------------------------------------------------------------===// 25 26 #include "GCNSchedStrategy.h" 27 #include "AMDGPUIGroupLP.h" 28 #include "GCNRegPressure.h" 29 #include "SIMachineFunctionInfo.h" 30 #include "Utils/AMDGPUBaseInfo.h" 31 #include "llvm/ADT/STLExtras.h" 32 #include "llvm/CodeGen/RegisterClassInfo.h" 33 #include "llvm/MC/LaneBitmask.h" 34 #include "llvm/Support/ErrorHandling.h" 35 36 #define DEBUG_TYPE "machine-scheduler" 37 38 using namespace llvm; 39 40 static cl::opt<bool> DisableUnclusterHighRP( 41 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden, 42 cl::desc("Disable unclustered high register pressure " 43 "reduction scheduling stage."), 44 cl::init(false)); 45 46 static cl::opt<bool> DisableClusteredLowOccupancy( 47 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden, 48 cl::desc("Disable clustered low occupancy " 49 "rescheduling for ILP scheduling stage."), 50 cl::init(false)); 51 52 static cl::opt<unsigned> ScheduleMetricBias( 53 "amdgpu-schedule-metric-bias", cl::Hidden, 54 cl::desc( 55 "Sets the bias which adds weight to occupancy vs latency. 
Set it to "
56 "100 to chase occupancy only."),
57 cl::init(10));
58
59 static cl::opt<bool>
60 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
61 cl::desc("Relax occupancy targets for kernels which are memory "
62 "bound (amdgpu-membound-threshold), or "
63 "Wave Limited (amdgpu-limit-wave-threshold)."),
64 cl::init(false));
65
66 static cl::opt<bool> GCNTrackers(
67 "amdgpu-use-amdgpu-trackers", cl::Hidden,
68 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
69 cl::init(false));
70
71 const unsigned ScheduleMetrics::ScaleFactor = 100;
72
73 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
74 : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
75 DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
76 }
77
78 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
79 GenericScheduler::initialize(DAG);
80
81 MF = &DAG->MF;
82
83 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
84
85 SGPRExcessLimit =
86 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
87 VGPRExcessLimit =
88 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
89
90 SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
91 // Set the initial TargetOccupancy to the maximum occupancy that we can
92 // achieve for this function. This effectively sets a lower bound on the
93 // 'Critical' register limits in the scheduler.
94 // Allow for lower occupancy targets if the kernel is wave limited or memory
95 // bound and the relaxed occupancy feature is enabled.
96 TargetOccupancy =
97 RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
98 SGPRCriticalLimit =
99 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
100
101 if (!KnownExcessRP) {
102 VGPRCriticalLimit = std::min(
103 ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
104 VGPRExcessLimit);
105 } else {
106 // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except it
107 // returns a reasonably small number for targets with lots of VGPRs, such
108 // as GFX10 and GFX11.
109 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
110 "VGPRCriticalLimit calculation method.\n");
111 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
112 unsigned Granule =
113 AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
114 unsigned Addressable =
115 AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
116 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
117 VGPRBudget = std::max(VGPRBudget, Granule);
118 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
119 }
120
121 // Subtract error margin and bias from register limits and avoid overflow.
122 SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
123 VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
124 SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
125 VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);
126
127 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
128 << ", VGPRExcessLimit = " << VGPRExcessLimit
129 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
130 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
131 }
132
133 /// Checks whether \p SU can use the cached DAG pressure diffs to compute the
134 /// current register pressure.
135 /// 136 /// This works for the common case, but it has a few exceptions that have been 137 /// observed through trial and error: 138 /// - Explicit physical register operands 139 /// - Subregister definitions 140 /// 141 /// In both of those cases, PressureDiff doesn't represent the actual pressure, 142 /// and querying LiveIntervals through the RegPressureTracker is needed to get 143 /// an accurate value. 144 /// 145 /// We should eventually only use PressureDiff for maximum performance, but this 146 /// already allows 80% of SUs to take the fast path without changing scheduling 147 /// at all. Further changes would either change scheduling, or require a lot 148 /// more logic to recover an accurate pressure estimate from the PressureDiffs. 149 static bool canUsePressureDiffs(const SUnit &SU) { 150 if (!SU.isInstr()) 151 return false; 152 153 // Cannot use pressure diffs for subregister defs or with physregs, it's 154 // imprecise in both cases. 155 for (const auto &Op : SU.getInstr()->operands()) { 156 if (!Op.isReg() || Op.isImplicit()) 157 continue; 158 if (Op.getReg().isPhysical() || 159 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister)) 160 return false; 161 } 162 return true; 163 } 164 165 static void getRegisterPressures( 166 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, 167 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure, 168 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, 169 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) { 170 // getDownwardPressure() and getUpwardPressure() make temporary changes to 171 // the tracker, so we need to pass those function a non-const copy. 172 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); 173 if (!GCNTrackers) { 174 AtTop 175 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure) 176 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); 177 178 return; 179 } 180 181 // GCNTrackers 182 Pressure.resize(4, 0); 183 MachineInstr *MI = SU->getInstr(); 184 GCNRegPressure NewPressure; 185 if (AtTop) { 186 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker); 187 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI); 188 } else { 189 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker); 190 TempUpwardTracker.recede(*MI); 191 NewPressure = TempUpwardTracker.getPressure(); 192 } 193 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); 194 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = 195 NewPressure.getArchVGPRNum(); 196 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); 197 } 198 199 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, 200 bool AtTop, 201 const RegPressureTracker &RPTracker, 202 const SIRegisterInfo *SRI, 203 unsigned SGPRPressure, 204 unsigned VGPRPressure, bool IsBottomUp) { 205 Cand.SU = SU; 206 Cand.AtTop = AtTop; 207 208 if (!DAG->isTrackingPressure()) 209 return; 210 211 Pressure.clear(); 212 MaxPressure.clear(); 213 214 // We try to use the cached PressureDiffs in the ScheduleDAG whenever 215 // possible over querying the RegPressureTracker. 216 // 217 // RegPressureTracker will make a lot of LIS queries which are very 218 // expensive, it is considered a slow function in this context. 219 // 220 // PressureDiffs are precomputed and cached, and getPressureDiff is just a 221 // trivial lookup into an array. It is pretty much free. 
222 // 223 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of 224 // PressureDiffs. 225 if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) { 226 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, 227 DownwardTracker, UpwardTracker, DAG, SRI); 228 } else { 229 // Reserve 4 slots. 230 Pressure.resize(4, 0); 231 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure; 232 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure; 233 234 for (const auto &Diff : DAG->getPressureDiff(SU)) { 235 if (!Diff.isValid()) 236 continue; 237 // PressureDiffs is always bottom-up so if we're working top-down we need 238 // to invert its sign. 239 Pressure[Diff.getPSet()] += 240 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc()); 241 } 242 243 #ifdef EXPENSIVE_CHECKS 244 std::vector<unsigned> CheckPressure, CheckMaxPressure; 245 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure, 246 DownwardTracker, UpwardTracker, DAG, SRI); 247 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != 248 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || 249 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != 250 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) { 251 errs() << "Register Pressure is inaccurate when calculated through " 252 "PressureDiff\n" 253 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32] 254 << ", expected " 255 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n" 256 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32] 257 << ", expected " 258 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n"; 259 report_fatal_error("inaccurate register pressure calculation"); 260 } 261 #endif 262 } 263 264 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; 265 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; 266 267 // If two instructions increase the pressure of different register sets 268 // by the same amount, the generic scheduler will prefer to schedule the 269 // instruction that increases the set with the least amount of registers, 270 // which in our case would be SGPRs. This is rarely what we want, so 271 // when we report excess/critical register pressure, we do it either 272 // only for VGPRs or only for SGPRs. 273 274 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. 275 const unsigned MaxVGPRPressureInc = 16; 276 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; 277 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; 278 279 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold 280 // to increase the likelihood we don't go over the limits. We should improve 281 // the analysis to look through dependencies to find the path with the least 282 // register pressure. 283 284 // We only need to update the RPDelta for instructions that increase register 285 // pressure. Instructions that decrease or keep reg pressure the same will be 286 // marked as RegExcess in tryCandidate() when they are compared with 287 // instructions that increase the register pressure. 
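//
// Illustrative example (the numbers here are hypothetical, not taken from any
// particular target): if VGPRExcessLimit is 256 and this candidate would raise
// NewVGPRPressure to 260, Excess is recorded on the VGPR_32 pressure set with
// a unit increment of 260 - 256 = 4, which tryCandidate() then weighs against
// candidates that stay at or below the limit.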
288 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { 289 HasHighPressure = true; 290 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); 291 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); 292 } 293 294 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { 295 HasHighPressure = true; 296 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); 297 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); 298 } 299 300 // Register pressure is considered 'CRITICAL' if it is approaching a value 301 // that would reduce the wave occupancy for the execution unit. When 302 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both 303 // has the same cost, so we don't need to prefer one over the other. 304 305 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; 306 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; 307 308 if (SGPRDelta >= 0 || VGPRDelta >= 0) { 309 HasHighPressure = true; 310 if (SGPRDelta > VGPRDelta) { 311 Cand.RPDelta.CriticalMax = 312 PressureChange(AMDGPU::RegisterPressureSets::SReg_32); 313 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta); 314 } else { 315 Cand.RPDelta.CriticalMax = 316 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); 317 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta); 318 } 319 } 320 } 321 322 // This function is mostly cut and pasted from 323 // GenericScheduler::pickNodeFromQueue() 324 void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, 325 const CandPolicy &ZonePolicy, 326 const RegPressureTracker &RPTracker, 327 SchedCandidate &Cand, 328 bool IsBottomUp) { 329 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); 330 ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); 331 unsigned SGPRPressure = 0; 332 unsigned VGPRPressure = 0; 333 if (DAG->isTrackingPressure()) { 334 if (!GCNTrackers) { 335 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; 336 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; 337 } else { 338 GCNRPTracker *T = IsBottomUp 339 ? static_cast<GCNRPTracker *>(&UpwardTracker) 340 : static_cast<GCNRPTracker *>(&DownwardTracker); 341 SGPRPressure = T->getPressure().getSGPRNum(); 342 VGPRPressure = T->getPressure().getArchVGPRNum(); 343 } 344 } 345 ReadyQueue &Q = Zone.Available; 346 for (SUnit *SU : Q) { 347 348 SchedCandidate TryCand(ZonePolicy); 349 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, 350 VGPRPressure, IsBottomUp); 351 // Pass SchedBoundary only when comparing nodes from the same boundary. 352 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; 353 tryCandidate(Cand, TryCand, ZoneArg); 354 if (TryCand.Reason != NoCand) { 355 // Initialize resource delta if needed in case future heuristics query it. 356 if (TryCand.ResDelta == SchedResourceDelta()) 357 TryCand.initResourceDelta(Zone.DAG, SchedModel); 358 Cand.setBest(TryCand); 359 LLVM_DEBUG(traceCandidate(Cand)); 360 } 361 } 362 } 363 364 // This function is mostly cut and pasted from 365 // GenericScheduler::pickNodeBidirectional() 366 SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { 367 // Schedule as far as possible in the direction of no choice. This is most 368 // efficient, but also provides the best heuristics for CriticalPSets. 
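// (Roughly speaking, pickOnlyChoice() yields a node when the zone has only one
// viable candidate, letting the scheduler skip the full candidate comparison.)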
369 if (SUnit *SU = Bot.pickOnlyChoice()) { 370 IsTopNode = false; 371 return SU; 372 } 373 if (SUnit *SU = Top.pickOnlyChoice()) { 374 IsTopNode = true; 375 return SU; 376 } 377 // Set the bottom-up policy based on the state of the current bottom zone and 378 // the instructions outside the zone, including the top zone. 379 CandPolicy BotPolicy; 380 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); 381 // Set the top-down policy based on the state of the current top zone and 382 // the instructions outside the zone, including the bottom zone. 383 CandPolicy TopPolicy; 384 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); 385 386 // See if BotCand is still valid (because we previously scheduled from Top). 387 LLVM_DEBUG(dbgs() << "Picking from Bot:\n"); 388 if (!BotCand.isValid() || BotCand.SU->isScheduled || 389 BotCand.Policy != BotPolicy) { 390 BotCand.reset(CandPolicy()); 391 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand, 392 /*IsBottomUp=*/true); 393 assert(BotCand.Reason != NoCand && "failed to find the first candidate"); 394 } else { 395 LLVM_DEBUG(traceCandidate(BotCand)); 396 #ifndef NDEBUG 397 if (VerifyScheduling) { 398 SchedCandidate TCand; 399 TCand.reset(CandPolicy()); 400 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, 401 /*IsBottomUp=*/true); 402 assert(TCand.SU == BotCand.SU && 403 "Last pick result should correspond to re-picking right now"); 404 } 405 #endif 406 } 407 408 // Check if the top Q has a better candidate. 409 LLVM_DEBUG(dbgs() << "Picking from Top:\n"); 410 if (!TopCand.isValid() || TopCand.SU->isScheduled || 411 TopCand.Policy != TopPolicy) { 412 TopCand.reset(CandPolicy()); 413 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand, 414 /*IsBottomUp=*/false); 415 assert(TopCand.Reason != NoCand && "failed to find the first candidate"); 416 } else { 417 LLVM_DEBUG(traceCandidate(TopCand)); 418 #ifndef NDEBUG 419 if (VerifyScheduling) { 420 SchedCandidate TCand; 421 TCand.reset(CandPolicy()); 422 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, 423 /*IsBottomUp=*/false); 424 assert(TCand.SU == TopCand.SU && 425 "Last pick result should correspond to re-picking right now"); 426 } 427 #endif 428 } 429 430 // Pick best from BotCand and TopCand. 
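// BotCand is used as the baseline; TopCand only replaces it when tryCandidate()
// reports a reason to prefer it, so ties keep the bottom-up pick.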
431 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); 432 dbgs() << "Bot Cand: "; traceCandidate(BotCand);); 433 SchedCandidate Cand = BotCand; 434 TopCand.Reason = NoCand; 435 tryCandidate(Cand, TopCand, nullptr); 436 if (TopCand.Reason != NoCand) { 437 Cand.setBest(TopCand); 438 } 439 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); 440 441 IsTopNode = Cand.AtTop; 442 return Cand.SU; 443 } 444 445 // This function is mostly cut and pasted from 446 // GenericScheduler::pickNode() 447 SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { 448 if (DAG->top() == DAG->bottom()) { 449 assert(Top.Available.empty() && Top.Pending.empty() && 450 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); 451 return nullptr; 452 } 453 SUnit *SU; 454 do { 455 if (RegionPolicy.OnlyTopDown) { 456 SU = Top.pickOnlyChoice(); 457 if (!SU) { 458 CandPolicy NoPolicy; 459 TopCand.reset(NoPolicy); 460 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand, 461 /*IsBottomUp=*/false); 462 assert(TopCand.Reason != NoCand && "failed to find a candidate"); 463 SU = TopCand.SU; 464 } 465 IsTopNode = true; 466 } else if (RegionPolicy.OnlyBottomUp) { 467 SU = Bot.pickOnlyChoice(); 468 if (!SU) { 469 CandPolicy NoPolicy; 470 BotCand.reset(NoPolicy); 471 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand, 472 /*IsBottomUp=*/true); 473 assert(BotCand.Reason != NoCand && "failed to find a candidate"); 474 SU = BotCand.SU; 475 } 476 IsTopNode = false; 477 } else { 478 SU = pickNodeBidirectional(IsTopNode); 479 } 480 } while (SU->isScheduled); 481 482 if (SU->isTopReady()) 483 Top.removeReady(SU); 484 if (SU->isBottomReady()) 485 Bot.removeReady(SU); 486 487 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " 488 << *SU->getInstr()); 489 return SU; 490 } 491 492 void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { 493 if (GCNTrackers) { 494 MachineInstr *MI = SU->getInstr(); 495 IsTopNode ? 
(void)DownwardTracker.advance(MI, false) 496 : UpwardTracker.recede(*MI); 497 } 498 499 return GenericScheduler::schedNode(SU, IsTopNode); 500 } 501 502 GCNSchedStageID GCNSchedStrategy::getCurrentStage() { 503 assert(CurrentStage && CurrentStage != SchedStages.end()); 504 return *CurrentStage; 505 } 506 507 bool GCNSchedStrategy::advanceStage() { 508 assert(CurrentStage != SchedStages.end()); 509 if (!CurrentStage) 510 CurrentStage = SchedStages.begin(); 511 else 512 CurrentStage++; 513 514 return CurrentStage != SchedStages.end(); 515 } 516 517 bool GCNSchedStrategy::hasNextStage() const { 518 assert(CurrentStage); 519 return std::next(CurrentStage) != SchedStages.end(); 520 } 521 522 GCNSchedStageID GCNSchedStrategy::getNextStage() const { 523 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end()); 524 return *std::next(CurrentStage); 525 } 526 527 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( 528 const MachineSchedContext *C, bool IsLegacyScheduler) 529 : GCNSchedStrategy(C) { 530 SchedStages.push_back(GCNSchedStageID::OccInitialSchedule); 531 SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule); 532 SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule); 533 SchedStages.push_back(GCNSchedStageID::PreRARematerialize); 534 GCNTrackers = GCNTrackers & !IsLegacyScheduler; 535 } 536 537 GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C) 538 : GCNSchedStrategy(C) { 539 SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule); 540 } 541 542 bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, 543 SchedCandidate &TryCand, 544 SchedBoundary *Zone) const { 545 // Initialize the candidate if needed. 546 if (!Cand.isValid()) { 547 TryCand.Reason = NodeOrder; 548 return true; 549 } 550 551 // Avoid spilling by exceeding the register limit. 552 if (DAG->isTrackingPressure() && 553 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, 554 RegExcess, TRI, DAG->MF)) 555 return TryCand.Reason != NoCand; 556 557 // Bias PhysReg Defs and copies to their uses and defined respectively. 558 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), 559 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) 560 return TryCand.Reason != NoCand; 561 562 bool SameBoundary = Zone != nullptr; 563 if (SameBoundary) { 564 // Prioritize instructions that read unbuffered resources by stall cycles. 565 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), 566 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) 567 return TryCand.Reason != NoCand; 568 569 // Avoid critical resource consumption and balance the schedule. 570 TryCand.initResourceDelta(DAG, SchedModel); 571 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, 572 TryCand, Cand, ResourceReduce)) 573 return TryCand.Reason != NoCand; 574 if (tryGreater(TryCand.ResDelta.DemandedResources, 575 Cand.ResDelta.DemandedResources, TryCand, Cand, 576 ResourceDemand)) 577 return TryCand.Reason != NoCand; 578 579 // Unconditionally try to reduce latency. 580 if (tryLatency(TryCand, Cand, *Zone)) 581 return TryCand.Reason != NoCand; 582 583 // Weak edges are for clustering and other constraints. 584 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), 585 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak)) 586 return TryCand.Reason != NoCand; 587 } 588 589 // Keep clustered nodes together to encourage downstream peephole 590 // optimizations which may reduce resource requirements. 
591 // 592 // This is a best effort to set things up for a post-RA pass. Optimizations 593 // like generating loads of multiple registers should ideally be done within 594 // the scheduler pass by combining the loads during DAG postprocessing. 595 const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; 596 const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; 597 if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), 598 CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, 599 Cluster)) 600 return TryCand.Reason != NoCand; 601 602 // Avoid increasing the max critical pressure in the scheduled region. 603 if (DAG->isTrackingPressure() && 604 tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, 605 TryCand, Cand, RegCritical, TRI, DAG->MF)) 606 return TryCand.Reason != NoCand; 607 608 // Avoid increasing the max pressure of the entire region. 609 if (DAG->isTrackingPressure() && 610 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand, 611 Cand, RegMax, TRI, DAG->MF)) 612 return TryCand.Reason != NoCand; 613 614 if (SameBoundary) { 615 // Fall through to original instruction order. 616 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || 617 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { 618 TryCand.Reason = NodeOrder; 619 return true; 620 } 621 } 622 return false; 623 } 624 625 GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy( 626 const MachineSchedContext *C) 627 : GCNSchedStrategy(C) { 628 SchedStages.push_back(GCNSchedStageID::MemoryClauseInitialSchedule); 629 } 630 631 /// GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as 632 /// much as possible. This is achieved by: 633 // 1. Prioritize clustered operations before stall latency heuristic. 634 // 2. Prioritize long-latency-load before stall latency heuristic. 635 /// 636 /// \param Cand provides the policy and current best candidate. 637 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. 638 /// \param Zone describes the scheduled zone that we are extending, or nullptr 639 /// if Cand is from a different zone than TryCand. 640 /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand) 641 bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand, 642 SchedCandidate &TryCand, 643 SchedBoundary *Zone) const { 644 // Initialize the candidate if needed. 645 if (!Cand.isValid()) { 646 TryCand.Reason = NodeOrder; 647 return true; 648 } 649 650 // Bias PhysReg Defs and copies to their uses and defined respectively. 651 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), 652 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) 653 return TryCand.Reason != NoCand; 654 655 if (DAG->isTrackingPressure()) { 656 // Avoid exceeding the target's limit. 657 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, 658 RegExcess, TRI, DAG->MF)) 659 return TryCand.Reason != NoCand; 660 661 // Avoid increasing the max critical pressure in the scheduled region. 662 if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, 663 TryCand, Cand, RegCritical, TRI, DAG->MF)) 664 return TryCand.Reason != NoCand; 665 } 666 667 // MaxMemoryClause-specific: We prioritize clustered instructions as we would 668 // get more benefit from clausing these memory instructions. 669 const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; 670 const ClusterInfo *TryCandCluster = TryCand.AtTop ? 
TopCluster : BotCluster;
671 if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
672 CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
673 Cluster))
674 return TryCand.Reason != NoCand;
675
676 // We only compare a subset of features when comparing nodes between the
677 // Top and Bottom boundaries. Some properties are simply incomparable; in many
678 // other instances we should only override the other boundary if something
679 // is a clearly good pick on one boundary. Skip heuristics that are more
680 // "tie-breaking" in nature.
681 bool SameBoundary = Zone != nullptr;
682 if (SameBoundary) {
683 // For loops that are acyclic path limited, aggressively schedule for
684 // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
685 // heuristics to take precedence.
686 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
687 tryLatency(TryCand, Cand, *Zone))
688 return TryCand.Reason != NoCand;
689
690 // MaxMemoryClause-specific: Prioritize long latency memory load
691 // instructions in top-bottom order to hide more latency. The mayLoad check
692 // is used to exclude store-like instructions, which we do not want to
693 // schedule too early.
694 bool TryMayLoad =
695 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
696 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
697
698 if (TryMayLoad || CandMayLoad) {
699 bool TryLongLatency =
700 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
701 bool CandLongLatency =
702 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
703
704 if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
705 Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
706 Cand, Stall))
707 return TryCand.Reason != NoCand;
708 }
709 // Prioritize instructions that read unbuffered resources by stall cycles.
710 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
711 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
712 return TryCand.Reason != NoCand;
713 }
714
715 if (SameBoundary) {
716 // Weak edges are for clustering and other constraints.
717 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
718 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
719 return TryCand.Reason != NoCand;
720 }
721
722 // Avoid increasing the max pressure of the entire region.
723 if (DAG->isTrackingPressure() &&
724 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
725 Cand, RegMax, TRI, DAG->MF))
726 return TryCand.Reason != NoCand;
727
728 if (SameBoundary) {
729 // Avoid critical resource consumption and balance the schedule.
730 TryCand.initResourceDelta(DAG, SchedModel);
731 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
732 TryCand, Cand, ResourceReduce))
733 return TryCand.Reason != NoCand;
734 if (tryGreater(TryCand.ResDelta.DemandedResources,
735 Cand.ResDelta.DemandedResources, TryCand, Cand,
736 ResourceDemand))
737 return TryCand.Reason != NoCand;
738
739 // Avoid serializing long latency dependence chains.
740 // For acyclic path limited loops, latency was already checked above.
741 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
742 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
743 return TryCand.Reason != NoCand;
744
745 // Fall through to original instruction order.
746 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
747 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
748 TryCand.Reason = NodeOrder;
749 return true;
750 }
751 }
752
753 return false;
754 }
755
756 GCNScheduleDAGMILive::GCNScheduleDAGMILive(
757 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
758 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
759 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
760 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
761 RegionLiveOuts(this, /*IsLiveOut=*/true) {
762
763 // We want regions with a single MI to be scheduled so that we can reason
764 // about them correctly during scheduling stages that move MIs between regions
765 // (e.g., rematerialization).
766 ScheduleSingleMIRegions = true;
767 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
768 if (RelaxedOcc) {
769 MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
770 if (MinOccupancy != StartingOccupancy)
771 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
772 << ".\n");
773 }
774 }
775
776 std::unique_ptr<GCNSchedStage>
777 GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
778 switch (SchedStageID) {
779 case GCNSchedStageID::OccInitialSchedule:
780 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
781 case GCNSchedStageID::UnclusteredHighRPReschedule:
782 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
783 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
784 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
785 case GCNSchedStageID::PreRARematerialize:
786 return std::make_unique<PreRARematStage>(SchedStageID, *this);
787 case GCNSchedStageID::ILPInitialSchedule:
788 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
789 case GCNSchedStageID::MemoryClauseInitialSchedule:
790 return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
791 *this);
792 }
793
794 llvm_unreachable("Unknown SchedStageID.");
795 }
796
797 void GCNScheduleDAGMILive::schedule() {
798 // Collect all scheduling regions. The actual scheduling is performed in
799 // GCNScheduleDAGMILive::finalizeSchedule.
800 Regions.push_back(std::pair(RegionBegin, RegionEnd));
801 }
802
803 GCNRegPressure
804 GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
805 GCNDownwardRPTracker RPTracker(*LIS);
806 RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
807 return RPTracker.moveMaxPressure();
808 }
809
810 static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
811 MachineBasicBlock::iterator RegionEnd) {
812 auto REnd = RegionEnd == RegionBegin->getParent()->end()
813 ? std::prev(RegionEnd)
814 : RegionEnd;
815 return &*skipDebugInstructionsBackward(REnd, RegionBegin);
816 }
817
818 void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
819 const MachineBasicBlock *MBB) {
820 GCNDownwardRPTracker RPTracker(*LIS);
821
822 // If the block has only one successor, then the live-ins of that successor
823 // are the live-outs of the current block. We can reuse the calculated live
824 // set if the successor will be sent to scheduling past the current block.
825
826 // However, due to a bug in LiveInterval analysis, it may happen that two
827 // predecessors of the same successor block have different lane bitmasks for
828 // a live-out register. Work around that by sticking to a one-to-one relationship,
829 // i.e.
one predecessor with one successor block. 830 const MachineBasicBlock *OnlySucc = nullptr; 831 if (MBB->succ_size() == 1) { 832 auto *Candidate = *MBB->succ_begin(); 833 if (!Candidate->empty() && Candidate->pred_size() == 1) { 834 SlotIndexes *Ind = LIS->getSlotIndexes(); 835 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate)) 836 OnlySucc = Candidate; 837 } 838 } 839 840 // Scheduler sends regions from the end of the block upwards. 841 size_t CurRegion = RegionIdx; 842 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) 843 if (Regions[CurRegion].first->getParent() != MBB) 844 break; 845 --CurRegion; 846 847 auto I = MBB->begin(); 848 auto LiveInIt = MBBLiveIns.find(MBB); 849 auto &Rgn = Regions[CurRegion]; 850 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); 851 if (LiveInIt != MBBLiveIns.end()) { 852 auto LiveIn = std::move(LiveInIt->second); 853 RPTracker.reset(*MBB->begin(), &LiveIn); 854 MBBLiveIns.erase(LiveInIt); 855 } else { 856 I = Rgn.first; 857 auto LRS = BBLiveInMap.lookup(NonDbgMI); 858 #ifdef EXPENSIVE_CHECKS 859 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); 860 #endif 861 RPTracker.reset(*I, &LRS); 862 } 863 864 for (;;) { 865 I = RPTracker.getNext(); 866 867 if (Regions[CurRegion].first == I || NonDbgMI == I) { 868 LiveIns[CurRegion] = RPTracker.getLiveRegs(); 869 RPTracker.clearMaxPressure(); 870 } 871 872 if (Regions[CurRegion].second == I) { 873 Pressure[CurRegion] = RPTracker.moveMaxPressure(); 874 if (CurRegion-- == RegionIdx) 875 break; 876 auto &Rgn = Regions[CurRegion]; 877 NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); 878 } 879 RPTracker.advanceToNext(); 880 RPTracker.advanceBeforeNext(); 881 } 882 883 if (OnlySucc) { 884 if (I != MBB->end()) { 885 RPTracker.advanceToNext(); 886 RPTracker.advance(MBB->end()); 887 } 888 RPTracker.advanceBeforeNext(); 889 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); 890 } 891 } 892 893 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> 894 GCNScheduleDAGMILive::getRegionLiveInMap() const { 895 assert(!Regions.empty()); 896 std::vector<MachineInstr *> RegionFirstMIs; 897 RegionFirstMIs.reserve(Regions.size()); 898 auto I = Regions.rbegin(), E = Regions.rend(); 899 do { 900 const MachineBasicBlock *MBB = I->first->getParent(); 901 auto *MI = &*skipDebugInstructionsForward(I->first, I->second); 902 RegionFirstMIs.push_back(MI); 903 do { 904 ++I; 905 } while (I != E && I->first->getParent() == MBB); 906 } while (I != E); 907 return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS); 908 } 909 910 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> 911 GCNScheduleDAGMILive::getRegionLiveOutMap() const { 912 assert(!Regions.empty()); 913 std::vector<MachineInstr *> RegionLastMIs; 914 RegionLastMIs.reserve(Regions.size()); 915 for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) 916 RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd)); 917 918 return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS); 919 } 920 921 void RegionPressureMap::buildLiveRegMap() { 922 IdxToInstruction.clear(); 923 924 RegionLiveRegMap = 925 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); 926 for (unsigned I = 0; I < DAG->Regions.size(); I++) { 927 MachineInstr *RegionKey = 928 IsLiveOut 929 ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second) 930 : &*DAG->Regions[I].first; 931 IdxToInstruction[I] = RegionKey; 932 } 933 } 934 935 void GCNScheduleDAGMILive::finalizeSchedule() { 936 // Start actual scheduling here. 
This function is called by the base 937 // MachineScheduler after all regions have been recorded by 938 // GCNScheduleDAGMILive::schedule(). 939 LiveIns.resize(Regions.size()); 940 Pressure.resize(Regions.size()); 941 RegionsWithHighRP.resize(Regions.size()); 942 RegionsWithExcessRP.resize(Regions.size()); 943 RegionsWithMinOcc.resize(Regions.size()); 944 RegionsWithIGLPInstrs.resize(Regions.size()); 945 RegionsWithHighRP.reset(); 946 RegionsWithExcessRP.reset(); 947 RegionsWithMinOcc.reset(); 948 RegionsWithIGLPInstrs.reset(); 949 950 runSchedStages(); 951 } 952 953 void GCNScheduleDAGMILive::runSchedStages() { 954 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); 955 956 if (!Regions.empty()) { 957 BBLiveInMap = getRegionLiveInMap(); 958 if (GCNTrackers) 959 RegionLiveOuts.buildLiveRegMap(); 960 } 961 962 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); 963 while (S.advanceStage()) { 964 auto Stage = createSchedStage(S.getCurrentStage()); 965 if (!Stage->initGCNSchedStage()) 966 continue; 967 968 for (auto Region : Regions) { 969 RegionBegin = Region.first; 970 RegionEnd = Region.second; 971 // Setup for scheduling the region and check whether it should be skipped. 972 if (!Stage->initGCNRegion()) { 973 Stage->advanceRegion(); 974 exitRegion(); 975 continue; 976 } 977 978 if (GCNTrackers) { 979 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker(); 980 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker(); 981 GCNRPTracker::LiveRegSet *RegionLiveIns = 982 &LiveIns[Stage->getRegionIdx()]; 983 984 reinterpret_cast<GCNRPTracker *>(DownwardTracker) 985 ->reset(MRI, *RegionLiveIns); 986 reinterpret_cast<GCNRPTracker *>(UpwardTracker) 987 ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx( 988 Stage->getRegionIdx())); 989 } 990 991 ScheduleDAGMILive::schedule(); 992 Stage->finalizeGCNRegion(); 993 } 994 995 Stage->finalizeGCNSchedStage(); 996 } 997 } 998 999 #ifndef NDEBUG 1000 raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { 1001 switch (StageID) { 1002 case GCNSchedStageID::OccInitialSchedule: 1003 OS << "Max Occupancy Initial Schedule"; 1004 break; 1005 case GCNSchedStageID::UnclusteredHighRPReschedule: 1006 OS << "Unclustered High Register Pressure Reschedule"; 1007 break; 1008 case GCNSchedStageID::ClusteredLowOccupancyReschedule: 1009 OS << "Clustered Low Occupancy Reschedule"; 1010 break; 1011 case GCNSchedStageID::PreRARematerialize: 1012 OS << "Pre-RA Rematerialize"; 1013 break; 1014 case GCNSchedStageID::ILPInitialSchedule: 1015 OS << "Max ILP Initial Schedule"; 1016 break; 1017 case GCNSchedStageID::MemoryClauseInitialSchedule: 1018 OS << "Max memory clause Initial Schedule"; 1019 break; 1020 } 1021 1022 return OS; 1023 } 1024 #endif 1025 1026 GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) 1027 : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF), 1028 MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {} 1029 1030 bool GCNSchedStage::initGCNSchedStage() { 1031 if (!DAG.LIS) 1032 return false; 1033 1034 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n"); 1035 return true; 1036 } 1037 1038 bool UnclusteredHighRPStage::initGCNSchedStage() { 1039 if (DisableUnclusterHighRP) 1040 return false; 1041 1042 if (!GCNSchedStage::initGCNSchedStage()) 1043 return false; 1044 1045 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none()) 1046 return false; 1047 1048 SavedMutations.swap(DAG.Mutations); 1049 DAG.addMutation( 1050 
createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
1051
1052 InitialOccupancy = DAG.MinOccupancy;
1053 // Aggressively try to reduce register pressure in the unclustered high RP
1054 // stage. Temporarily increase the occupancy target in the region.
1055 S.SGPRLimitBias = S.HighRPSGPRBias;
1056 S.VGPRLimitBias = S.HighRPVGPRBias;
1057 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1058 MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
1059
1060 LLVM_DEBUG(
1061 dbgs()
1062 << "Retrying function scheduling without clustering. "
1063 "Aggressively try to reduce register pressure to achieve occupancy "
1064 << DAG.MinOccupancy << ".\n");
1065
1066 return true;
1067 }
1068
1069 bool ClusteredLowOccStage::initGCNSchedStage() {
1070 if (DisableClusteredLowOccupancy)
1071 return false;
1072
1073 if (!GCNSchedStage::initGCNSchedStage())
1074 return false;
1075
1076 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1077 // been dropped. All regions will have already been scheduled with the ideal
1078 // occupancy targets.
1079 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1080 return false;
1081
1082 LLVM_DEBUG(
1083 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1084 << DAG.MinOccupancy << ".\n");
1085 return true;
1086 }
1087
1088 /// Allows easy filtering of this stage's debug output.
1089 #define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
1090
1091 bool PreRARematStage::initGCNSchedStage() {
1092 // FIXME: This pass will invalidate the cached BBLiveInMap and MBBLiveIns for
1093 // regions in between the defs and the region we sank the def to. This will
1094 // need to be fixed if there is another pass after this pass.
1095 assert(!S.hasNextStage());
1096
1097 if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() ||
1098 DAG.Regions.size() == 1)
1099 return false;
1100
1101 // Before performing any IR modification, record the parent region of each MI
1102 // and the parent MBB of each region.
1103 const unsigned NumRegions = DAG.Regions.size();
1104 RegionBB.reserve(NumRegions);
1105 for (unsigned I = 0; I < NumRegions; ++I) {
1106 RegionBoundaries Region = DAG.Regions[I];
1107 for (auto MI = Region.first; MI != Region.second; ++MI)
1108 MIRegion.insert({&*MI, I});
1109 RegionBB.push_back(Region.first->getParent());
1110 }
1111
1112 if (!canIncreaseOccupancyOrReduceSpill())
1113 return false;
1114
1115 // Rematerialize identified instructions and update the scheduler's state.
1116 rematerialize();
1117 if (GCNTrackers)
1118 DAG.RegionLiveOuts.buildLiveRegMap();
1119 REMAT_DEBUG(
1120 dbgs() << "Retrying function scheduling with new min.
occupancy of " 1121 << AchievedOcc << " from rematerializing (original was " 1122 << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n"); 1123 if (AchievedOcc > DAG.MinOccupancy) { 1124 DAG.MinOccupancy = AchievedOcc; 1125 SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 1126 MFI.increaseOccupancy(MF, DAG.MinOccupancy); 1127 } 1128 return true; 1129 } 1130 1131 void GCNSchedStage::finalizeGCNSchedStage() { 1132 DAG.finishBlock(); 1133 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n"); 1134 } 1135 1136 void UnclusteredHighRPStage::finalizeGCNSchedStage() { 1137 SavedMutations.swap(DAG.Mutations); 1138 S.SGPRLimitBias = S.VGPRLimitBias = 0; 1139 if (DAG.MinOccupancy > InitialOccupancy) { 1140 for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) 1141 DAG.RegionsWithMinOcc[IDX] = 1142 DAG.Pressure[IDX].getOccupancy( 1143 DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy; 1144 1145 LLVM_DEBUG(dbgs() << StageID 1146 << " stage successfully increased occupancy to " 1147 << DAG.MinOccupancy << '\n'); 1148 } 1149 1150 GCNSchedStage::finalizeGCNSchedStage(); 1151 } 1152 1153 bool GCNSchedStage::initGCNRegion() { 1154 // Check whether this new region is also a new block. 1155 if (DAG.RegionBegin->getParent() != CurrentMBB) 1156 setupNewBlock(); 1157 1158 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end()); 1159 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs); 1160 1161 // Skip empty scheduling regions (0 or 1 schedulable instructions). 1162 if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end())) 1163 return false; 1164 1165 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); 1166 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB) 1167 << " " << CurrentMBB->getName() 1168 << "\n From: " << *DAG.begin() << " To: "; 1169 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd; 1170 else dbgs() << "End"; 1171 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); 1172 1173 // Save original instruction order before scheduling for possible revert. 1174 Unsched.clear(); 1175 Unsched.reserve(DAG.NumRegionInstrs); 1176 if (StageID == GCNSchedStageID::OccInitialSchedule || 1177 StageID == GCNSchedStageID::ILPInitialSchedule) { 1178 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII); 1179 for (auto &I : DAG) { 1180 Unsched.push_back(&I); 1181 if (SII->isIGLPMutationOnly(I.getOpcode())) 1182 DAG.RegionsWithIGLPInstrs[RegionIdx] = true; 1183 } 1184 } else { 1185 for (auto &I : DAG) 1186 Unsched.push_back(&I); 1187 } 1188 1189 PressureBefore = DAG.Pressure[RegionIdx]; 1190 1191 LLVM_DEBUG( 1192 dbgs() << "Pressure before scheduling:\nRegion live-ins:" 1193 << print(DAG.LiveIns[RegionIdx], DAG.MRI) 1194 << "Region live-in pressure: " 1195 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx])) 1196 << "Region register pressure: " << print(PressureBefore)); 1197 1198 S.HasHighPressure = false; 1199 S.KnownExcessRP = isRegionWithExcessRP(); 1200 1201 if (DAG.RegionsWithIGLPInstrs[RegionIdx] && 1202 StageID != GCNSchedStageID::UnclusteredHighRPReschedule) { 1203 SavedMutations.clear(); 1204 SavedMutations.swap(DAG.Mutations); 1205 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule || 1206 StageID == GCNSchedStageID::ILPInitialSchedule; 1207 DAG.addMutation(createIGroupLPDAGMutation( 1208 IsInitialStage ? 
AMDGPU::SchedulingPhase::Initial
1209 : AMDGPU::SchedulingPhase::PreRAReentry));
1210 }
1211
1212 return true;
1213 }
1214
1215 bool UnclusteredHighRPStage::initGCNRegion() {
1216 // Only reschedule regions with the minimum occupancy or regions that may have
1217 // spilling (excess register pressure).
1218 if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
1219 DAG.MinOccupancy <= InitialOccupancy) &&
1220 !DAG.RegionsWithExcessRP[RegionIdx])
1221 return false;
1222
1223 return GCNSchedStage::initGCNRegion();
1224 }
1225
1226 bool ClusteredLowOccStage::initGCNRegion() {
1227 // We may need to reschedule this region if it wasn't rescheduled in the last
1228 // stage, or if we found it was testing critical register pressure limits in
1229 // the unclustered reschedule stage. The latter is because we may not have been
1230 // able to raise the min occupancy in the previous stage, so the region may be
1231 // overly constrained even if it was already rescheduled.
1232 if (!DAG.RegionsWithHighRP[RegionIdx])
1233 return false;
1234
1235 return GCNSchedStage::initGCNRegion();
1236 }
1237
1238 bool PreRARematStage::initGCNRegion() {
1239 return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1240 }
1241
1242 void GCNSchedStage::setupNewBlock() {
1243 if (CurrentMBB)
1244 DAG.finishBlock();
1245
1246 CurrentMBB = DAG.RegionBegin->getParent();
1247 DAG.startBlock(CurrentMBB);
1248 // Get the real RP for the region if it hasn't been calculated before. After
1249 // the initial schedule stage, real RP will be collected after scheduling.
1250 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1251 StageID == GCNSchedStageID::ILPInitialSchedule ||
1252 StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
1253 DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1254 }
1255
1256 void GCNSchedStage::finalizeGCNRegion() {
1257 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1258 if (S.HasHighPressure)
1259 DAG.RegionsWithHighRP[RegionIdx] = true;
1260
1261 // Revert scheduling if we have dropped occupancy or there is some other
1262 // reason that the original schedule is better.
1263 checkScheduling();
1264
1265 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1266 StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
1267 SavedMutations.swap(DAG.Mutations);
1268
1269 DAG.exitRegion();
1270 advanceRegion();
1271 }
1272
1273 void GCNSchedStage::checkScheduling() {
1274 // Check the results of scheduling.
1275 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1276
1277 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1278 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1279
1280 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1281
1282 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1283 PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1284 DAG.Pressure[RegionIdx] = PressureAfter;
1285 DAG.RegionsWithMinOcc[RegionIdx] =
1286 PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
1287 DAG.MinOccupancy;
1288
1289 // Early out if we have achieved the occupancy target.
1290 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); 1291 return; 1292 } 1293 1294 unsigned TargetOccupancy = std::min( 1295 S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second); 1296 unsigned WavesAfter = std::min( 1297 TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize)); 1298 unsigned WavesBefore = std::min( 1299 TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize)); 1300 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore 1301 << ", after " << WavesAfter << ".\n"); 1302 1303 // We may not be able to keep the current target occupancy because of the just 1304 // scheduled region. We might still be able to revert scheduling if the 1305 // occupancy before was higher, or if the current schedule has register 1306 // pressure higher than the excess limits which could lead to more spilling. 1307 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); 1308 1309 // Allow memory bound functions to drop to 4 waves if not limited by an 1310 // attribute. 1311 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy && 1312 WavesAfter >= MFI.getMinAllowedOccupancy()) { 1313 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " 1314 << MFI.getMinAllowedOccupancy() << " waves\n"); 1315 NewOccupancy = WavesAfter; 1316 } 1317 1318 if (NewOccupancy < DAG.MinOccupancy) { 1319 DAG.MinOccupancy = NewOccupancy; 1320 MFI.limitOccupancy(DAG.MinOccupancy); 1321 DAG.RegionsWithMinOcc.reset(); 1322 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " 1323 << DAG.MinOccupancy << ".\n"); 1324 } 1325 // The maximum number of arch VGPR on non-unified register file, or the 1326 // maximum VGPR + AGPR in the unified register file case. 1327 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); 1328 // The maximum number of arch VGPR for both unified and non-unified register 1329 // file. 1330 unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs()); 1331 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); 1332 1333 if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs || 1334 PressureAfter.getArchVGPRNum() > MaxArchVGPRs || 1335 PressureAfter.getAGPRNum() > MaxArchVGPRs || 1336 PressureAfter.getSGPRNum() > MaxSGPRs) { 1337 DAG.RegionsWithHighRP[RegionIdx] = true; 1338 DAG.RegionsWithExcessRP[RegionIdx] = true; 1339 } 1340 1341 // Revert if this region's schedule would cause a drop in occupancy or 1342 // spilling. 
1343 if (shouldRevertScheduling(WavesAfter)) { 1344 revertScheduling(); 1345 } else { 1346 DAG.Pressure[RegionIdx] = PressureAfter; 1347 DAG.RegionsWithMinOcc[RegionIdx] = 1348 PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == 1349 DAG.MinOccupancy; 1350 } 1351 } 1352 1353 unsigned 1354 GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, 1355 DenseMap<unsigned, unsigned> &ReadyCycles, 1356 const TargetSchedModel &SM) { 1357 unsigned ReadyCycle = CurrCycle; 1358 for (auto &D : SU.Preds) { 1359 if (D.isAssignedRegDep()) { 1360 MachineInstr *DefMI = D.getSUnit()->getInstr(); 1361 unsigned Latency = SM.computeInstrLatency(DefMI); 1362 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum]; 1363 ReadyCycle = std::max(ReadyCycle, DefReady + Latency); 1364 } 1365 } 1366 ReadyCycles[SU.NodeNum] = ReadyCycle; 1367 return ReadyCycle; 1368 } 1369 1370 #ifndef NDEBUG 1371 struct EarlierIssuingCycle { 1372 bool operator()(std::pair<MachineInstr *, unsigned> A, 1373 std::pair<MachineInstr *, unsigned> B) const { 1374 return A.second < B.second; 1375 } 1376 }; 1377 1378 static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>, 1379 EarlierIssuingCycle> &ReadyCycles) { 1380 if (ReadyCycles.empty()) 1381 return; 1382 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber(); 1383 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum 1384 << " ##################\n# Cycle #\t\t\tInstruction " 1385 " " 1386 " \n"; 1387 unsigned IPrev = 1; 1388 for (auto &I : ReadyCycles) { 1389 if (I.second > IPrev + 1) 1390 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev 1391 << " CYCLES DETECTED ******************************\n\n"; 1392 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n"; 1393 IPrev = I.second; 1394 } 1395 } 1396 #endif 1397 1398 ScheduleMetrics 1399 GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) { 1400 #ifndef NDEBUG 1401 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> 1402 ReadyCyclesSorted; 1403 #endif 1404 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); 1405 unsigned SumBubbles = 0; 1406 DenseMap<unsigned, unsigned> ReadyCycles; 1407 unsigned CurrCycle = 0; 1408 for (auto &SU : InputSchedule) { 1409 unsigned ReadyCycle = 1410 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM); 1411 SumBubbles += ReadyCycle - CurrCycle; 1412 #ifndef NDEBUG 1413 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle)); 1414 #endif 1415 CurrCycle = ++ReadyCycle; 1416 } 1417 #ifndef NDEBUG 1418 LLVM_DEBUG( 1419 printScheduleModel(ReadyCyclesSorted); 1420 dbgs() << "\n\t" 1421 << "Metric: " 1422 << (SumBubbles 1423 ? 
(SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle 1424 : 1) 1425 << "\n\n"); 1426 #endif 1427 1428 return ScheduleMetrics(CurrCycle, SumBubbles); 1429 } 1430 1431 ScheduleMetrics 1432 GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) { 1433 #ifndef NDEBUG 1434 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> 1435 ReadyCyclesSorted; 1436 #endif 1437 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); 1438 unsigned SumBubbles = 0; 1439 DenseMap<unsigned, unsigned> ReadyCycles; 1440 unsigned CurrCycle = 0; 1441 for (auto &MI : DAG) { 1442 SUnit *SU = DAG.getSUnit(&MI); 1443 if (!SU) 1444 continue; 1445 unsigned ReadyCycle = 1446 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM); 1447 SumBubbles += ReadyCycle - CurrCycle; 1448 #ifndef NDEBUG 1449 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle)); 1450 #endif 1451 CurrCycle = ++ReadyCycle; 1452 } 1453 #ifndef NDEBUG 1454 LLVM_DEBUG( 1455 printScheduleModel(ReadyCyclesSorted); 1456 dbgs() << "\n\t" 1457 << "Metric: " 1458 << (SumBubbles 1459 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle 1460 : 1) 1461 << "\n\n"); 1462 #endif 1463 1464 return ScheduleMetrics(CurrCycle, SumBubbles); 1465 } 1466 1467 bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { 1468 if (WavesAfter < DAG.MinOccupancy) 1469 return true; 1470 1471 // For dynamic VGPR mode, we don't want to waste any VGPR blocks. 1472 if (DAG.MFI.isDynamicVGPREnabled()) { 1473 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( 1474 &ST, DAG.MFI.getDynamicVGPRBlockSize(), 1475 PressureBefore.getVGPRNum(false)); 1476 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( 1477 &ST, DAG.MFI.getDynamicVGPRBlockSize(), 1478 PressureAfter.getVGPRNum(false)); 1479 if (BlocksAfter > BlocksBefore) 1480 return true; 1481 } 1482 1483 return false; 1484 } 1485 1486 bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { 1487 if (PressureAfter == PressureBefore) 1488 return false; 1489 1490 if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) 1491 return true; 1492 1493 if (mayCauseSpilling(WavesAfter)) 1494 return true; 1495 1496 return false; 1497 } 1498 1499 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { 1500 // If RP is not reduced in the unclustered reschedule stage, revert to the 1501 // old schedule. 1502 if ((WavesAfter <= 1503 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) && 1504 mayCauseSpilling(WavesAfter)) || 1505 GCNSchedStage::shouldRevertScheduling(WavesAfter)) { 1506 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); 1507 return true; 1508 } 1509 1510 // Do not attempt to relax schedule even more if we are already spilling. 
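//
// Otherwise weigh the occupancy gain against the latency (bubble) metric
// computed below. Roughly:
//   Profit = (WavesAfter / WavesBefore) *
//            ((OldMetric + ScheduleMetricBias) / NewMetric),
// expressed in units of ScheduleMetrics::ScaleFactor; the new schedule is kept
// only if Profit reaches that scale factor.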
1511 if (isRegionWithExcessRP()) 1512 return false; 1513 1514 LLVM_DEBUG( 1515 dbgs() 1516 << "\n\t *** In shouldRevertScheduling ***\n" 1517 << " *********** BEFORE UnclusteredHighRPStage ***********\n"); 1518 ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits); 1519 LLVM_DEBUG( 1520 dbgs() 1521 << "\n *********** AFTER UnclusteredHighRPStage ***********\n"); 1522 ScheduleMetrics MAfter = getScheduleMetrics(DAG); 1523 unsigned OldMetric = MBefore.getMetric(); 1524 unsigned NewMetric = MAfter.getMetric(); 1525 unsigned WavesBefore = std::min( 1526 S.getTargetOccupancy(), 1527 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize())); 1528 unsigned Profit = 1529 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * 1530 ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / 1531 NewMetric) / 1532 ScheduleMetrics::ScaleFactor; 1533 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after " 1534 << MAfter << "Profit: " << Profit << "\n"); 1535 return Profit < ScheduleMetrics::ScaleFactor; 1536 } 1537 1538 bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { 1539 if (PressureAfter == PressureBefore) 1540 return false; 1541 1542 if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) 1543 return true; 1544 1545 if (mayCauseSpilling(WavesAfter)) 1546 return true; 1547 1548 return false; 1549 } 1550 1551 bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { 1552 return GCNSchedStage::shouldRevertScheduling(WavesAfter) || 1553 mayCauseSpilling(WavesAfter) || 1554 (IncreaseOccupancy && WavesAfter < TargetOcc); 1555 } 1556 1557 bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { 1558 if (mayCauseSpilling(WavesAfter)) 1559 return true; 1560 1561 return false; 1562 } 1563 1564 bool MemoryClauseInitialScheduleStage::shouldRevertScheduling( 1565 unsigned WavesAfter) { 1566 return mayCauseSpilling(WavesAfter); 1567 } 1568 1569 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { 1570 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() && 1571 !PressureAfter.less(MF, PressureBefore)) { 1572 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); 1573 return true; 1574 } 1575 1576 return false; 1577 } 1578 1579 void GCNSchedStage::revertScheduling() { 1580 DAG.RegionsWithMinOcc[RegionIdx] = 1581 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) == 1582 DAG.MinOccupancy; 1583 LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); 1584 DAG.RegionEnd = DAG.RegionBegin; 1585 int SkippedDebugInstr = 0; 1586 for (MachineInstr *MI : Unsched) { 1587 if (MI->isDebugInstr()) { 1588 ++SkippedDebugInstr; 1589 continue; 1590 } 1591 1592 if (MI->getIterator() != DAG.RegionEnd) { 1593 DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI); 1594 if (!MI->isDebugInstr()) 1595 DAG.LIS->handleMove(*MI, true); 1596 } 1597 1598 // Reset read-undef flags and update them later. 1599 for (auto &Op : MI->all_defs()) 1600 Op.setIsUndef(false); 1601 RegisterOperands RegOpers; 1602 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false); 1603 if (!MI->isDebugInstr()) { 1604 if (DAG.ShouldTrackLaneMasks) { 1605 // Adjust liveness and add missing dead+read-undef flags. 1606 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot(); 1607 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI); 1608 } else { 1609 // Adjust for missing dead-def flags. 
1610         RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1611       }
1612     }
1613     DAG.RegionEnd = MI->getIterator();
1614     ++DAG.RegionEnd;
1615     LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1616   }
1617 
1618   // After reverting the schedule, debug instrs will now be at the end of the
1619   // block and RegionEnd will point to the first debug instr. Increment
1620   // RegionEnd past debug instrs to the actual end of the scheduling region.
1621   while (SkippedDebugInstr-- > 0)
1622     ++DAG.RegionEnd;
1623 
1624   // If the Unsched.front() instruction is a debug instruction, this will
1625   // actually shrink the region since we moved all debug instructions to the
1626   // end of the block. Find the first instruction that is not a debug instruction.
1627   DAG.RegionBegin = Unsched.front()->getIterator();
1628   if (DAG.RegionBegin->isDebugInstr()) {
1629     for (MachineInstr *MI : Unsched) {
1630       if (MI->isDebugInstr())
1631         continue;
1632       DAG.RegionBegin = MI->getIterator();
1633       break;
1634     }
1635   }
1636 
1637   // Then move the debug instructions back into their correct place and set
1638   // RegionBegin and RegionEnd if needed.
1639   DAG.placeDebugValues();
1640 
1641   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1642 }
1643 
1644 bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
1645                                          SlotIndex OriginalIdx,
1646                                          SlotIndex RematIdx) const {
1647 
1648   LiveIntervals *LIS = DAG.LIS;
1649   MachineRegisterInfo &MRI = DAG.MRI;
1650   OriginalIdx = OriginalIdx.getRegSlot(true);
1651   RematIdx = std::max(RematIdx, RematIdx.getRegSlot(true));
1652   for (const MachineOperand &MO : InstToRemat->operands()) {
1653     if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
1654       continue;
1655 
1656     if (!MO.getReg().isVirtual()) {
1657       // Do not attempt to reason about PhysRegs.
1658       // TODO: better analysis of PhysReg liveness
1659       if (!DAG.MRI.isConstantPhysReg(MO.getReg()) &&
1660           !DAG.TII->isIgnorableUse(MO))
1661         return false;
1662 
1663       // Constant PhysRegs and IgnorableUses are okay
1664       continue;
1665     }
1666 
1667     LiveInterval &LI = LIS->getInterval(MO.getReg());
1668     const VNInfo *OVNI = LI.getVNInfoAt(OriginalIdx);
1669     assert(OVNI);
1670 
1671     // Don't allow rematerialization immediately after the original def.
1672     // It would be incorrect if InstToRemat redefines the register.
1673     // See PR14098.
1674     if (SlotIndex::isSameInstr(OriginalIdx, RematIdx))
1675       return false;
1676 
1677     if (OVNI != LI.getVNInfoAt(RematIdx))
1678       return false;
1679 
1680     // Check that the subrange is live at RematIdx.
1681     if (LI.hasSubRanges()) {
1682       const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
1683       unsigned SubReg = MO.getSubReg();
1684       LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
1685                               : MRI.getMaxLaneMaskForVReg(MO.getReg());
1686       for (LiveInterval::SubRange &SR : LI.subranges()) {
1687         if ((SR.LaneMask & LM).none())
1688           continue;
1689         if (!SR.liveAt(RematIdx))
1690           return false;
1691 
1692         // Early exit if all used lanes are checked. No need to continue.
1693         LM &= ~SR.LaneMask;
1694         if (LM.none())
1695           break;
1696       }
1697     }
1698   }
1699   return true;
1700 }
1701 
1702 bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
1703   REMAT_DEBUG({
1704     dbgs() << "Collecting rematerializable instructions in ";
1705     MF.getFunction().printAsOperand(dbgs(), false);
1706     dbgs() << '\n';
1707   });
1708 
1709   // Maps optimizable regions (i.e., regions at minimum and register-limited
1710   // occupancy, or regions with spilling) to the target RP we would like to
1711   // reach.
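  // Keys are indices into DAG.Regions; each GCNRPTarget tracks the region's
  // current pressure against the SGPR/VGPR limits we are aiming for.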
1712   DenseMap<unsigned, GCNRPTarget> OptRegions;
1713   const Function &F = MF.getFunction();
1714   unsigned DynamicVGPRBlockSize =
1715       MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
1716 
1717   std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
1718   const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
1719   const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
1720   const unsigned MaxSGPRsIncOcc =
1721       ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
1722   const unsigned MaxVGPRsIncOcc =
1723       ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
1724   IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
1725 
1726   // Collect optimizable regions. If there is spilling in any region we will
1727   // just try to reduce spilling. Otherwise, we will try to increase occupancy
1728   // by one in the whole function.
1729   for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1730     GCNRegPressure &RP = DAG.Pressure[I];
1731     // We allow ArchVGPR or AGPR savings to count as savings of the other kind
1732     // of VGPR only when trying to eliminate spilling. We cannot do this when
1733     // trying to increase occupancy since VGPR class swaps only occur later in
1734     // the register allocator, i.e., the scheduler will not be able to reason
1735     // about these savings and will not report an increase in the achievable
1736     // occupancy, triggering rollbacks.
1737     GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP,
1738                        /*CombineVGPRSavings=*/true);
1739     if (!Target.satisfied() && IncreaseOccupancy) {
1740       // There is spilling in the region and we were so far trying to increase
1741       // occupancy. Stop trying that and focus on reducing spilling.
1742       IncreaseOccupancy = false;
1743       OptRegions.clear();
1744     } else if (IncreaseOccupancy) {
1745       // There is no spilling in the region, try to increase occupancy.
1746       Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP,
1747                            /*CombineVGPRSavings=*/false);
1748     }
1749     if (!Target.satisfied())
1750       OptRegions.insert({I, Target});
1751   }
1752   if (OptRegions.empty())
1753     return false;
1754 
1755 #ifndef NDEBUG
1756   if (IncreaseOccupancy) {
1757     REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy
1758                        << ") in regions:\n");
1759   } else {
1760     REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy ("
1761                        << WavesPerEU.first << ") in regions:\n");
1762   }
1763   for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1764     if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
1765       REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n');
1766   }
1767 #endif
1768 
1769   // When we are reducing spilling, the target is the minimum target number of
1770   // waves/EU determined by the subtarget. In cases where either one of
1771   // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" is set on the function, the current
1772   // minimum region occupancy may be higher than the latter.
1773   TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1
1774                                 : std::max(DAG.MinOccupancy, WavesPerEU.first);
1775 
1776   // Accounts for a reduction in RP in an optimizable region. Returns whether we
1777   // estimate that we have identified enough rematerialization opportunities to
1778   // achieve our goal, and sets Progress to true when this particular reduction
1779   // in pressure was helpful toward that goal.
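  // OptIt must be a valid iterator into OptRegions; once a region's target is
  // satisfied the region is erased from OptRegions, so an empty map means all
  // targets have been met.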
1780   auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
1781                               bool &Progress) -> bool {
1782     GCNRPTarget &Target = OptIt->getSecond();
1783     if (!Target.isSaveBeneficial(Reg, DAG.MRI))
1784       return false;
1785     Progress = true;
1786     Target.saveReg(Reg, Mask, DAG.MRI);
1787     if (Target.satisfied())
1788       OptRegions.erase(OptIt->getFirst());
1789     return OptRegions.empty();
1790   };
1791 
1792   // We need up-to-date live-out info to query live-out register masks in
1793   // regions containing rematerializable instructions.
1794   DAG.RegionLiveOuts.buildLiveRegMap();
1795 
1796   // Cache the set of registers that are going to be rematerialized.
1797   DenseSet<unsigned> RematRegs;
1798 
1799   // Identify rematerializable instructions in the function.
1800   for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1801     auto Region = DAG.Regions[I];
1802     for (auto MI = Region.first; MI != Region.second; ++MI) {
1803       // The instruction must be trivially rematerializable.
1804       MachineInstr &DefMI = *MI;
1805       if (!isTriviallyReMaterializable(DefMI))
1806         continue;
1807 
1808       // We only support rematerializing virtual registers with one definition.
1809       Register Reg = DefMI.getOperand(0).getReg();
1810       if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
1811         continue;
1812 
1813       // We only care to rematerialize the instruction if it has a single
1814       // non-debug user in a different region. The using MI may not belong to a
1815       // region if it is a lone region terminator.
1816       MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
1817       if (!UseMI)
1818         continue;
1819       auto UseRegion = MIRegion.find(UseMI);
1820       if (UseRegion != MIRegion.end() && UseRegion->second == I)
1821         continue;
1822 
1823       // Do not rematerialize an instruction if it uses or is used by an
1824       // instruction that we have designated for rematerialization.
1825       // FIXME: Allow for rematerialization chains: this requires 1. updating
1826       // remat points to account for uses that are rematerialized, and 2. either
1827       // rematerializing the candidates in careful ordering, or deferring the
1828       // MBB RP walk until the entire chain has been rematerialized.
1829       if (Rematerializations.contains(UseMI) ||
1830           llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
1831             return MO.isReg() && RematRegs.contains(MO.getReg());
1832           }))
1833         continue;
1834 
1835       // Do not rematerialize an instruction if it uses registers that aren't
1836       // available at its use. This ensures that we are not extending any live
1837       // range while rematerializing.
1838       SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI);
1839       SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
1840       if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
1841         continue;
1842 
1843       REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
1844       RematInstruction &Remat =
1845           Rematerializations.try_emplace(&DefMI, UseMI).first->second;
1846 
1847       bool RematUseful = false;
1848       if (auto It = OptRegions.find(I); It != OptRegions.end()) {
1849         // Optimistically consider that moving the instruction out of its
1850         // defining region will reduce RP in the latter; this assumes that
1851         // maximum RP in the region is reached somewhere between the defining
1852         // instruction and the end of the region.
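        // If that assumption turns out to be wrong it is harmless for
        // correctness: the defining region's pressure is fully recomputed in
        // rematerialize() before the achieved occupancy is evaluated.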
1853         REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
1854         LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
1855         if (ReduceRPInRegion(It, Reg, Mask, RematUseful))
1856           return true;
1857       }
1858 
1859       for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
1860         // We are only collecting regions in which the register is a live-in
1861         // (and may be live-through).
1862         auto It = DAG.LiveIns[LIRegion].find(Reg);
1863         if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
1864           continue;
1865         Remat.LiveInRegions.insert(LIRegion);
1866 
1867         // Account for the reduction in RP due to the rematerialization in an
1868         // optimizable region in which the defined register is a live-in. This
1869         // is exact for live-through regions but optimistic in the using region,
1870         // where RP is actually reduced only if maximum RP is reached somewhere
1871         // between the beginning of the region and the rematerializable
1872         // instruction's use.
1873         if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
1874           REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
1875           if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg],
1876                                RematUseful))
1877             return true;
1878         }
1879       }
1880 
1881       // If the instruction is not a live-in or live-out in any optimizable
1882       // region, then there is no point in rematerializing it.
1883       if (!RematUseful) {
1884         Rematerializations.pop_back();
1885         REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
1886       } else {
1887         RematRegs.insert(Reg);
1888       }
1889     }
1890   }
1891 
1892   if (IncreaseOccupancy) {
1893     // We were trying to increase occupancy but failed; abort the stage.
1894     REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
1895     Rematerializations.clear();
1896     return false;
1897   }
1898   REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
1899   return !Rematerializations.empty();
1900 }
1901 
1902 void PreRARematStage::rematerialize() {
1903   const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
1904 
1905   // Collect regions whose RP changes in an unpredictable way; we will have to
1906   // fully recompute their RP after all rematerializations.
1907   DenseSet<unsigned> RecomputeRP;
1908 
1909   // Rematerialize all instructions.
1910   for (auto &[DefMI, Remat] : Rematerializations) {
1911     MachineBasicBlock::iterator InsertPos(Remat.UseMI);
1912     Register Reg = DefMI->getOperand(0).getReg();
1913     unsigned SubReg = DefMI->getOperand(0).getSubReg();
1914     unsigned DefRegion = MIRegion.at(DefMI);
1915 
1916     // Rematerialize DefMI to its use block.
1917     TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI,
1918                        *DAG.TRI);
1919     Remat.RematMI = &*std::prev(InsertPos);
1920     Remat.RematMI->getOperand(0).setSubReg(SubReg);
1921     DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
1922 
1923     // Update region boundaries in the regions we sank from (remove the defining
1924     // MI) and to (insert the MI rematerialized in the use block). Only then can
1925     // we erase the original MI.
1926     DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
1927     auto UseRegion = MIRegion.find(Remat.UseMI);
1928     if (UseRegion != MIRegion.end()) {
1929       DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
1930                                  Remat.RematMI);
1931     }
1932     DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
1933     DefMI->eraseFromParent();
1934 
1935     // Collect all regions impacted by the rematerialization and update their
1936     // live-in/RP information.
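    // The original pressure of each impacted region is saved in
    // ImpactedRegions so that finalizeGCNSchedStage() can restore it if the
    // rematerializations end up being rolled back.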
1937 for (unsigned I : Remat.LiveInRegions) { 1938 ImpactedRegions.insert({I, DAG.Pressure[I]}); 1939 GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I]; 1940 1941 #ifdef EXPENSIVE_CHECKS 1942 // All uses are known to be available / live at the remat point. Thus, the 1943 // uses should already be live in to the region. 1944 for (MachineOperand &MO : DefMI->operands()) { 1945 if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) 1946 continue; 1947 1948 Register UseReg = MO.getReg(); 1949 if (!UseReg.isVirtual()) 1950 continue; 1951 1952 LiveInterval &LI = DAG.LIS->getInterval(UseReg); 1953 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg()); 1954 if (LI.hasSubRanges() && MO.getSubReg()) 1955 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg()); 1956 1957 LaneBitmask LiveInMask = RegionLiveIns.at(UseReg); 1958 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM); 1959 // If this register has lanes not covered by the LiveIns, be sure they 1960 // do not map to any subrange. ref: 1961 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange 1962 if (UncoveredLanes.any()) { 1963 assert(LI.hasSubRanges()); 1964 for (LiveInterval::SubRange &SR : LI.subranges()) 1965 assert((SR.LaneMask & UncoveredLanes).none()); 1966 } 1967 } 1968 #endif 1969 1970 // The register is no longer a live-in in all regions but the one that 1971 // contains the single use. In live-through regions, maximum register 1972 // pressure decreases predictably so we can directly update it. In the 1973 // using region, maximum RP may or may not decrease, so we will mark it 1974 // for re-computation after all materializations have taken place. 1975 LaneBitmask PrevMask = RegionLiveIns[Reg]; 1976 RegionLiveIns.erase(Reg); 1977 RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask}); 1978 if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent()) 1979 DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI); 1980 else 1981 RecomputeRP.insert(I); 1982 } 1983 // RP in the region from which the instruction was rematerialized may or may 1984 // not decrease. 1985 ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]}); 1986 RecomputeRP.insert(DefRegion); 1987 1988 // Recompute live interval to reflect the register's rematerialization. 1989 Register RematReg = Remat.RematMI->getOperand(0).getReg(); 1990 DAG.LIS->removeInterval(RematReg); 1991 DAG.LIS->createAndComputeVirtRegInterval(RematReg); 1992 } 1993 1994 // All regions impacted by at least one rematerialization must be rescheduled. 1995 // Maximum pressure must also be recomputed for all regions where it changed 1996 // non-predictably and checked against the target occupancy. 1997 AchievedOcc = TargetOcc; 1998 for (auto &[I, OriginalRP] : ImpactedRegions) { 1999 bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; 2000 RescheduleRegions[I] = !IsEmptyRegion; 2001 if (!RecomputeRP.contains(I)) 2002 continue; 2003 2004 GCNRegPressure RP; 2005 if (IsEmptyRegion) { 2006 RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]); 2007 } else { 2008 GCNDownwardRPTracker RPT(*DAG.LIS); 2009 auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first, 2010 DAG.Regions[I].second); 2011 if (NonDbgMI == DAG.Regions[I].second) { 2012 // Region is non-empty but contains only debug instructions. 
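        // Debug instructions do not affect pressure, so the live-in set alone
        // determines the region's pressure.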
2013         RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
2014       } else {
2015         RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
2016         RPT.advance(DAG.Regions[I].second);
2017         RP = RPT.moveMaxPressure();
2018       }
2019     }
2020     DAG.Pressure[I] = RP;
2021     AchievedOcc = std::min(
2022         AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
2023                                              ->getDynamicVGPRBlockSize()));
2024   }
2025   REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
2026 }
2027 
2028 // Copied from MachineLICM
2029 bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
2030   if (!DAG.TII->isTriviallyReMaterializable(MI))
2031     return false;
2032 
2033   for (const MachineOperand &MO : MI.all_uses()) {
2034     // We can't remat physreg uses unless the register is constant or the use
2035     // is ignorable (e.g., the implicit exec use on VALU instructions).
2036     if (MO.getReg().isPhysical()) {
2037       if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
2038         continue;
2039       return false;
2040     }
2041   }
2042 
2043   return true;
2044 }
2045 
2046 void PreRARematStage::finalizeGCNSchedStage() {
2047   // We consider that reducing spilling is always beneficial, so we never roll
2048   // back rematerializations in such cases. It's also possible that rescheduling
2049   // lowers occupancy below the one achieved just through remats, in which case
2050   // we do not want to roll back either (the rescheduling was already reverted
2051   // in PreRARematStage::shouldRevertScheduling in such cases).
2052   unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
2053   if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
2054     return;
2055 
2056   REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
2057   const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2058 
2059   // Roll back the rematerializations.
2060   for (const auto &[DefMI, Remat] : Rematerializations) {
2061     MachineInstr &RematMI = *Remat.RematMI;
2062     unsigned DefRegion = MIRegion.at(DefMI);
2063     MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
2064     MachineBasicBlock *MBB = RegionBB[DefRegion];
2065     Register Reg = RematMI.getOperand(0).getReg();
2066     unsigned SubReg = RematMI.getOperand(0).getSubReg();
2067 
2068     // Re-rematerialize MI at the end of its original region. Note that it may
2069     // not be rematerialized exactly in the same position as originally within
2070     // the region, but it should not matter much.
2071     TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
2072     MachineInstr *NewMI = &*std::prev(InsertPos);
2073     NewMI->getOperand(0).setSubReg(SubReg);
2074     DAG.LIS->InsertMachineInstrInMaps(*NewMI);
2075 
2076     auto UseRegion = MIRegion.find(Remat.UseMI);
2077     if (UseRegion != MIRegion.end()) {
2078       DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
2079                                  nullptr);
2080     }
2081     DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
2082 
2083     // Erase rematerialized MI.
2084     DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
2085     RematMI.eraseFromParent();
2086 
2087     // Recompute the live interval for the re-rematerialized register.
2088     DAG.LIS->removeInterval(Reg);
2089     DAG.LIS->createAndComputeVirtRegInterval(Reg);
2090 
2091     // Re-add the register as a live-in in all regions it used to be one in.
2092     for (unsigned LIRegion : Remat.LiveInRegions)
2093       DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
2094   }
2095 
2096   // Reset RP in all impacted regions.
2097 for (auto &[I, OriginalRP] : ImpactedRegions) 2098 DAG.Pressure[I] = OriginalRP; 2099 2100 GCNSchedStage::finalizeGCNSchedStage(); 2101 } 2102 2103 void GCNScheduleDAGMILive::updateRegionBoundaries( 2104 RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI, 2105 MachineInstr *NewMI) { 2106 assert((!NewMI || NewMI != RegionBounds.second) && 2107 "cannot remove at region end"); 2108 2109 if (RegionBounds.first == RegionBounds.second) { 2110 assert(NewMI && "cannot remove from an empty region"); 2111 RegionBounds.first = NewMI; 2112 return; 2113 } 2114 2115 // We only care for modifications at the beginning of a non-empty region since 2116 // the upper region boundary is exclusive. 2117 if (MI != RegionBounds.first) 2118 return; 2119 if (!NewMI) 2120 RegionBounds.first = std::next(MI); // Removal 2121 else 2122 RegionBounds.first = NewMI; // Insertion 2123 } 2124 2125 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { 2126 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII); 2127 return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) { 2128 return SII->isIGLPMutationOnly(MI->getOpcode()); 2129 }); 2130 } 2131 2132 GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( 2133 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, 2134 bool RemoveKillFlags) 2135 : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} 2136 2137 void GCNPostScheduleDAGMILive::schedule() { 2138 HasIGLPInstrs = hasIGLPInstrs(this); 2139 if (HasIGLPInstrs) { 2140 SavedMutations.clear(); 2141 SavedMutations.swap(Mutations); 2142 addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA)); 2143 } 2144 2145 ScheduleDAGMI::schedule(); 2146 } 2147 2148 void GCNPostScheduleDAGMILive::finalizeSchedule() { 2149 if (HasIGLPInstrs) 2150 SavedMutations.swap(Mutations); 2151 2152 ScheduleDAGMI::finalizeSchedule(); 2153 } 2154