//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
///
/// This pass will apply multiple scheduling stages to the same function.
/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
/// entry point for the scheduling of those regions is
/// GCNScheduleDAGMILive::runSchedStages.
///
/// Generally, the reason for having multiple scheduling stages is to account
/// for the kernel-wide effect of register usage on occupancy. Usually, only a
/// few scheduling regions will have register pressure high enough to limit
/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
/// other regions.
///
//===----------------------------------------------------------------------===//

#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"

#define DEBUG_TYPE "machine-scheduler"

using namespace llvm;

static cl::opt<bool>
    DisableUnclusterHighRP("amdgpu-disable-unclustred-high-rp-reschedule",
                           cl::Hidden,
                           cl::desc("Disable unclustered high register pressure "
                                    "reduction scheduling stage."),
                           cl::init(false));

static cl::opt<unsigned> ScheduleMetricBias(
    "amdgpu-schedule-metric-bias", cl::Hidden,
    cl::desc(
        "Sets the bias which adds weight to occupancy vs latency. Set it to "
        "100 to chase the occupancy only."),
    cl::init(10));

static cl::opt<bool>
    RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
               cl::desc("Relax occupancy targets for kernels which are memory "
                        "bound (amdgpu-membound-threshold), or "
                        "Wave Limited (amdgpu-limit-wave-threshold)."),
               cl::init(false));

const unsigned ScheduleMetrics::ScaleFactor = 100;

GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
      HasHighPressure(false) {}

void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
  GenericScheduler::initialize(DAG);

  MF = &DAG->MF;

  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  SGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
  VGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);

  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  // Allow for lower occupancy targets if the kernel is wave limited or memory
  // bound, and the relaxed occupancy feature is enabled.
  TargetOccupancy =
      RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
  SGPRCriticalLimit =
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);

  if (!KnownExcessRP) {
    VGPRCriticalLimit =
        std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
  } else {
    // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result except
    // it returns a reasonably small number for targets with lots of VGPRs,
    // such as GFX10 and GFX11.
    LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
                         "VGPRCriticalLimit calculation method.\n");

    unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
    unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
    unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
    VGPRBudget = std::max(VGPRBudget, Granule);
    VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
  }

  // Subtract error margin and bias from register limits and avoid overflow.
  SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
  VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
  SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
  VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);

  LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
                    << ", VGPRExcessLimit = " << VGPRExcessLimit
                    << ", SGPRCriticalLimit = " << SGPRCriticalLimit
                    << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
}

void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                     bool AtTop,
                                     const RegPressureTracker &RPTracker,
                                     const SIRegisterInfo *SRI,
                                     unsigned SGPRPressure,
                                     unsigned VGPRPressure) {
  Cand.SU = SU;
  Cand.AtTop = AtTop;

  if (!DAG->isTrackingPressure())
    return;

  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const copy.
  RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);

  Pressure.clear();
  MaxPressure.clear();

  if (AtTop)
    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
  else {
    // FIXME: I think for bottom-up scheduling, the register pressure is cached
    // and can be retrieved by DAG->getPressureDiff(SU).
    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
  }

  unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];

  // If two instructions increase the pressure of different register sets
  // by the same amount, the generic scheduler will prefer to schedule the
  // instruction that increases the set with the least amount of registers,
  // which in our case would be SGPRs. This is rarely what we want, so
  // when we report excess/critical register pressure, we do it either
  // only for VGPRs or only for SGPRs.

  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
  const unsigned MaxVGPRPressureInc = 16;
  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;

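  // Illustrative example (the numbers are hypothetical, not taken from any
  // particular target): with VGPRExcessLimit = 256 and VGPRPressure = 244,
  // VGPRPressure + MaxVGPRPressureInc reaches the limit, so only VGPR excess
  // is reported for this candidate and SGPR excess is ignored even if SGPR
  // pressure is also near its limit.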
  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
  // to increase the likelihood we don't go over the limits. We should improve
  // the analysis to look through dependencies to find the path with the least
  // register pressure.

  // We only need to update the RPDelta for instructions that increase register
  // pressure. Instructions that decrease or keep reg pressure the same will be
  // marked as RegExcess in tryCandidate() when they are compared with
  // instructions that increase the register pressure.
  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
  }

  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
  }

  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
  // have the same cost, so we don't need to prefer one over the other.

  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;

  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
    HasHighPressure = true;
    if (SGPRDelta > VGPRDelta) {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
    } else {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue().
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                         const CandPolicy &ZonePolicy,
                                         const RegPressureTracker &RPTracker,
                                         SchedCandidate &Cand) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = 0;
  unsigned VGPRPressure = 0;
  if (DAG->isTrackingPressure()) {
    SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
    VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
  }
  ReadyQueue &Q = Zone.Available;
  for (SUnit *SU : Q) {

    SchedCandidate TryCand(ZonePolicy);
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                  VGPRPressure);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    tryCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
      Cand.setBest(TryCand);
      LLVM_DEBUG(traceCandidate(Cand));
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional().
SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
  // Schedule as far as possible in the direction of no choice. This is most
  // efficient, but also provides the best heuristics for CriticalPSets.
  if (SUnit *SU = Bot.pickOnlyChoice()) {
    IsTopNode = false;
    return SU;
  }
  if (SUnit *SU = Top.pickOnlyChoice()) {
    IsTopNode = true;
    return SU;
  }
  // Set the bottom-up policy based on the state of the current bottom zone and
  // the instructions outside the zone, including the top zone.
  CandPolicy BotPolicy;
  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
  // Set the top-down policy based on the state of the current top zone and
  // the instructions outside the zone, including the bottom zone.
  CandPolicy TopPolicy;
  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);

  // See if BotCand is still valid (because we previously scheduled from Top).
  LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
  if (!BotCand.isValid() || BotCand.SU->isScheduled ||
      BotCand.Policy != BotPolicy) {
    BotCand.reset(CandPolicy());
    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(BotCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
      assert(TCand.SU == BotCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Check if the top Q has a better candidate.
  LLVM_DEBUG(dbgs() << "Picking from Top:\n");
  if (!TopCand.isValid() || TopCand.SU->isScheduled ||
      TopCand.Policy != TopPolicy) {
    TopCand.reset(CandPolicy());
    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(TopCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
      assert(TCand.SU == TopCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Pick best from BotCand and TopCand.
  LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
             dbgs() << "Bot Cand: "; traceCandidate(BotCand););
  SchedCandidate Cand = BotCand;
  TopCand.Reason = NoCand;
  tryCandidate(Cand, TopCand, nullptr);
  if (TopCand.Reason != NoCand) {
    Cand.setBest(TopCand);
  }
  LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););

  IsTopNode = Cand.AtTop;
  return Cand.SU;
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNode().
SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  SUnit *SU;
  do {
    if (RegionPolicy.OnlyTopDown) {
      SU = Top.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        TopCand.reset(NoPolicy);
        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
        SU = TopCand.SU;
      }
      IsTopNode = true;
    } else if (RegionPolicy.OnlyBottomUp) {
      SU = Bot.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        BotCand.reset(NoPolicy);
        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
        SU = BotCand.SU;
      }
      IsTopNode = false;
    } else {
      SU = pickNodeBidirectional(IsTopNode);
    }
  } while (SU->isScheduled);

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  return SU;
}

GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
  assert(CurrentStage && CurrentStage != SchedStages.end());
  return *CurrentStage;
}

bool GCNSchedStrategy::advanceStage() {
  assert(CurrentStage != SchedStages.end());
  if (!CurrentStage)
    CurrentStage = SchedStages.begin();
  else
    CurrentStage++;

  return CurrentStage != SchedStages.end();
}

bool GCNSchedStrategy::hasNextStage() const {
  assert(CurrentStage);
  return std::next(CurrentStage) != SchedStages.end();
}

GCNSchedStageID GCNSchedStrategy::getNextStage() const {
  assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
  return *std::next(CurrentStage);
}

GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
    const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
  SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
  SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
}

GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
}

bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
                                          SchedCandidate &TryCand,
                                          SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

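  // The checks below are tried in priority order: register excess, physical
  // register biasing, stall cycles, critical resource balance, demanded
  // resources, latency, weak edges, clustering, critical and current max
  // pressure, and finally original source order. Some of them apply only when
  // both candidates come from the same scheduling boundary.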
  // Avoid spilling by exceeding the register limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                  RegExcess, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Bias PhysReg Defs and copies to their uses and definitions, respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;

  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // Prioritize instructions that read unbuffered resources by stall cycles.
    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
      return TryCand.Reason != NoCand;

    // Avoid critical resource consumption and balance the schedule.
    TryCand.initResourceDelta(DAG, SchedModel);
    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
                TryCand, Cand, ResourceReduce))
      return TryCand.Reason != NoCand;
    if (tryGreater(TryCand.ResDelta.DemandedResources,
                   Cand.ResDelta.DemandedResources, TryCand, Cand,
                   ResourceDemand))
      return TryCand.Reason != NoCand;

    // Unconditionally try to reduce latency.
    if (tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;

    // Weak edges are for clustering and other constraints.
    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
      return TryCand.Reason != NoCand;
  }

  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  const SUnit *CandNextClusterSU =
      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  const SUnit *TryCandNextClusterSU =
      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max critical pressure in the scheduled region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
                  TryCand, Cand, RegCritical, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Fall through to original instruction order.
    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
      TryCand.Reason = NodeOrder;
      return true;
    }
  }
  return false;
}

GCNScheduleDAGMILive::GCNScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
    : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {

  LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
  if (RelaxedOcc) {
    MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
    if (MinOccupancy != StartingOccupancy)
      LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
                        << ".\n");
  }
}

std::unique_ptr<GCNSchedStage>
GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
  switch (SchedStageID) {
  case GCNSchedStageID::OccInitialSchedule:
    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
  case GCNSchedStageID::PreRARematerialize:
    return std::make_unique<PreRARematStage>(SchedStageID, *this);
  case GCNSchedStageID::ILPInitialSchedule:
    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
  }

  llvm_unreachable("Unknown SchedStageID.");
}

void GCNScheduleDAGMILive::schedule() {
  // Collect all scheduling regions. The actual scheduling is performed in
  // GCNScheduleDAGMILive::finalizeSchedule.
  Regions.push_back(std::pair(RegionBegin, RegionEnd));
}

GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
  GCNDownwardRPTracker RPTracker(*LIS);
  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
  return RPTracker.moveMaxPressure();
}

void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
                                                const MachineBasicBlock *MBB) {
  GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has only one successor then the live-ins of that successor
  // are the live-outs of the current block. We can reuse the calculated live
  // set if the successor will be sent to scheduling after the current block.

  // However, due to a bug in LiveInterval analysis it may happen that two
  // predecessors of the same successor block have different lane bitmasks for
  // a live-out register. Work around that by sticking to a one-to-one
  // relationship, i.e. one predecessor with one successor block.
  const MachineBasicBlock *OnlySucc = nullptr;
  if (MBB->succ_size() == 1) {
    auto *Candidate = *MBB->succ_begin();
    if (!Candidate->empty() && Candidate->pred_size() == 1) {
      SlotIndexes *Ind = LIS->getSlotIndexes();
      if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
        OnlySucc = Candidate;
    }
  }

  // The scheduler sends regions from the end of the block upwards.
  size_t CurRegion = RegionIdx;
  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
    if (Regions[CurRegion].first->getParent() != MBB)
      break;
  --CurRegion;

  auto I = MBB->begin();
  auto LiveInIt = MBBLiveIns.find(MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
  if (LiveInIt != MBBLiveIns.end()) {
    auto LiveIn = std::move(LiveInIt->second);
    RPTracker.reset(*MBB->begin(), &LiveIn);
    MBBLiveIns.erase(LiveInIt);
  } else {
    I = Rgn.first;
    auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(*I, &LRS);
  }

  for (;;) {
    I = RPTracker.getNext();

    if (Regions[CurRegion].first == I || NonDbgMI == I) {
      LiveIns[CurRegion] = RPTracker.getLiveRegs();
      RPTracker.clearMaxPressure();
    }

    if (Regions[CurRegion].second == I) {
      Pressure[CurRegion] = RPTracker.moveMaxPressure();
      if (CurRegion-- == RegionIdx)
        break;
    }
    RPTracker.advanceToNext();
    RPTracker.advanceBeforeNext();
  }

  if (OnlySucc) {
    if (I != MBB->end()) {
      RPTracker.advanceToNext();
      RPTracker.advance(MBB->end());
    }
    RPTracker.advanceBeforeNext();
    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
  }
}

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getBBLiveInMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> BBStarters;
  BBStarters.reserve(Regions.size());
  auto I = Regions.rbegin(), E = Regions.rend();
  auto *BB = I->first->getParent();
  do {
    auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
    BBStarters.push_back(MI);
    do {
      ++I;
    } while (I != E && I->first->getParent() == BB);
  } while (I != E);
  return getLiveRegMap(BBStarters, false /*After*/, *LIS);
}

void GCNScheduleDAGMILive::finalizeSchedule() {
  // Start actual scheduling here. This function is called by the base
  // MachineScheduler after all regions have been recorded by
  // GCNScheduleDAGMILive::schedule().
  LiveIns.resize(Regions.size());
  Pressure.resize(Regions.size());
  RescheduleRegions.resize(Regions.size());
  RegionsWithHighRP.resize(Regions.size());
  RegionsWithExcessRP.resize(Regions.size());
  RegionsWithMinOcc.resize(Regions.size());
  RegionsWithIGLPInstrs.resize(Regions.size());
  RescheduleRegions.set();
  RegionsWithHighRP.reset();
  RegionsWithExcessRP.reset();
  RegionsWithMinOcc.reset();
  RegionsWithIGLPInstrs.reset();

  runSchedStages();
}

void GCNScheduleDAGMILive::runSchedStages() {
  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

  if (!Regions.empty())
    BBLiveInMap = getBBLiveInMap();

  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
  while (S.advanceStage()) {
    auto Stage = createSchedStage(S.getCurrentStage());
    if (!Stage->initGCNSchedStage())
      continue;

    for (auto Region : Regions) {
      RegionBegin = Region.first;
      RegionEnd = Region.second;
      // Set up for scheduling the region and check whether it should be
      // skipped.
      if (!Stage->initGCNRegion()) {
        Stage->advanceRegion();
        exitRegion();
        continue;
      }

      ScheduleDAGMILive::schedule();
      Stage->finalizeGCNRegion();
    }

    Stage->finalizeGCNSchedStage();
  }
}

#ifndef NDEBUG
raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
  switch (StageID) {
  case GCNSchedStageID::OccInitialSchedule:
    OS << "Max Occupancy Initial Schedule";
    break;
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    OS << "Unclustered High Register Pressure Reschedule";
    break;
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    OS << "Clustered Low Occupancy Reschedule";
    break;
  case GCNSchedStageID::PreRARematerialize:
    OS << "Pre-RA Rematerialize";
    break;
  case GCNSchedStageID::ILPInitialSchedule:
    OS << "Max ILP Initial Schedule";
    break;
  }

  return OS;
}
#endif

GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
    : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
      MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}

bool GCNSchedStage::initGCNSchedStage() {
  if (!DAG.LIS)
    return false;

  LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
  return true;
}

bool UnclusteredHighRPStage::initGCNSchedStage() {
  if (DisableUnclusterHighRP)
    return false;

  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
    return false;

  SavedMutations.swap(DAG.Mutations);
  DAG.addMutation(createIGroupLPDAGMutation());

  InitialOccupancy = DAG.MinOccupancy;
  // Aggressively try to reduce register pressure in the unclustered high RP
  // stage. Temporarily increase the occupancy target in the region.
  S.SGPRLimitBias = S.HighRPSGPRBias;
  S.VGPRLimitBias = S.HighRPVGPRBias;
  if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
    MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  LLVM_DEBUG(
      dbgs()
      << "Retrying function scheduling without clustering. "
         "Aggressively try to reduce register pressure to achieve occupancy "
      << DAG.MinOccupancy << ".\n");

  return true;
}

bool ClusteredLowOccStage::initGCNSchedStage() {
  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  // Don't bother trying to improve ILP in lower RP regions if occupancy has
  // not been dropped. All regions will have already been scheduled with the
  // ideal occupancy targets.
  if (DAG.StartingOccupancy <= DAG.MinOccupancy)
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with lowest recorded occupancy "
             << DAG.MinOccupancy << ".\n");
  return true;
}

bool PreRARematStage::initGCNSchedStage() {
  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
    return false;

  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
  // Check maximum occupancy.
  if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
      DAG.MinOccupancy)
    return false;

  // FIXME: This pass will invalidate cached MBBLiveIns for regions in between
  // the defs and the region we sank the def to. Cached pressure for regions
  // where a def is sunk from will also be invalidated. This will need to be
  // fixed if there is another pass after this pass.
  assert(!S.hasNextStage());

  collectRematerializableInstructions();
  if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with improved occupancy of "
             << DAG.MinOccupancy << " from rematerializing\n");
  return true;
}

void GCNSchedStage::finalizeGCNSchedStage() {
  DAG.finishBlock();
  LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}

void UnclusteredHighRPStage::finalizeGCNSchedStage() {
  SavedMutations.swap(DAG.Mutations);
  S.SGPRLimitBias = S.VGPRLimitBias = 0;
  if (DAG.MinOccupancy > InitialOccupancy) {
    for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
      DAG.RegionsWithMinOcc[IDX] =
          DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;

    LLVM_DEBUG(dbgs() << StageID
                      << " stage successfully increased occupancy to "
                      << DAG.MinOccupancy << '\n');
  }

  GCNSchedStage::finalizeGCNSchedStage();
}

bool GCNSchedStage::initGCNRegion() {
  // Check whether this new region is also a new block.
  if (DAG.RegionBegin->getParent() != CurrentMBB)
    setupNewBlock();

  unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
  DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);

  // Skip empty scheduling regions (0 or 1 schedulable instructions).
  if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
    return false;

  LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
  LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
                    << " " << CurrentMBB->getName()
                    << "\n From: " << *DAG.begin() << " To: ";
             if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
             else dbgs() << "End";
             dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

  // Save the original instruction order before scheduling for a possible
  // revert.
  Unsched.clear();
  Unsched.reserve(DAG.NumRegionInstrs);
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule) {
    for (auto &I : DAG) {
      Unsched.push_back(&I);
      if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
          I.getOpcode() == AMDGPU::IGLP_OPT)
        DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
    }
  } else {
    for (auto &I : DAG)
      Unsched.push_back(&I);
  }

  PressureBefore = DAG.Pressure[RegionIdx];

  LLVM_DEBUG(
      dbgs() << "Pressure before scheduling:\nRegion live-ins:"
             << print(DAG.LiveIns[RegionIdx], DAG.MRI)
             << "Region live-in pressure: "
             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
             << "Region register pressure: " << print(PressureBefore));

  S.HasHighPressure = false;
  S.KnownExcessRP = isRegionWithExcessRP();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
    SavedMutations.clear();
    SavedMutations.swap(DAG.Mutations);
    DAG.addMutation(createIGroupLPDAGMutation());
  }

  return true;
}

bool UnclusteredHighRPStage::initGCNRegion() {
  // Only reschedule regions with the minimum occupancy or regions that may
  // have spilling (excess register pressure).
  if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
       DAG.MinOccupancy <= InitialOccupancy) &&
      !DAG.RegionsWithExcessRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool ClusteredLowOccStage::initGCNRegion() {
  // We may need to reschedule this region if it wasn't rescheduled in the last
  // stage, or if we found it was testing critical register pressure limits in
  // the unclustered reschedule stage. The latter is because we may not have
  // been able to raise the min occupancy in the previous stage, so the region
  // may be overly constrained even if it was already rescheduled.
  if (!DAG.RegionsWithHighRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool PreRARematStage::initGCNRegion() {
  if (!DAG.RescheduleRegions[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

void GCNSchedStage::setupNewBlock() {
  if (CurrentMBB)
    DAG.finishBlock();

  CurrentMBB = DAG.RegionBegin->getParent();
  DAG.startBlock(CurrentMBB);
  // Get the real RP for the region if it hasn't been calculated before. After
  // the initial schedule stage, real RP will be collected after scheduling.
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule)
    DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}

void GCNSchedStage::finalizeGCNRegion() {
  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
  DAG.RescheduleRegions[RegionIdx] = false;
  if (S.HasHighPressure)
    DAG.RegionsWithHighRP[RegionIdx] = true;

  // Revert scheduling if we have dropped occupancy or there is some other
  // reason that the original schedule is better.
  checkScheduling();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
    SavedMutations.swap(DAG.Mutations);

  DAG.exitRegion();
  RegionIdx++;
}

void GCNSchedStage::checkScheduling() {
  // Check the results of scheduling.
  PressureAfter = DAG.getRealRegPressure(RegionIdx);
  LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");

  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;

    // Early out if we have achieved the occupancy target.
    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
    return;
  }

  unsigned TargetOccupancy =
      std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
  unsigned WavesAfter =
      std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
  unsigned WavesBefore =
      std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");

  // We may not be able to keep the current target occupancy because of the
  // just scheduled region. We might still be able to revert scheduling if the
  // occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits, which could lead to more spilling.
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

  // Allow memory bound functions to drop to 4 waves if not limited by an
  // attribute.
  if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
      WavesAfter >= MFI.getMinAllowedOccupancy()) {
    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
                      << MFI.getMinAllowedOccupancy() << " waves\n");
    NewOccupancy = WavesAfter;
  }

  if (NewOccupancy < DAG.MinOccupancy) {
    DAG.MinOccupancy = NewOccupancy;
    MFI.limitOccupancy(DAG.MinOccupancy);
    DAG.RegionsWithMinOcc.reset();
    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                      << DAG.MinOccupancy << ".\n");
  }

  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
      PressureAfter.getAGPRNum() > MaxVGPRs ||
      PressureAfter.getSGPRNum() > MaxSGPRs) {
    DAG.RescheduleRegions[RegionIdx] = true;
    DAG.RegionsWithHighRP[RegionIdx] = true;
    DAG.RegionsWithExcessRP[RegionIdx] = true;
  }

  // Revert if this region's schedule would cause a drop in occupancy or
  // spilling.
  if (shouldRevertScheduling(WavesAfter)) {
    revertScheduling();
  } else {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
  }
}

unsigned
GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
                                      DenseMap<unsigned, unsigned> &ReadyCycles,
                                      const TargetSchedModel &SM) {
  unsigned ReadyCycle = CurrCycle;
  for (auto &D : SU.Preds) {
    if (D.isAssignedRegDep()) {
      MachineInstr *DefMI = D.getSUnit()->getInstr();
      unsigned Latency = SM.computeInstrLatency(DefMI);
      unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
      ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
    }
  }
  ReadyCycles[SU.NodeNum] = ReadyCycle;
  return ReadyCycle;
}

#ifndef NDEBUG
struct EarlierIssuingCycle {
  bool operator()(std::pair<MachineInstr *, unsigned> A,
                  std::pair<MachineInstr *, unsigned> B) const {
    return A.second < B.second;
  }
};

static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
                                        EarlierIssuingCycle> &ReadyCycles) {
  if (ReadyCycles.empty())
    return;
  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
         << " ##################\n# Cycle #\t\t\tInstruction          "
            "             "
            "                            \n";
  unsigned IPrev = 1;
  for (auto &I : ReadyCycles) {
    if (I.second > IPrev + 1)
      dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
             << " CYCLES DETECTED ******************************\n\n";
    dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
    IPrev = I.second;
  }
}
#endif

ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  for (auto &SU : InputSchedule) {
    unsigned ReadyCycle =
        computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
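    // The gap between when this SU's operands become ready (ReadyCycle) and
    // the current issue cycle (CurrCycle) is idle time, counted as "bubbles".
    // For example (illustrative numbers only): CurrCycle = 5 and
    // ReadyCycle = 9 would add 4 bubble cycles to SumBubbles.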
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
#endif
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}

ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  for (auto &MI : DAG) {
    SUnit *SU = DAG.getSUnit(&MI);
    if (!SU)
      continue;
    unsigned ReadyCycle =
        computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
#endif
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}

bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (WavesAfter < DAG.MinOccupancy)
    return true;

  return false;
}

bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
  // If RP is not reduced in the unclustered reschedule stage, revert to the
  // old schedule.
  if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
       mayCauseSpilling(WavesAfter)) ||
      GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
    return true;
  }

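  // The metric comparison below keeps the new schedule only if, roughly,
  //   (WavesAfter / WavesBefore) * (OldMetric + ScheduleMetricBias) / NewMetric >= 1,
  // i.e. any increase in the latency metric must be paid for by an occupancy
  // gain. All terms are scaled by ScheduleMetrics::ScaleFactor to keep the
  // computation in integers.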
  // Do not attempt to relax the schedule even more if we are already spilling.
  if (isRegionWithExcessRP())
    return false;

  LLVM_DEBUG(
      dbgs()
      << "\n\t *** In shouldRevertScheduling ***\n"
      << " *********** BEFORE UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
  LLVM_DEBUG(
      dbgs()
      << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MAfter = getScheduleMetrics(DAG);
  unsigned OldMetric = MBefore.getMetric();
  unsigned NewMetric = MAfter.getMetric();
  unsigned WavesBefore =
      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
  unsigned Profit =
      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
       NewMetric) /
      ScheduleMetrics::ScaleFactor;
  LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
                    << MAfter << "Profit: " << Profit << "\n");
  return Profit < ScheduleMetrics::ScaleFactor;
}

bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
  if (WavesAfter <= MFI.getMinWavesPerEU() &&
      !PressureAfter.less(ST, PressureBefore) && isRegionWithExcessRP()) {
    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
    return true;
  }

  return false;
}

void GCNSchedStage::revertScheduling() {
  DAG.RegionsWithMinOcc[RegionIdx] =
      PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
  DAG.RescheduleRegions[RegionIdx] =
      S.hasNextStage() &&
      S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
  DAG.RegionEnd = DAG.RegionBegin;
  int SkippedDebugInstr = 0;
  for (MachineInstr *MI : Unsched) {
    if (MI->isDebugInstr()) {
      ++SkippedDebugInstr;
      continue;
    }

    if (MI->getIterator() != DAG.RegionEnd) {
      DAG.BB->remove(MI);
      DAG.BB->insert(DAG.RegionEnd, MI);
      if (!MI->isDebugInstr())
        DAG.LIS->handleMove(*MI, true);
    }

    // Reset read-undef flags and update them later.
    for (auto &Op : MI->all_defs())
      Op.setIsUndef(false);
    RegisterOperands RegOpers;
    RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
    if (!MI->isDebugInstr()) {
      if (DAG.ShouldTrackLaneMasks) {
        // Adjust liveness and add missing dead+read-undef flags.
        SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
        RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
      } else {
        // Adjust for missing dead-def flags.
        RegOpers.detectDeadDefs(*MI, *DAG.LIS);
      }
    }
    DAG.RegionEnd = MI->getIterator();
    ++DAG.RegionEnd;
    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
  }

  // After reverting the schedule, debug instrs will now be at the end of the
  // block and RegionEnd will point to the first debug instr. Increment
  // RegionEnd past the debug instrs to the actual end of the scheduling
  // region.
  while (SkippedDebugInstr-- > 0)
    ++DAG.RegionEnd;

  // If the Unsched.front() instruction is a debug instruction, this will
  // actually shrink the region since we moved all debug instructions to the
  // end of the block. Find the first instruction that is not a debug
  // instruction.
  DAG.RegionBegin = Unsched.front()->getIterator();
  if (DAG.RegionBegin->isDebugInstr()) {
    for (MachineInstr *MI : Unsched) {
      if (MI->isDebugInstr())
        continue;
      DAG.RegionBegin = MI->getIterator();
      break;
    }
  }

  // Then move the debug instructions back into their correct place and set
  // RegionBegin and RegionEnd if needed.
  DAG.placeDebugValues();

  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}

void PreRARematStage::collectRematerializableInstructions() {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
    Register Reg = Register::index2VirtReg(I);
    if (!DAG.LIS->hasInterval(Reg))
      continue;

    // TODO: Handle AGPR and SGPR rematerialization.
    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
      continue;

    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
    MachineInstr *Def = Op->getParent();
    if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
      continue;

    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
    if (Def->getParent() == UseI->getParent())
      continue;

    // We are only collecting defs that are defined in another block and are
    // live-through or used inside regions at MinOccupancy. This means that the
    // register must be in the live-in set for the region.
    bool AddedToRematList = false;
    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
      auto It = DAG.LiveIns[I].find(Reg);
      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
        if (DAG.RegionsWithMinOcc[I]) {
          RematerializableInsts[I][Def] = UseI;
          AddedToRematList = true;
        }

        // Collect regions with the rematerializable reg as a live-in to avoid
        // searching later when updating RP.
        RematDefToLiveInRegions[Def].push_back(I);
      }
    }
    if (!AddedToRematList)
      RematDefToLiveInRegions.erase(Def);
  }
}

bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
                                              const TargetInstrInfo *TII) {
  // Temporary copies of cached variables we will be modifying and replacing if
  // sinking succeeds.
  SmallVector<
      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
      NewRegions;
  DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
  DenseMap<unsigned, GCNRegPressure> NewPressure;
  BitVector NewRescheduleRegions;
  LiveIntervals *LIS = DAG.LIS;

  NewRegions.resize(DAG.Regions.size());
  NewRescheduleRegions.resize(DAG.Regions.size());

  // Collect only regions that have a rematerializable def as a live-in.
  SmallSet<unsigned, 16> ImpactedRegions;
  for (const auto &It : RematDefToLiveInRegions)
    ImpactedRegions.insert(It.second.begin(), It.second.end());

  // Make copies of the register pressure and live-ins caches that will be
  // updated as we rematerialize.
  for (auto Idx : ImpactedRegions) {
    NewPressure[Idx] = DAG.Pressure[Idx];
    NewLiveIns[Idx] = DAG.LiveIns[Idx];
  }
  NewRegions = DAG.Regions;
  NewRescheduleRegions.reset();

  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
  bool Improved = false;
  for (auto I : ImpactedRegions) {
    if (!DAG.RegionsWithMinOcc[I])
      continue;

    Improved = false;
    int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
    int SGPRUsage = NewPressure[I].getSGPRNum();

    // TODO: Handle occupancy drop due to AGPR and SGPR.
    // Check if the cause of the occupancy drop is VGPR usage and not SGPR.
    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
      break;

    // The occupancy of this region could have been improved by a previous
    // iteration's sinking of defs.
    if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
      NewRescheduleRegions[I] = true;
      Improved = true;
      continue;
    }

    // First check if we have enough trivially rematerializable instructions to
    // improve occupancy. Optimistically assume all instructions we are able to
    // sink decrease RP.
    int TotalSinkableRegs = 0;
    for (const auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      Register DefReg = Def->getOperand(0).getReg();
      TotalSinkableRegs +=
          SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
    }
    int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
    unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
    // If in the most optimistic scenario we cannot improve occupancy, then do
    // not attempt to sink any instructions.
    if (OptimisticOccupancy <= DAG.MinOccupancy)
      break;

    unsigned ImproveOccupancy = 0;
    SmallVector<MachineInstr *, 4> SinkedDefs;
    for (auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      MachineBasicBlock::iterator InsertPos =
          MachineBasicBlock::iterator(It.second);
      Register Reg = Def->getOperand(0).getReg();
      // Rematerialize MI to its use block. Since we are only rematerializing
      // instructions that do not have any virtual reg uses, we do not need to
      // call LiveRangeEdit::allUsesAvailableAt() and
      // LiveRangeEdit::canRematerializeAt().
      TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
      MachineInstr *NewMI = &*std::prev(InsertPos);
      LIS->InsertMachineInstrInMaps(*NewMI);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
      InsertedMIToOldDef[NewMI] = Def;

      // Update region boundaries in the scheduling region we sank from, since
      // we may have sunk an instruction that was at the beginning or end of
      // its region.
      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
                                 /*Removing =*/true);

      // Update region boundaries in the region we sank to.
      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);

      LaneBitmask PrevMask = NewLiveIns[I][Reg];
      // FIXME: Also update cached pressure for where the def was sunk from.
      // Update RP for all regions that have this reg as a live-in and remove
      // the reg from all regions as a live-in.
      for (auto Idx : RematDefToLiveInRegions[Def]) {
        NewLiveIns[Idx].erase(Reg);
        if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
          // Def is live-through and not used in this block.
          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
        } else {
          // Def is used and rematerialized into this block.
          GCNDownwardRPTracker RPT(*LIS);
          auto *NonDbgMI = &*skipDebugInstructionsForward(
              NewRegions[Idx].first, NewRegions[Idx].second);
          RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
          RPT.advance(NewRegions[Idx].second);
          NewPressure[Idx] = RPT.moveMaxPressure();
        }
      }

      SinkedDefs.push_back(Def);
      ImproveOccupancy = NewPressure[I].getOccupancy(ST);
      if (ImproveOccupancy > DAG.MinOccupancy)
        break;
    }

    // Remove defs we just sank from all regions' lists of sinkable defs.
    for (auto &Def : SinkedDefs)
      for (auto TrackedIdx : RematDefToLiveInRegions[Def])
        RematerializableInsts[TrackedIdx].erase(Def);

    if (ImproveOccupancy <= DAG.MinOccupancy)
      break;

    NewRescheduleRegions[I] = true;
    Improved = true;
  }

  if (!Improved) {
    // Occupancy was not improved for all regions that were at MinOccupancy.
    // Undo sinking and remove newly rematerialized instructions.
    for (auto &Entry : InsertedMIToOldDef) {
      MachineInstr *MI = Entry.first;
      MachineInstr *OldMI = Entry.second;
      Register Reg = MI->getOperand(0).getReg();
      LIS->RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
      OldMI->clearRegisterDeads(Reg);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
    return false;
  }

  // Occupancy was improved for all regions.
  for (auto &Entry : InsertedMIToOldDef) {
    MachineInstr *MI = Entry.first;
    MachineInstr *OldMI = Entry.second;

    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
    DAG.BBLiveInMap.erase(OldMI);

    // Remove OldMI and update LIS.
    Register Reg = MI->getOperand(0).getReg();
    LIS->RemoveMachineInstrFromMaps(*OldMI);
    OldMI->eraseFromParent();
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);
  }

  // Update live-ins, register pressure, and regions caches.
  for (auto Idx : ImpactedRegions) {
    DAG.LiveIns[Idx] = NewLiveIns[Idx];
    DAG.Pressure[Idx] = NewPressure[Idx];
    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
  }
  DAG.Regions = NewRegions;
  DAG.RescheduleRegions = NewRescheduleRegions;

  SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  return true;
}

// Copied from MachineLICM.
bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
  if (!DAG.TII->isTriviallyReMaterializable(MI))
    return false;

  for (const MachineOperand &MO : MI.all_uses())
    if (MO.getReg().isVirtual())
      return false;

  return true;
}

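// updateRegionBoundaries (below) keeps the cached [RegionBegin, RegionEnd)
// iterators in sync when an instruction is removed from, or inserted into, a
// block. For example (illustrative): removing the only instruction of a region
// collapses it to an empty [end(), end()) pair, while inserting NewMI directly
// in front of a region's first instruction makes NewMI the new region begin.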
// When removing an instruction, we will have to check both the beginning and
// the end of the region. When inserting, we only have to check whether we are
// inserting NewMI in front of a scheduling region; we do not need to check the
// end since we will only ever be inserting before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
    SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
                              MachineBasicBlock::iterator>> &RegionBoundaries,
    MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
  unsigned I = 0, E = RegionBoundaries.size();
  // Search for the first region of the block where MI is located.
  while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
    ++I;

  for (; I != E; ++I) {
    if (MI->getParent() != RegionBoundaries[I].first->getParent())
      return;

    if (Removing && MI == RegionBoundaries[I].first &&
        MI == RegionBoundaries[I].second) {
      // MI is in a region of size 1. After removing, the region will have
      // size 0, so set RegionBegin and RegionEnd to the past-the-end-of-block
      // iterator.
      RegionBoundaries[I] =
          std::pair(MI->getParent()->end(), MI->getParent()->end());
      return;
    }
    if (MI == RegionBoundaries[I].first) {
      if (Removing)
        RegionBoundaries[I] =
            std::pair(std::next(MI), RegionBoundaries[I].second);
      else
        // Inserted NewMI in front of region, set new RegionBegin to NewMI.
        RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
                                        RegionBoundaries[I].second);
      return;
    }
    if (Removing && MI == RegionBoundaries[I].second) {
      RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
      return;
    }
  }
}

static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
  return std::any_of(
      DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) {
        unsigned Opc = MI->getOpcode();
        return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
      });
}

GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
    bool RemoveKillFlags)
    : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}

void GCNPostScheduleDAGMILive::schedule() {
  HasIGLPInstrs = hasIGLPInstrs(this);
  if (HasIGLPInstrs) {
    SavedMutations.clear();
    SavedMutations.swap(Mutations);
    addMutation(createIGroupLPDAGMutation());
  }

  ScheduleDAGMI::schedule();
}

void GCNPostScheduleDAGMILive::finalizeSchedule() {
  if (HasIGLPInstrs)
    SavedMutations.swap(Mutations);

  ScheduleDAGMI::finalizeSchedule();
}