1 //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// This contains a MachineSchedStrategy implementation for maximizing wave 11 /// occupancy on GCN hardware. 12 /// 13 /// This pass will apply multiple scheduling stages to the same function. 14 /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual 15 /// entry point for the scheduling of those regions is 16 /// GCNScheduleDAGMILive::runSchedStages. 17 18 /// Generally, the reason for having multiple scheduling stages is to account 19 /// for the kernel-wide effect of register usage on occupancy. Usually, only a 20 /// few scheduling regions will have register pressure high enough to limit 21 /// occupancy for the kernel, so constraints can be relaxed to improve ILP in 22 /// other regions. 23 /// 24 //===----------------------------------------------------------------------===// 25 26 #include "GCNSchedStrategy.h" 27 #include "AMDGPUIGroupLP.h" 28 #include "SIMachineFunctionInfo.h" 29 #include "llvm/CodeGen/RegisterClassInfo.h" 30 31 #define DEBUG_TYPE "machine-scheduler" 32 33 using namespace llvm; 34 35 static cl::opt<bool> 36 DisableUnclusterHighRP("amdgpu-disable-unclustred-high-rp-reschedule", 37 cl::Hidden, 38 cl::desc("Disable unclustred high register pressure " 39 "reduction scheduling stage."), 40 cl::init(false)); 41 static cl::opt<unsigned> ScheduleMetricBias( 42 "amdgpu-schedule-metric-bias", cl::Hidden, 43 cl::desc( 44 "Sets the bias which adds weight to occupancy vs latency. Set it to " 45 "100 to chase the occupancy only."), 46 cl::init(10)); 47 48 const unsigned ScheduleMetrics::ScaleFactor = 100; 49 50 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) 51 : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), 52 HasHighPressure(false) {} 53 54 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { 55 GenericScheduler::initialize(DAG); 56 57 MF = &DAG->MF; 58 59 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 60 61 SGPRExcessLimit = 62 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass); 63 VGPRExcessLimit = 64 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass); 65 66 SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); 67 // Set the initial TargetOccupnacy to the maximum occupancy that we can 68 // achieve for this function. This effectively sets a lower bound on the 69 // 'Critical' register limits in the scheduler. 70 TargetOccupancy = MFI.getOccupancy(); 71 SGPRCriticalLimit = 72 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit); 73 74 if (!KnownExcessRP) { 75 VGPRCriticalLimit = 76 std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit); 77 } else { 78 // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except 79 // returns a reasonably small number for targets with lots of VGPRs, such 80 // as GFX10 and GFX11. 81 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative " 82 "VGPRCriticalLimit calculation method.\n"); 83 84 unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST); 85 unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST); 86 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule); 87 VGPRBudget = std::max(VGPRBudget, Granule); 88 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit); 89 } 90 91 // Subtract error margin and bias from register limits and avoid overflow. 92 SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit); 93 VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit); 94 SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit); 95 VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit); 96 97 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit 98 << ", VGPRExcessLimit = " << VGPRExcessLimit 99 << ", SGPRCriticalLimit = " << SGPRCriticalLimit 100 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n"); 101 } 102 103 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, 104 bool AtTop, 105 const RegPressureTracker &RPTracker, 106 const SIRegisterInfo *SRI, 107 unsigned SGPRPressure, 108 unsigned VGPRPressure) { 109 Cand.SU = SU; 110 Cand.AtTop = AtTop; 111 112 if (!DAG->isTrackingPressure()) 113 return; 114 115 // getDownwardPressure() and getUpwardPressure() make temporary changes to 116 // the tracker, so we need to pass those function a non-const copy. 117 RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); 118 119 Pressure.clear(); 120 MaxPressure.clear(); 121 122 if (AtTop) 123 TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); 124 else { 125 // FIXME: I think for bottom up scheduling, the register pressure is cached 126 // and can be retrieved by DAG->getPressureDif(SU). 127 TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); 128 } 129 130 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; 131 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; 132 133 // If two instructions increase the pressure of different register sets 134 // by the same amount, the generic scheduler will prefer to schedule the 135 // instruction that increases the set with the least amount of registers, 136 // which in our case would be SGPRs. This is rarely what we want, so 137 // when we report excess/critical register pressure, we do it either 138 // only for VGPRs or only for SGPRs. 139 140 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. 141 const unsigned MaxVGPRPressureInc = 16; 142 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; 143 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; 144 145 146 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold 147 // to increase the likelihood we don't go over the limits. We should improve 148 // the analysis to look through dependencies to find the path with the least 149 // register pressure. 150 151 // We only need to update the RPDelta for instructions that increase register 152 // pressure. Instructions that decrease or keep reg pressure the same will be 153 // marked as RegExcess in tryCandidate() when they are compared with 154 // instructions that increase the register pressure. 155 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { 156 HasHighPressure = true; 157 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); 158 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); 159 } 160 161 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { 162 HasHighPressure = true; 163 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); 164 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); 165 } 166 167 // Register pressure is considered 'CRITICAL' if it is approaching a value 168 // that would reduce the wave occupancy for the execution unit. When 169 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both 170 // has the same cost, so we don't need to prefer one over the other. 171 172 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; 173 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; 174 175 if (SGPRDelta >= 0 || VGPRDelta >= 0) { 176 HasHighPressure = true; 177 if (SGPRDelta > VGPRDelta) { 178 Cand.RPDelta.CriticalMax = 179 PressureChange(AMDGPU::RegisterPressureSets::SReg_32); 180 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta); 181 } else { 182 Cand.RPDelta.CriticalMax = 183 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); 184 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta); 185 } 186 } 187 } 188 189 // This function is mostly cut and pasted from 190 // GenericScheduler::pickNodeFromQueue() 191 void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, 192 const CandPolicy &ZonePolicy, 193 const RegPressureTracker &RPTracker, 194 SchedCandidate &Cand) { 195 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); 196 ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); 197 unsigned SGPRPressure = 0; 198 unsigned VGPRPressure = 0; 199 if (DAG->isTrackingPressure()) { 200 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; 201 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; 202 } 203 ReadyQueue &Q = Zone.Available; 204 for (SUnit *SU : Q) { 205 206 SchedCandidate TryCand(ZonePolicy); 207 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, 208 SGPRPressure, VGPRPressure); 209 // Pass SchedBoundary only when comparing nodes from the same boundary. 210 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; 211 tryCandidate(Cand, TryCand, ZoneArg); 212 if (TryCand.Reason != NoCand) { 213 // Initialize resource delta if needed in case future heuristics query it. 214 if (TryCand.ResDelta == SchedResourceDelta()) 215 TryCand.initResourceDelta(Zone.DAG, SchedModel); 216 Cand.setBest(TryCand); 217 LLVM_DEBUG(traceCandidate(Cand)); 218 } 219 } 220 } 221 222 // This function is mostly cut and pasted from 223 // GenericScheduler::pickNodeBidirectional() 224 SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { 225 // Schedule as far as possible in the direction of no choice. This is most 226 // efficient, but also provides the best heuristics for CriticalPSets. 227 if (SUnit *SU = Bot.pickOnlyChoice()) { 228 IsTopNode = false; 229 return SU; 230 } 231 if (SUnit *SU = Top.pickOnlyChoice()) { 232 IsTopNode = true; 233 return SU; 234 } 235 // Set the bottom-up policy based on the state of the current bottom zone and 236 // the instructions outside the zone, including the top zone. 237 CandPolicy BotPolicy; 238 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); 239 // Set the top-down policy based on the state of the current top zone and 240 // the instructions outside the zone, including the bottom zone. 241 CandPolicy TopPolicy; 242 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); 243 244 // See if BotCand is still valid (because we previously scheduled from Top). 245 LLVM_DEBUG(dbgs() << "Picking from Bot:\n"); 246 if (!BotCand.isValid() || BotCand.SU->isScheduled || 247 BotCand.Policy != BotPolicy) { 248 BotCand.reset(CandPolicy()); 249 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); 250 assert(BotCand.Reason != NoCand && "failed to find the first candidate"); 251 } else { 252 LLVM_DEBUG(traceCandidate(BotCand)); 253 #ifndef NDEBUG 254 if (VerifyScheduling) { 255 SchedCandidate TCand; 256 TCand.reset(CandPolicy()); 257 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand); 258 assert(TCand.SU == BotCand.SU && 259 "Last pick result should correspond to re-picking right now"); 260 } 261 #endif 262 } 263 264 // Check if the top Q has a better candidate. 265 LLVM_DEBUG(dbgs() << "Picking from Top:\n"); 266 if (!TopCand.isValid() || TopCand.SU->isScheduled || 267 TopCand.Policy != TopPolicy) { 268 TopCand.reset(CandPolicy()); 269 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); 270 assert(TopCand.Reason != NoCand && "failed to find the first candidate"); 271 } else { 272 LLVM_DEBUG(traceCandidate(TopCand)); 273 #ifndef NDEBUG 274 if (VerifyScheduling) { 275 SchedCandidate TCand; 276 TCand.reset(CandPolicy()); 277 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand); 278 assert(TCand.SU == TopCand.SU && 279 "Last pick result should correspond to re-picking right now"); 280 } 281 #endif 282 } 283 284 // Pick best from BotCand and TopCand. 285 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); 286 dbgs() << "Bot Cand: "; traceCandidate(BotCand);); 287 SchedCandidate Cand = BotCand; 288 TopCand.Reason = NoCand; 289 tryCandidate(Cand, TopCand, nullptr); 290 if (TopCand.Reason != NoCand) { 291 Cand.setBest(TopCand); 292 } 293 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); 294 295 IsTopNode = Cand.AtTop; 296 return Cand.SU; 297 } 298 299 // This function is mostly cut and pasted from 300 // GenericScheduler::pickNode() 301 SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { 302 if (DAG->top() == DAG->bottom()) { 303 assert(Top.Available.empty() && Top.Pending.empty() && 304 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); 305 return nullptr; 306 } 307 SUnit *SU; 308 do { 309 if (RegionPolicy.OnlyTopDown) { 310 SU = Top.pickOnlyChoice(); 311 if (!SU) { 312 CandPolicy NoPolicy; 313 TopCand.reset(NoPolicy); 314 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); 315 assert(TopCand.Reason != NoCand && "failed to find a candidate"); 316 SU = TopCand.SU; 317 } 318 IsTopNode = true; 319 } else if (RegionPolicy.OnlyBottomUp) { 320 SU = Bot.pickOnlyChoice(); 321 if (!SU) { 322 CandPolicy NoPolicy; 323 BotCand.reset(NoPolicy); 324 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); 325 assert(BotCand.Reason != NoCand && "failed to find a candidate"); 326 SU = BotCand.SU; 327 } 328 IsTopNode = false; 329 } else { 330 SU = pickNodeBidirectional(IsTopNode); 331 } 332 } while (SU->isScheduled); 333 334 if (SU->isTopReady()) 335 Top.removeReady(SU); 336 if (SU->isBottomReady()) 337 Bot.removeReady(SU); 338 339 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " 340 << *SU->getInstr()); 341 return SU; 342 } 343 344 GCNSchedStageID GCNSchedStrategy::getCurrentStage() { 345 assert(CurrentStage && CurrentStage != SchedStages.end()); 346 return *CurrentStage; 347 } 348 349 bool GCNSchedStrategy::advanceStage() { 350 assert(CurrentStage != SchedStages.end()); 351 if (!CurrentStage) 352 CurrentStage = SchedStages.begin(); 353 else 354 CurrentStage++; 355 356 return CurrentStage != SchedStages.end(); 357 } 358 359 bool GCNSchedStrategy::hasNextStage() const { 360 assert(CurrentStage); 361 return std::next(CurrentStage) != SchedStages.end(); 362 } 363 364 GCNSchedStageID GCNSchedStrategy::getNextStage() const { 365 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end()); 366 return *std::next(CurrentStage); 367 } 368 369 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( 370 const MachineSchedContext *C) 371 : GCNSchedStrategy(C) { 372 SchedStages.push_back(GCNSchedStageID::OccInitialSchedule); 373 SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule); 374 SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule); 375 SchedStages.push_back(GCNSchedStageID::PreRARematerialize); 376 } 377 378 GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C) 379 : GCNSchedStrategy(C) { 380 SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule); 381 } 382 383 bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, 384 SchedCandidate &TryCand, 385 SchedBoundary *Zone) const { 386 // Initialize the candidate if needed. 387 if (!Cand.isValid()) { 388 TryCand.Reason = NodeOrder; 389 return true; 390 } 391 392 // Avoid spilling by exceeding the register limit. 393 if (DAG->isTrackingPressure() && 394 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, 395 RegExcess, TRI, DAG->MF)) 396 return TryCand.Reason != NoCand; 397 398 // Bias PhysReg Defs and copies to their uses and defined respectively. 399 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), 400 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) 401 return TryCand.Reason != NoCand; 402 403 bool SameBoundary = Zone != nullptr; 404 if (SameBoundary) { 405 // Prioritize instructions that read unbuffered resources by stall cycles. 406 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), 407 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) 408 return TryCand.Reason != NoCand; 409 410 // Avoid critical resource consumption and balance the schedule. 411 TryCand.initResourceDelta(DAG, SchedModel); 412 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, 413 TryCand, Cand, ResourceReduce)) 414 return TryCand.Reason != NoCand; 415 if (tryGreater(TryCand.ResDelta.DemandedResources, 416 Cand.ResDelta.DemandedResources, TryCand, Cand, 417 ResourceDemand)) 418 return TryCand.Reason != NoCand; 419 420 // Unconditionally try to reduce latency. 421 if (tryLatency(TryCand, Cand, *Zone)) 422 return TryCand.Reason != NoCand; 423 424 // Weak edges are for clustering and other constraints. 425 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), 426 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak)) 427 return TryCand.Reason != NoCand; 428 } 429 430 // Keep clustered nodes together to encourage downstream peephole 431 // optimizations which may reduce resource requirements. 432 // 433 // This is a best effort to set things up for a post-RA pass. Optimizations 434 // like generating loads of multiple registers should ideally be done within 435 // the scheduler pass by combining the loads during DAG postprocessing. 436 const SUnit *CandNextClusterSU = 437 Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); 438 const SUnit *TryCandNextClusterSU = 439 TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); 440 if (tryGreater(TryCand.SU == TryCandNextClusterSU, 441 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster)) 442 return TryCand.Reason != NoCand; 443 444 // Avoid increasing the max critical pressure in the scheduled region. 445 if (DAG->isTrackingPressure() && 446 tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, 447 TryCand, Cand, RegCritical, TRI, DAG->MF)) 448 return TryCand.Reason != NoCand; 449 450 // Avoid increasing the max pressure of the entire region. 451 if (DAG->isTrackingPressure() && 452 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand, 453 Cand, RegMax, TRI, DAG->MF)) 454 return TryCand.Reason != NoCand; 455 456 if (SameBoundary) { 457 // Fall through to original instruction order. 458 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || 459 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { 460 TryCand.Reason = NodeOrder; 461 return true; 462 } 463 } 464 return false; 465 } 466 467 GCNScheduleDAGMILive::GCNScheduleDAGMILive( 468 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) 469 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()), 470 MFI(*MF.getInfo<SIMachineFunctionInfo>()), 471 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) { 472 473 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); 474 } 475 476 std::unique_ptr<GCNSchedStage> 477 GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) { 478 switch (SchedStageID) { 479 case GCNSchedStageID::OccInitialSchedule: 480 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this); 481 case GCNSchedStageID::UnclusteredHighRPReschedule: 482 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this); 483 case GCNSchedStageID::ClusteredLowOccupancyReschedule: 484 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this); 485 case GCNSchedStageID::PreRARematerialize: 486 return std::make_unique<PreRARematStage>(SchedStageID, *this); 487 case GCNSchedStageID::ILPInitialSchedule: 488 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this); 489 } 490 491 llvm_unreachable("Unknown SchedStageID."); 492 } 493 494 void GCNScheduleDAGMILive::schedule() { 495 // Collect all scheduling regions. The actual scheduling is performed in 496 // GCNScheduleDAGMILive::finalizeSchedule. 497 Regions.push_back(std::pair(RegionBegin, RegionEnd)); 498 } 499 500 GCNRegPressure 501 GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { 502 GCNDownwardRPTracker RPTracker(*LIS); 503 RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); 504 return RPTracker.moveMaxPressure(); 505 } 506 507 void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, 508 const MachineBasicBlock *MBB) { 509 GCNDownwardRPTracker RPTracker(*LIS); 510 511 // If the block has the only successor then live-ins of that successor are 512 // live-outs of the current block. We can reuse calculated live set if the 513 // successor will be sent to scheduling past current block. 514 const MachineBasicBlock *OnlySucc = nullptr; 515 if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) { 516 SlotIndexes *Ind = LIS->getSlotIndexes(); 517 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin())) 518 OnlySucc = *MBB->succ_begin(); 519 } 520 521 // Scheduler sends regions from the end of the block upwards. 522 size_t CurRegion = RegionIdx; 523 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) 524 if (Regions[CurRegion].first->getParent() != MBB) 525 break; 526 --CurRegion; 527 528 auto I = MBB->begin(); 529 auto LiveInIt = MBBLiveIns.find(MBB); 530 auto &Rgn = Regions[CurRegion]; 531 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); 532 if (LiveInIt != MBBLiveIns.end()) { 533 auto LiveIn = std::move(LiveInIt->second); 534 RPTracker.reset(*MBB->begin(), &LiveIn); 535 MBBLiveIns.erase(LiveInIt); 536 } else { 537 I = Rgn.first; 538 auto LRS = BBLiveInMap.lookup(NonDbgMI); 539 #ifdef EXPENSIVE_CHECKS 540 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); 541 #endif 542 RPTracker.reset(*I, &LRS); 543 } 544 545 for (;;) { 546 I = RPTracker.getNext(); 547 548 if (Regions[CurRegion].first == I || NonDbgMI == I) { 549 LiveIns[CurRegion] = RPTracker.getLiveRegs(); 550 RPTracker.clearMaxPressure(); 551 } 552 553 if (Regions[CurRegion].second == I) { 554 Pressure[CurRegion] = RPTracker.moveMaxPressure(); 555 if (CurRegion-- == RegionIdx) 556 break; 557 } 558 RPTracker.advanceToNext(); 559 RPTracker.advanceBeforeNext(); 560 } 561 562 if (OnlySucc) { 563 if (I != MBB->end()) { 564 RPTracker.advanceToNext(); 565 RPTracker.advance(MBB->end()); 566 } 567 RPTracker.advanceBeforeNext(); 568 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); 569 } 570 } 571 572 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> 573 GCNScheduleDAGMILive::getBBLiveInMap() const { 574 assert(!Regions.empty()); 575 std::vector<MachineInstr *> BBStarters; 576 BBStarters.reserve(Regions.size()); 577 auto I = Regions.rbegin(), E = Regions.rend(); 578 auto *BB = I->first->getParent(); 579 do { 580 auto *MI = &*skipDebugInstructionsForward(I->first, I->second); 581 BBStarters.push_back(MI); 582 do { 583 ++I; 584 } while (I != E && I->first->getParent() == BB); 585 } while (I != E); 586 return getLiveRegMap(BBStarters, false /*After*/, *LIS); 587 } 588 589 void GCNScheduleDAGMILive::finalizeSchedule() { 590 // Start actual scheduling here. This function is called by the base 591 // MachineScheduler after all regions have been recorded by 592 // GCNScheduleDAGMILive::schedule(). 593 LiveIns.resize(Regions.size()); 594 Pressure.resize(Regions.size()); 595 RescheduleRegions.resize(Regions.size()); 596 RegionsWithHighRP.resize(Regions.size()); 597 RegionsWithExcessRP.resize(Regions.size()); 598 RegionsWithMinOcc.resize(Regions.size()); 599 RegionsWithIGLPInstrs.resize(Regions.size()); 600 RescheduleRegions.set(); 601 RegionsWithHighRP.reset(); 602 RegionsWithExcessRP.reset(); 603 RegionsWithMinOcc.reset(); 604 RegionsWithIGLPInstrs.reset(); 605 606 runSchedStages(); 607 } 608 609 void GCNScheduleDAGMILive::runSchedStages() { 610 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); 611 612 if (!Regions.empty()) 613 BBLiveInMap = getBBLiveInMap(); 614 615 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); 616 while (S.advanceStage()) { 617 auto Stage = createSchedStage(S.getCurrentStage()); 618 if (!Stage->initGCNSchedStage()) 619 continue; 620 621 for (auto Region : Regions) { 622 RegionBegin = Region.first; 623 RegionEnd = Region.second; 624 // Setup for scheduling the region and check whether it should be skipped. 625 if (!Stage->initGCNRegion()) { 626 Stage->advanceRegion(); 627 exitRegion(); 628 continue; 629 } 630 631 ScheduleDAGMILive::schedule(); 632 Stage->finalizeGCNRegion(); 633 } 634 635 Stage->finalizeGCNSchedStage(); 636 } 637 } 638 639 #ifndef NDEBUG 640 raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { 641 switch (StageID) { 642 case GCNSchedStageID::OccInitialSchedule: 643 OS << "Max Occupancy Initial Schedule"; 644 break; 645 case GCNSchedStageID::UnclusteredHighRPReschedule: 646 OS << "Unclustered High Register Pressure Reschedule"; 647 break; 648 case GCNSchedStageID::ClusteredLowOccupancyReschedule: 649 OS << "Clustered Low Occupancy Reschedule"; 650 break; 651 case GCNSchedStageID::PreRARematerialize: 652 OS << "Pre-RA Rematerialize"; 653 break; 654 case GCNSchedStageID::ILPInitialSchedule: 655 OS << "Max ILP Initial Schedule"; 656 break; 657 } 658 659 return OS; 660 } 661 #endif 662 663 GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) 664 : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF), 665 MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {} 666 667 bool GCNSchedStage::initGCNSchedStage() { 668 if (!DAG.LIS) 669 return false; 670 671 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n"); 672 return true; 673 } 674 675 bool UnclusteredHighRPStage::initGCNSchedStage() { 676 if (DisableUnclusterHighRP) 677 return false; 678 679 if (!GCNSchedStage::initGCNSchedStage()) 680 return false; 681 682 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none()) 683 return false; 684 685 SavedMutations.swap(DAG.Mutations); 686 DAG.addMutation(createIGroupLPDAGMutation()); 687 688 InitialOccupancy = DAG.MinOccupancy; 689 // Aggressivly try to reduce register pressure in the unclustered high RP 690 // stage. Temporarily increase occupancy target in the region. 691 S.SGPRLimitBias = S.HighRPSGPRBias; 692 S.VGPRLimitBias = S.HighRPVGPRBias; 693 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) 694 MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); 695 696 LLVM_DEBUG( 697 dbgs() 698 << "Retrying function scheduling without clustering. " 699 "Aggressivly try to reduce register pressure to achieve occupancy " 700 << DAG.MinOccupancy << ".\n"); 701 702 return true; 703 } 704 705 bool ClusteredLowOccStage::initGCNSchedStage() { 706 if (!GCNSchedStage::initGCNSchedStage()) 707 return false; 708 709 // Don't bother trying to improve ILP in lower RP regions if occupancy has not 710 // been dropped. All regions will have already been scheduled with the ideal 711 // occupancy targets. 712 if (DAG.StartingOccupancy <= DAG.MinOccupancy) 713 return false; 714 715 LLVM_DEBUG( 716 dbgs() << "Retrying function scheduling with lowest recorded occupancy " 717 << DAG.MinOccupancy << ".\n"); 718 return true; 719 } 720 721 bool PreRARematStage::initGCNSchedStage() { 722 if (!GCNSchedStage::initGCNSchedStage()) 723 return false; 724 725 if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1) 726 return false; 727 728 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 729 // Check maximum occupancy 730 if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) == 731 DAG.MinOccupancy) 732 return false; 733 734 // FIXME: This pass will invalidate cached MBBLiveIns for regions 735 // inbetween the defs and region we sinked the def to. Cached pressure 736 // for regions where a def is sinked from will also be invalidated. Will 737 // need to be fixed if there is another pass after this pass. 738 assert(!S.hasNextStage()); 739 740 collectRematerializableInstructions(); 741 if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII)) 742 return false; 743 744 LLVM_DEBUG( 745 dbgs() << "Retrying function scheduling with improved occupancy of " 746 << DAG.MinOccupancy << " from rematerializing\n"); 747 return true; 748 } 749 750 void GCNSchedStage::finalizeGCNSchedStage() { 751 DAG.finishBlock(); 752 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n"); 753 } 754 755 void UnclusteredHighRPStage::finalizeGCNSchedStage() { 756 SavedMutations.swap(DAG.Mutations); 757 S.SGPRLimitBias = S.VGPRLimitBias = 0; 758 if (DAG.MinOccupancy > InitialOccupancy) { 759 for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) 760 DAG.RegionsWithMinOcc[IDX] = 761 DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy; 762 763 LLVM_DEBUG(dbgs() << StageID 764 << " stage successfully increased occupancy to " 765 << DAG.MinOccupancy << '\n'); 766 } 767 768 GCNSchedStage::finalizeGCNSchedStage(); 769 } 770 771 bool GCNSchedStage::initGCNRegion() { 772 // Check whether this new region is also a new block. 773 if (DAG.RegionBegin->getParent() != CurrentMBB) 774 setupNewBlock(); 775 776 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end()); 777 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs); 778 779 // Skip empty scheduling regions (0 or 1 schedulable instructions). 780 if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end())) 781 return false; 782 783 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); 784 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB) 785 << " " << CurrentMBB->getName() 786 << "\n From: " << *DAG.begin() << " To: "; 787 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd; 788 else dbgs() << "End"; 789 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); 790 791 // Save original instruction order before scheduling for possible revert. 792 Unsched.clear(); 793 Unsched.reserve(DAG.NumRegionInstrs); 794 if (StageID == GCNSchedStageID::OccInitialSchedule || 795 StageID == GCNSchedStageID::ILPInitialSchedule) { 796 for (auto &I : DAG) { 797 Unsched.push_back(&I); 798 if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || 799 I.getOpcode() == AMDGPU::IGLP_OPT) 800 DAG.RegionsWithIGLPInstrs[RegionIdx] = true; 801 } 802 } else { 803 for (auto &I : DAG) 804 Unsched.push_back(&I); 805 } 806 807 PressureBefore = DAG.Pressure[RegionIdx]; 808 809 LLVM_DEBUG( 810 dbgs() << "Pressure before scheduling:\nRegion live-ins:" 811 << print(DAG.LiveIns[RegionIdx], DAG.MRI) 812 << "Region live-in pressure: " 813 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx])) 814 << "Region register pressure: " << print(PressureBefore)); 815 816 S.HasHighPressure = false; 817 S.KnownExcessRP = isRegionWithExcessRP(); 818 819 if (DAG.RegionsWithIGLPInstrs[RegionIdx] && 820 StageID != GCNSchedStageID::UnclusteredHighRPReschedule) { 821 SavedMutations.clear(); 822 SavedMutations.swap(DAG.Mutations); 823 DAG.addMutation(createIGroupLPDAGMutation()); 824 } 825 826 return true; 827 } 828 829 bool UnclusteredHighRPStage::initGCNRegion() { 830 // Only reschedule regions with the minimum occupancy or regions that may have 831 // spilling (excess register pressure). 832 if ((!DAG.RegionsWithMinOcc[RegionIdx] || 833 DAG.MinOccupancy <= InitialOccupancy) && 834 !DAG.RegionsWithExcessRP[RegionIdx]) 835 return false; 836 837 return GCNSchedStage::initGCNRegion(); 838 } 839 840 bool ClusteredLowOccStage::initGCNRegion() { 841 // We may need to reschedule this region if it wasn't rescheduled in the last 842 // stage, or if we found it was testing critical register pressure limits in 843 // the unclustered reschedule stage. The later is because we may not have been 844 // able to raise the min occupancy in the previous stage so the region may be 845 // overly constrained even if it was already rescheduled. 846 if (!DAG.RegionsWithHighRP[RegionIdx]) 847 return false; 848 849 return GCNSchedStage::initGCNRegion(); 850 } 851 852 bool PreRARematStage::initGCNRegion() { 853 if (!DAG.RescheduleRegions[RegionIdx]) 854 return false; 855 856 return GCNSchedStage::initGCNRegion(); 857 } 858 859 void GCNSchedStage::setupNewBlock() { 860 if (CurrentMBB) 861 DAG.finishBlock(); 862 863 CurrentMBB = DAG.RegionBegin->getParent(); 864 DAG.startBlock(CurrentMBB); 865 // Get real RP for the region if it hasn't be calculated before. After the 866 // initial schedule stage real RP will be collected after scheduling. 867 if (StageID == GCNSchedStageID::OccInitialSchedule) 868 DAG.computeBlockPressure(RegionIdx, CurrentMBB); 869 } 870 871 void GCNSchedStage::finalizeGCNRegion() { 872 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); 873 DAG.RescheduleRegions[RegionIdx] = false; 874 if (S.HasHighPressure) 875 DAG.RegionsWithHighRP[RegionIdx] = true; 876 877 // Revert scheduling if we have dropped occupancy or there is some other 878 // reason that the original schedule is better. 879 checkScheduling(); 880 881 if (DAG.RegionsWithIGLPInstrs[RegionIdx] && 882 StageID != GCNSchedStageID::UnclusteredHighRPReschedule) 883 SavedMutations.swap(DAG.Mutations); 884 885 DAG.exitRegion(); 886 RegionIdx++; 887 } 888 889 void GCNSchedStage::checkScheduling() { 890 // Check the results of scheduling. 891 PressureAfter = DAG.getRealRegPressure(RegionIdx); 892 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); 893 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); 894 895 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && 896 PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { 897 DAG.Pressure[RegionIdx] = PressureAfter; 898 DAG.RegionsWithMinOcc[RegionIdx] = 899 PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; 900 901 // Early out if we have achieve the occupancy target. 902 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); 903 return; 904 } 905 906 unsigned TargetOccupancy = 907 std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF)); 908 unsigned WavesAfter = 909 std::min(TargetOccupancy, PressureAfter.getOccupancy(ST)); 910 unsigned WavesBefore = 911 std::min(TargetOccupancy, PressureBefore.getOccupancy(ST)); 912 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore 913 << ", after " << WavesAfter << ".\n"); 914 915 // We may not be able to keep the current target occupancy because of the just 916 // scheduled region. We might still be able to revert scheduling if the 917 // occupancy before was higher, or if the current schedule has register 918 // pressure higher than the excess limits which could lead to more spilling. 919 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); 920 921 // Allow memory bound functions to drop to 4 waves if not limited by an 922 // attribute. 923 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy && 924 WavesAfter >= MFI.getMinAllowedOccupancy()) { 925 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " 926 << MFI.getMinAllowedOccupancy() << " waves\n"); 927 NewOccupancy = WavesAfter; 928 } 929 930 if (NewOccupancy < DAG.MinOccupancy) { 931 DAG.MinOccupancy = NewOccupancy; 932 MFI.limitOccupancy(DAG.MinOccupancy); 933 DAG.RegionsWithMinOcc.reset(); 934 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " 935 << DAG.MinOccupancy << ".\n"); 936 } 937 938 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); 939 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); 940 if (PressureAfter.getVGPRNum(false) > MaxVGPRs || 941 PressureAfter.getAGPRNum() > MaxVGPRs || 942 PressureAfter.getSGPRNum() > MaxSGPRs) { 943 DAG.RescheduleRegions[RegionIdx] = true; 944 DAG.RegionsWithHighRP[RegionIdx] = true; 945 DAG.RegionsWithExcessRP[RegionIdx] = true; 946 } 947 948 // Revert if this region's schedule would cause a drop in occupancy or 949 // spilling. 950 if (shouldRevertScheduling(WavesAfter)) { 951 revertScheduling(); 952 } else { 953 DAG.Pressure[RegionIdx] = PressureAfter; 954 DAG.RegionsWithMinOcc[RegionIdx] = 955 PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; 956 } 957 } 958 959 unsigned 960 GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, 961 DenseMap<unsigned, unsigned> &ReadyCycles, 962 const TargetSchedModel &SM) { 963 unsigned ReadyCycle = CurrCycle; 964 for (auto &D : SU.Preds) { 965 if (D.isAssignedRegDep()) { 966 MachineInstr *DefMI = D.getSUnit()->getInstr(); 967 unsigned Latency = SM.computeInstrLatency(DefMI); 968 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum]; 969 ReadyCycle = std::max(ReadyCycle, DefReady + Latency); 970 } 971 } 972 ReadyCycles[SU.NodeNum] = ReadyCycle; 973 return ReadyCycle; 974 } 975 976 #ifndef NDEBUG 977 struct EarlierIssuingCycle { 978 bool operator()(std::pair<MachineInstr *, unsigned> A, 979 std::pair<MachineInstr *, unsigned> B) const { 980 return A.second < B.second; 981 } 982 }; 983 984 static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>, 985 EarlierIssuingCycle> &ReadyCycles) { 986 if (ReadyCycles.empty()) 987 return; 988 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber(); 989 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum 990 << " ##################\n# Cycle #\t\t\tInstruction " 991 " " 992 " \n"; 993 unsigned IPrev = 1; 994 for (auto &I : ReadyCycles) { 995 if (I.second > IPrev + 1) 996 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev 997 << " CYCLES DETECTED ******************************\n\n"; 998 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n"; 999 IPrev = I.second; 1000 } 1001 } 1002 #endif 1003 1004 ScheduleMetrics 1005 GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) { 1006 #ifndef NDEBUG 1007 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> 1008 ReadyCyclesSorted; 1009 #endif 1010 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); 1011 unsigned SumBubbles = 0; 1012 DenseMap<unsigned, unsigned> ReadyCycles; 1013 unsigned CurrCycle = 0; 1014 for (auto &SU : InputSchedule) { 1015 unsigned ReadyCycle = 1016 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM); 1017 SumBubbles += ReadyCycle - CurrCycle; 1018 #ifndef NDEBUG 1019 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle)); 1020 #endif 1021 CurrCycle = ++ReadyCycle; 1022 } 1023 #ifndef NDEBUG 1024 LLVM_DEBUG( 1025 printScheduleModel(ReadyCyclesSorted); 1026 dbgs() << "\n\t" 1027 << "Metric: " 1028 << (SumBubbles 1029 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle 1030 : 1) 1031 << "\n\n"); 1032 #endif 1033 1034 return ScheduleMetrics(CurrCycle, SumBubbles); 1035 } 1036 1037 ScheduleMetrics 1038 GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) { 1039 #ifndef NDEBUG 1040 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> 1041 ReadyCyclesSorted; 1042 #endif 1043 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); 1044 unsigned SumBubbles = 0; 1045 DenseMap<unsigned, unsigned> ReadyCycles; 1046 unsigned CurrCycle = 0; 1047 for (auto &MI : DAG) { 1048 SUnit *SU = DAG.getSUnit(&MI); 1049 if (!SU) 1050 continue; 1051 unsigned ReadyCycle = 1052 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM); 1053 SumBubbles += ReadyCycle - CurrCycle; 1054 #ifndef NDEBUG 1055 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle)); 1056 #endif 1057 CurrCycle = ++ReadyCycle; 1058 } 1059 #ifndef NDEBUG 1060 LLVM_DEBUG( 1061 printScheduleModel(ReadyCyclesSorted); 1062 dbgs() << "\n\t" 1063 << "Metric: " 1064 << (SumBubbles 1065 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle 1066 : 1) 1067 << "\n\n"); 1068 #endif 1069 1070 return ScheduleMetrics(CurrCycle, SumBubbles); 1071 } 1072 1073 bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { 1074 if (WavesAfter < DAG.MinOccupancy) 1075 return true; 1076 1077 return false; 1078 } 1079 1080 bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { 1081 if (PressureAfter == PressureBefore) 1082 return false; 1083 1084 if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) 1085 return true; 1086 1087 if (mayCauseSpilling(WavesAfter)) 1088 return true; 1089 1090 return false; 1091 } 1092 1093 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { 1094 // If RP is not reduced in the unclustred reschedule stage, revert to the 1095 // old schedule. 1096 if ((WavesAfter <= PressureBefore.getOccupancy(ST) && 1097 mayCauseSpilling(WavesAfter)) || 1098 GCNSchedStage::shouldRevertScheduling(WavesAfter)) { 1099 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); 1100 return true; 1101 } 1102 1103 LLVM_DEBUG( 1104 dbgs() 1105 << "\n\t *** In shouldRevertScheduling ***\n" 1106 << " *********** BEFORE UnclusteredHighRPStage ***********\n"); 1107 ScheduleMetrics MBefore = 1108 getScheduleMetrics(DAG.SUnits); 1109 LLVM_DEBUG( 1110 dbgs() 1111 << "\n *********** AFTER UnclusteredHighRPStage ***********\n"); 1112 ScheduleMetrics MAfter = getScheduleMetrics(DAG); 1113 unsigned OldMetric = MBefore.getMetric(); 1114 unsigned NewMetric = MAfter.getMetric(); 1115 unsigned WavesBefore = 1116 std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST)); 1117 unsigned Profit = 1118 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * 1119 ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / 1120 NewMetric) / 1121 ScheduleMetrics::ScaleFactor; 1122 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after " 1123 << MAfter << "Profit: " << Profit << "\n"); 1124 return Profit < ScheduleMetrics::ScaleFactor; 1125 } 1126 1127 bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { 1128 if (PressureAfter == PressureBefore) 1129 return false; 1130 1131 if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) 1132 return true; 1133 1134 if (mayCauseSpilling(WavesAfter)) 1135 return true; 1136 1137 return false; 1138 } 1139 1140 bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { 1141 if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) 1142 return true; 1143 1144 if (mayCauseSpilling(WavesAfter)) 1145 return true; 1146 1147 return false; 1148 } 1149 1150 bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { 1151 if (mayCauseSpilling(WavesAfter)) 1152 return true; 1153 1154 return false; 1155 } 1156 1157 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { 1158 if (WavesAfter <= MFI.getMinWavesPerEU() && 1159 !PressureAfter.less(ST, PressureBefore) && 1160 isRegionWithExcessRP()) { 1161 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); 1162 return true; 1163 } 1164 1165 return false; 1166 } 1167 1168 void GCNSchedStage::revertScheduling() { 1169 DAG.RegionsWithMinOcc[RegionIdx] = 1170 PressureBefore.getOccupancy(ST) == DAG.MinOccupancy; 1171 LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); 1172 DAG.RescheduleRegions[RegionIdx] = 1173 S.hasNextStage() && 1174 S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule; 1175 DAG.RegionEnd = DAG.RegionBegin; 1176 int SkippedDebugInstr = 0; 1177 for (MachineInstr *MI : Unsched) { 1178 if (MI->isDebugInstr()) { 1179 ++SkippedDebugInstr; 1180 continue; 1181 } 1182 1183 if (MI->getIterator() != DAG.RegionEnd) { 1184 DAG.BB->remove(MI); 1185 DAG.BB->insert(DAG.RegionEnd, MI); 1186 if (!MI->isDebugInstr()) 1187 DAG.LIS->handleMove(*MI, true); 1188 } 1189 1190 // Reset read-undef flags and update them later. 1191 for (auto &Op : MI->operands()) 1192 if (Op.isReg() && Op.isDef()) 1193 Op.setIsUndef(false); 1194 RegisterOperands RegOpers; 1195 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false); 1196 if (!MI->isDebugInstr()) { 1197 if (DAG.ShouldTrackLaneMasks) { 1198 // Adjust liveness and add missing dead+read-undef flags. 1199 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot(); 1200 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI); 1201 } else { 1202 // Adjust for missing dead-def flags. 1203 RegOpers.detectDeadDefs(*MI, *DAG.LIS); 1204 } 1205 } 1206 DAG.RegionEnd = MI->getIterator(); 1207 ++DAG.RegionEnd; 1208 LLVM_DEBUG(dbgs() << "Scheduling " << *MI); 1209 } 1210 1211 // After reverting schedule, debug instrs will now be at the end of the block 1212 // and RegionEnd will point to the first debug instr. Increment RegionEnd 1213 // pass debug instrs to the actual end of the scheduling region. 1214 while (SkippedDebugInstr-- > 0) 1215 ++DAG.RegionEnd; 1216 1217 // If Unsched.front() instruction is a debug instruction, this will actually 1218 // shrink the region since we moved all debug instructions to the end of the 1219 // block. Find the first instruction that is not a debug instruction. 1220 DAG.RegionBegin = Unsched.front()->getIterator(); 1221 if (DAG.RegionBegin->isDebugInstr()) { 1222 for (MachineInstr *MI : Unsched) { 1223 if (MI->isDebugInstr()) 1224 continue; 1225 DAG.RegionBegin = MI->getIterator(); 1226 break; 1227 } 1228 } 1229 1230 // Then move the debug instructions back into their correct place and set 1231 // RegionBegin and RegionEnd if needed. 1232 DAG.placeDebugValues(); 1233 1234 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); 1235 } 1236 1237 void PreRARematStage::collectRematerializableInstructions() { 1238 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI); 1239 for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) { 1240 Register Reg = Register::index2VirtReg(I); 1241 if (!DAG.LIS->hasInterval(Reg)) 1242 continue; 1243 1244 // TODO: Handle AGPR and SGPR rematerialization 1245 if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) || 1246 !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg)) 1247 continue; 1248 1249 MachineOperand *Op = DAG.MRI.getOneDef(Reg); 1250 MachineInstr *Def = Op->getParent(); 1251 if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def)) 1252 continue; 1253 1254 MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg); 1255 if (Def->getParent() == UseI->getParent()) 1256 continue; 1257 1258 // We are only collecting defs that are defined in another block and are 1259 // live-through or used inside regions at MinOccupancy. This means that the 1260 // register must be in the live-in set for the region. 1261 bool AddedToRematList = false; 1262 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { 1263 auto It = DAG.LiveIns[I].find(Reg); 1264 if (It != DAG.LiveIns[I].end() && !It->second.none()) { 1265 if (DAG.RegionsWithMinOcc[I]) { 1266 RematerializableInsts[I][Def] = UseI; 1267 AddedToRematList = true; 1268 } 1269 1270 // Collect regions with rematerializable reg as live-in to avoid 1271 // searching later when updating RP. 1272 RematDefToLiveInRegions[Def].push_back(I); 1273 } 1274 } 1275 if (!AddedToRematList) 1276 RematDefToLiveInRegions.erase(Def); 1277 } 1278 } 1279 1280 bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST, 1281 const TargetInstrInfo *TII) { 1282 // Temporary copies of cached variables we will be modifying and replacing if 1283 // sinking succeeds. 1284 SmallVector< 1285 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32> 1286 NewRegions; 1287 DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns; 1288 DenseMap<unsigned, GCNRegPressure> NewPressure; 1289 BitVector NewRescheduleRegions; 1290 LiveIntervals *LIS = DAG.LIS; 1291 1292 NewRegions.resize(DAG.Regions.size()); 1293 NewRescheduleRegions.resize(DAG.Regions.size()); 1294 1295 // Collect only regions that has a rematerializable def as a live-in. 1296 SmallSet<unsigned, 16> ImpactedRegions; 1297 for (const auto &It : RematDefToLiveInRegions) 1298 ImpactedRegions.insert(It.second.begin(), It.second.end()); 1299 1300 // Make copies of register pressure and live-ins cache that will be updated 1301 // as we rematerialize. 1302 for (auto Idx : ImpactedRegions) { 1303 NewPressure[Idx] = DAG.Pressure[Idx]; 1304 NewLiveIns[Idx] = DAG.LiveIns[Idx]; 1305 } 1306 NewRegions = DAG.Regions; 1307 NewRescheduleRegions.reset(); 1308 1309 DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef; 1310 bool Improved = false; 1311 for (auto I : ImpactedRegions) { 1312 if (!DAG.RegionsWithMinOcc[I]) 1313 continue; 1314 1315 Improved = false; 1316 int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts()); 1317 int SGPRUsage = NewPressure[I].getSGPRNum(); 1318 1319 // TODO: Handle occupancy drop due to AGPR and SGPR. 1320 // Check if cause of occupancy drop is due to VGPR usage and not SGPR. 1321 if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy) 1322 break; 1323 1324 // The occupancy of this region could have been improved by a previous 1325 // iteration's sinking of defs. 1326 if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) { 1327 NewRescheduleRegions[I] = true; 1328 Improved = true; 1329 continue; 1330 } 1331 1332 // First check if we have enough trivially rematerializable instructions to 1333 // improve occupancy. Optimistically assume all instructions we are able to 1334 // sink decreased RP. 1335 int TotalSinkableRegs = 0; 1336 for (const auto &It : RematerializableInsts[I]) { 1337 MachineInstr *Def = It.first; 1338 Register DefReg = Def->getOperand(0).getReg(); 1339 TotalSinkableRegs += 1340 SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]); 1341 } 1342 int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs; 1343 unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink); 1344 // If in the most optimistic scenario, we cannot improve occupancy, then do 1345 // not attempt to sink any instructions. 1346 if (OptimisticOccupancy <= DAG.MinOccupancy) 1347 break; 1348 1349 unsigned ImproveOccupancy = 0; 1350 SmallVector<MachineInstr *, 4> SinkedDefs; 1351 for (auto &It : RematerializableInsts[I]) { 1352 MachineInstr *Def = It.first; 1353 MachineBasicBlock::iterator InsertPos = 1354 MachineBasicBlock::iterator(It.second); 1355 Register Reg = Def->getOperand(0).getReg(); 1356 // Rematerialize MI to its use block. Since we are only rematerializing 1357 // instructions that do not have any virtual reg uses, we do not need to 1358 // call LiveRangeEdit::allUsesAvailableAt() and 1359 // LiveRangeEdit::canRematerializeAt(). 1360 TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, 1361 Def->getOperand(0).getSubReg(), *Def, *DAG.TRI); 1362 MachineInstr *NewMI = &*std::prev(InsertPos); 1363 LIS->InsertMachineInstrInMaps(*NewMI); 1364 LIS->removeInterval(Reg); 1365 LIS->createAndComputeVirtRegInterval(Reg); 1366 InsertedMIToOldDef[NewMI] = Def; 1367 1368 // Update region boundaries in scheduling region we sinked from since we 1369 // may sink an instruction that was at the beginning or end of its region 1370 DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr, 1371 /*Removing =*/true); 1372 1373 // Update region boundaries in region we sinked to. 1374 DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI); 1375 1376 LaneBitmask PrevMask = NewLiveIns[I][Reg]; 1377 // FIXME: Also update cached pressure for where the def was sinked from. 1378 // Update RP for all regions that has this reg as a live-in and remove 1379 // the reg from all regions as a live-in. 1380 for (auto Idx : RematDefToLiveInRegions[Def]) { 1381 NewLiveIns[Idx].erase(Reg); 1382 if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) { 1383 // Def is live-through and not used in this block. 1384 NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI); 1385 } else { 1386 // Def is used and rematerialized into this block. 1387 GCNDownwardRPTracker RPT(*LIS); 1388 auto *NonDbgMI = &*skipDebugInstructionsForward( 1389 NewRegions[Idx].first, NewRegions[Idx].second); 1390 RPT.reset(*NonDbgMI, &NewLiveIns[Idx]); 1391 RPT.advance(NewRegions[Idx].second); 1392 NewPressure[Idx] = RPT.moveMaxPressure(); 1393 } 1394 } 1395 1396 SinkedDefs.push_back(Def); 1397 ImproveOccupancy = NewPressure[I].getOccupancy(ST); 1398 if (ImproveOccupancy > DAG.MinOccupancy) 1399 break; 1400 } 1401 1402 // Remove defs we just sinked from all regions' list of sinkable defs 1403 for (auto &Def : SinkedDefs) 1404 for (auto TrackedIdx : RematDefToLiveInRegions[Def]) 1405 RematerializableInsts[TrackedIdx].erase(Def); 1406 1407 if (ImproveOccupancy <= DAG.MinOccupancy) 1408 break; 1409 1410 NewRescheduleRegions[I] = true; 1411 Improved = true; 1412 } 1413 1414 if (!Improved) { 1415 // Occupancy was not improved for all regions that were at MinOccupancy. 1416 // Undo sinking and remove newly rematerialized instructions. 1417 for (auto &Entry : InsertedMIToOldDef) { 1418 MachineInstr *MI = Entry.first; 1419 MachineInstr *OldMI = Entry.second; 1420 Register Reg = MI->getOperand(0).getReg(); 1421 LIS->RemoveMachineInstrFromMaps(*MI); 1422 MI->eraseFromParent(); 1423 OldMI->clearRegisterDeads(Reg); 1424 LIS->removeInterval(Reg); 1425 LIS->createAndComputeVirtRegInterval(Reg); 1426 } 1427 return false; 1428 } 1429 1430 // Occupancy was improved for all regions. 1431 for (auto &Entry : InsertedMIToOldDef) { 1432 MachineInstr *MI = Entry.first; 1433 MachineInstr *OldMI = Entry.second; 1434 1435 // Remove OldMI from BBLiveInMap since we are sinking it from its MBB. 1436 DAG.BBLiveInMap.erase(OldMI); 1437 1438 // Remove OldMI and update LIS 1439 Register Reg = MI->getOperand(0).getReg(); 1440 LIS->RemoveMachineInstrFromMaps(*OldMI); 1441 OldMI->eraseFromParent(); 1442 LIS->removeInterval(Reg); 1443 LIS->createAndComputeVirtRegInterval(Reg); 1444 } 1445 1446 // Update live-ins, register pressure, and regions caches. 1447 for (auto Idx : ImpactedRegions) { 1448 DAG.LiveIns[Idx] = NewLiveIns[Idx]; 1449 DAG.Pressure[Idx] = NewPressure[Idx]; 1450 DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent()); 1451 } 1452 DAG.Regions = NewRegions; 1453 DAG.RescheduleRegions = NewRescheduleRegions; 1454 1455 SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 1456 MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); 1457 1458 return true; 1459 } 1460 1461 // Copied from MachineLICM 1462 bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) { 1463 if (!DAG.TII->isTriviallyReMaterializable(MI)) 1464 return false; 1465 1466 for (const MachineOperand &MO : MI.operands()) 1467 if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual()) 1468 return false; 1469 1470 return true; 1471 } 1472 1473 // When removing, we will have to check both beginning and ending of the region. 1474 // When inserting, we will only have to check if we are inserting NewMI in front 1475 // of a scheduling region and do not need to check the ending since we will only 1476 // ever be inserting before an already existing MI. 1477 void GCNScheduleDAGMILive::updateRegionBoundaries( 1478 SmallVectorImpl<std::pair<MachineBasicBlock::iterator, 1479 MachineBasicBlock::iterator>> &RegionBoundaries, 1480 MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) { 1481 unsigned I = 0, E = RegionBoundaries.size(); 1482 // Search for first region of the block where MI is located 1483 while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent()) 1484 ++I; 1485 1486 for (; I != E; ++I) { 1487 if (MI->getParent() != RegionBoundaries[I].first->getParent()) 1488 return; 1489 1490 if (Removing && MI == RegionBoundaries[I].first && 1491 MI == RegionBoundaries[I].second) { 1492 // MI is in a region with size 1, after removing, the region will be 1493 // size 0, set RegionBegin and RegionEnd to pass end of block iterator. 1494 RegionBoundaries[I] = 1495 std::pair(MI->getParent()->end(), MI->getParent()->end()); 1496 return; 1497 } 1498 if (MI == RegionBoundaries[I].first) { 1499 if (Removing) 1500 RegionBoundaries[I] = 1501 std::pair(std::next(MI), RegionBoundaries[I].second); 1502 else 1503 // Inserted NewMI in front of region, set new RegionBegin to NewMI 1504 RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI), 1505 RegionBoundaries[I].second); 1506 return; 1507 } 1508 if (Removing && MI == RegionBoundaries[I].second) { 1509 RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI)); 1510 return; 1511 } 1512 } 1513 } 1514 1515 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { 1516 return std::any_of( 1517 DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) { 1518 unsigned Opc = MI->getOpcode(); 1519 return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT; 1520 }); 1521 } 1522 1523 GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( 1524 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, 1525 bool RemoveKillFlags) 1526 : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} 1527 1528 void GCNPostScheduleDAGMILive::schedule() { 1529 HasIGLPInstrs = hasIGLPInstrs(this); 1530 if (HasIGLPInstrs) { 1531 SavedMutations.clear(); 1532 SavedMutations.swap(Mutations); 1533 addMutation(createIGroupLPDAGMutation()); 1534 } 1535 1536 ScheduleDAGMI::schedule(); 1537 } 1538 1539 void GCNPostScheduleDAGMILive::finalizeSchedule() { 1540 if (HasIGLPInstrs) 1541 SavedMutations.swap(Mutations); 1542 1543 ScheduleDAGMI::finalizeSchedule(); 1544 } 1545