//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover, some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler behave
// unpredictably. For example, when register pressure becomes high, it can
// either manage to prevent register pressure from going too high, or it can
// increase register pressure even more than if it hadn't taken register
// pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in VGPR you won't need until the end of the shader.
//
// The scheduling problem for SI can distinguish three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts if of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps control keeping register
// usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader.
// Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. That will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second, the order of the instructions inside the blocks is chosen.
// At that step we take into account only register usage and hiding
// low latency instructions.
//
// Third, the block order is chosen. There we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency to get the data from the cache (or global mem) to the
// register, and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, thus why even if wavefront count is high, we have
// to try to have as many instructions hiding high latencies as possible.
// The OpenCL doc gives for example a latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
// instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
// cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when scheduling,
// as we expect the driver to be able to preload the constants soon.
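
// To make the latency arithmetic above concrete (an illustrative reading of
// the figures quoted above, not a spec): a wavefront issues one instruction
// every 4 cycles, so with 10 wavefronts resident on a SIMD, 10 independent
// instructions per wavefront cover roughly
// 10 wavefronts * 10 instructions * 4 cycles = 400 cycles, i.e. the quoted
// global memory latency. With only 2 resident wavefronts, about 50
// independent instructions per wavefront would be needed to hide the same
// latency, which is why low register usage (more wavefronts) and scheduling
// independent work after high latency instructions are complementary goals.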

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand: return "NOCAND";
  case RegUsage: return "REGUSAGE";
  case Latency: return "LATENCY";
  case Successor: return "SUCCESSOR";
  case Depth: return "DEPTH";
  case NodeOrder: return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

namespace llvm {
namespace SISched {
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
} // end namespace SISched
} // end namespace llvm

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as early as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  // - (eventually some more low latency instructions)
  // - instructions that depend on the first low latency instructions.
  // If the block contains a lot of constant loads, the SGPR usage
  // could go quite high; thus above the arbitrary limit of 60 we encourage
  // using the already loaded constants (in order to release some SGPRs)
  // before loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}
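
// Note on the comparison helpers: tryLess/tryGreater decide as soon as the
// two candidates differ on the current criterion; only ties fall through to
// the next test. For example (illustrative values): if TryCand and Cand tie
// on both HasLowLatencyNonWaitedParent and IsLowLatency, but
// TryCand.LowLatencyOffset < Cand.LowLatencyOffset, TryCand wins with reason
// Depth and the later VGPRUsage test is never reached.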

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
    TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}


// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns true if the register was defined between First and Last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the SUs
  // to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (Register::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7
  // Since we made the MIs of a block to be packed all together before
  // scheduling, then the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for the RPTracker not to make mistakes for
  // case 4)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
  // The use of isDefBetween removes the case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    Register Reg = RegMaskPair.RegUnit;
    if (Reg.isVirtual() &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128-bit registers
  // for example, but they actually correspond to 4 actual 32-bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}
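
// Illustration of the case analysis above (hypothetical registers): a reg
// %in read by this block and also by another block is case 4: it belongs in
// LiveInRegs but not in LiveOutRegs, yet the RPTracker may still report it
// live-out; since %in has no def between BeginBlock and EndBlock,
// isDefBetween filters it out of LiveOutRegs. A reg that is input of an
// instruction, rewritten inside the block, and read again by a later block
// is case 7: it is kept in both sets, because its def does lie inside the
// block's range.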

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block (if InOrOutBlock
/// is true) or outside the block (if it is false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs.
  assert(!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}
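
// Illustration of the HasLowLatencyNonWaitedParent bookkeeping (hypothetical
// example): after a low latency load L is scheduled, every successor of L
// inside the block gets flagged, so pickNode deprioritizes them and tries to
// fill the load's latency with independent work first. As soon as one
// flagged instruction is scheduled anyway, the wait it triggers covers all
// outstanding low latency results, hence the whole vector is cleared rather
// than just the entries for L's successors.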

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.push_back(std::make_pair(Succ, Kind));

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure "
           << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
    dbgs() << "LiveOutPressure "
           << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : DAG(DAG) {}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
      Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  }
  return B->second;
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because there the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        // Check the type of dependency.
        for (unsigned k : SubGraph) {
          // If in the path to join the two instructions,
          // there is another high latency instruction,
          // or instructions colored for another block,
          // abort the merge.
          if (DAG->IsHighLatencySU[k] ||
              (CurrentColoring[k] != ProposedColor &&
               CurrentColoring[k] != 0)) {
            CompatibleGroup = false;
            break;
          }
          // If one of the SUs in the subgraph depends on the result of SU j,
          // there'll be a data dependency.
          if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
        }
        if (!CompatibleGroup)
          break;
        // Same check for the SU.
        if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
          CompatibleGroup = false;
          break;
        }
        // Add all the required instructions to the block.
        // These cannot live in another block (because they
        // depend (order dependency) on one of the
        // instructions in the block, and are required for the
        // high latency instruction we add).
        llvm::append_range(AdditionalElements, SubGraph);
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}
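
// Worked example for the sizing above (illustrative): with 8 high latency
// instructions in the region, GroupSize is 3, so groups of up to 3 texture
// fetches share one reserved color and thus one block. Grouping lets one
// wait cover several results at once, while the SubGraph checks guarantee
// that no fetch in a group consumes (even transitively) the result of
// another fetch of the same group, which would serialize the latencies
// inside the block.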

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}
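
// Worked example (hypothetical): with two high latency fetches A and B
// holding reserved colors, an instruction depending (transitively) only on A
// gets one color, one depending only on B another, and one depending on both
// a third, because the key into ColorCombinations is the *set* {A}, {B} or
// {A, B}. Combined with the bottom-up pass, instructions sharing both the
// same high latency ancestors and the same high latency descendants end up
// with the same color, and therefore in the same block.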

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down and bottom up
  // reserved node dependency colorings gets its own color.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU->NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU->NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}
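
// Note on the colorMerge* variants above and below: they share the same
// bottom-up walk and only differ in the merge condition.
// colorMergeConstantLoadsNextGroup restricts the merge to predecessor-less
// instructions (constant loads) and low latency instructions,
// colorMergeIfPossibleNextGroup merges any instruction whose successors all
// share one color, and colorMergeIfPossibleNextGroupOnlyForReserved merges
// only into reserved (high latency) groups, i.e. when the common successor
// color is <= DAGSize.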

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register as was used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // Check the EXP can be added to the group safely,
      // i.e. without needing any other instruction.
      // The EXP is allowed to depend on other EXPs
      // (they will be in the same group).
      for (unsigned j : ExpGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other

        // SubGraph contains all the instructions required
        // between EXP SUnits[j] and EXP SU.
        for (unsigned k : SubGraph) {
          if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
            // Instructions other than EXP would be required in the group.
            // Abort the grouping.
            return;
        }
      }

      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links that a previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of the same color into the same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free roots and leaves of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
             for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
               SIScheduleBlock *Block = CurrentBlocks[i];
               Block->printDebug(true);
             });
}

// Two functions taken from Codegen/MachineScheduler.cpp

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}
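
// Note on the sort above: it is Kahn's algorithm run from the leaves (blocks
// with no successors), with Id counting down from DAGSize so that
// TopDownIndex2Block still ends up ordered from roots to leaves.
// TopDownBlock2Index temporarily holds the remaining out-degree of each
// block during the walk, then holds the block's final position.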

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We first compute a valid schedule such that a Block corresponds
  // to a contiguous range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MIs
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->printDebug(true);
  });
}
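
// Design note on the phases above: the splice/handleMove pass only exists so
// that each block's MIs are contiguous with correct LiveIntervals while
// SIScheduleBlock::schedule runs (initRegPressure depends on that property,
// see the case analysis there). The original order is then restored; the
// real final ordering is produced later, once the block order has been
// chosen.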

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal.
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:";
             for (SIScheduleBlock *Block : BlocksScheduled)
               dbgs() << ' ' << Block->getID();
             dbgs() << '\n';);
}

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
    return true;
  return false;
}
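
// Note on the two policies above: tryCandidateLatency prefers the candidate
// whose last high latency parent was scheduled longest ago (smallest
// LastPosHighLatParentScheduled), i.e. whose inputs have had the most time
// to arrive, then favors high latency blocks themselves (those with the
// greatest height first) and blocks leading to more high latency
// successors. tryCandidateRegUsage first rules out candidates that would
// increase VGPR usage, then prefers blocks that still have successors and
// greater height, and finally the most negative VGPRUsageDiff, i.e. the
// block freeing the most registers.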

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
             for (SIScheduleBlock *Block : ReadyBlocks)
               dbgs() << Block->getID() << ' ';
             dbgs() << "\nCurrent Live:\n";
             for (unsigned Reg : LiveRegs)
               dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
             dbgs() << '\n';
             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                        << getReasonStr(Cand.Reason) << '\n');
    }
  }

  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
             dbgs() << "Is a block with high latency instruction: "
                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
             dbgs() << "Position of last high latency dependency: "
                    << Cand.LastPosHighLatParentScheduled << '\n';
             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
             dbgs() << '\n';);

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}
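
// Note on the variant dispatch above: with the BlockLatencyRegUsage variant
// under moderate pressure, latency criteria are tried first and register
// usage only breaks ties. Once VregCurrentUsage exceeds the arbitrary
// threshold of 120, or for the other variants, register usage is tried
// first; latency is then used as a tie-break, except for the pure
// BlockRegUsage variant, which never considers latency.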

// Tracking of currently alive registers to determine VGPR usage.

void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
  for (Register Reg : Regs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    assert(Pos != LiveRegs.end() && // Reg must be live.
           LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
           LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}

void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (const auto &Block : Parent->getSuccs()) {
    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
      ReadyBlocks.push_back(Block.first);

    if (Parent->isHighLatencyBlock() &&
        Block.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
  }
}

void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (std::map<unsigned, unsigned>::iterator RegI =
       LiveOutRegsNumUsages[Block->getID()].begin(),
       E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
    std::pair<unsigned, unsigned> RegP = *RegI;
    // We produce this register, thus it must not be previously alive.
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
        (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}
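
// Illustration of the consumer counting (hypothetical): if block B produces
// %v and LiveOutRegsNumUsages[B][%v] == 2 (two reader blocks were counted in
// the constructor), then %v enters LiveRegs when B is scheduled, survives
// the first reader block (decreaseLiveRegs drops the count to 1), and is
// only removed from the live set once the second reader block is scheduled.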
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
      (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (Register Reg : InRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (Register Reg : OutRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}

// SIScheduler //

struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  struct SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
    SIScheduleBlock *Block = ScheduledBlocks[b];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);
}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from scheduleDAG.cpp.
// Does a topological sort over the SUs.
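// Both the TopDown and BottomUp orders are produced: the topological index
// read forwards gives the top-down order, and the same index read backwards
// gives the bottom-up order.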
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}

// Move low latency instructions further from their users without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would take SGPR usage
// into account (based on VGPR usage and the corresponding wavefront
// count), and that would try to merge groups of loads when it makes
// sense, etc.
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also move the COPY instructions that the low latency
    // instructions depend on.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
          continue;
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}

void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the VGPR and SGPR usage corresponding to some virtual registers.
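// Each register contributes the weight of its pressure sets, so wider
// registers count for more: a register covering two 32-bit VGPR units would
// presumably add 2 to VgprUsage where a single VGPR_32 adds 1 (the exact
// weights come from the target's generated pressure-set tables).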
template<typename _Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (_Iterator RegI = First; RegI != End; ++RegI) {
    Register Reg = *RegI;
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = MRI.getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
        SgprUsage += PSetI.getWeight();
    }
  }
}

void SIScheduleDAGMI::schedule() {
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  LLVM_DEBUG(dump());

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    const MachineOperand *BaseLatOp;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      bool OffsetIsScalable;
      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
                                         OffsetIsScalable, TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // If VGPR usage is extremely high, try other variants that also perform
  // well but could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
      // { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
      // { LatenciesGrouped, BlockRegUsageLatency },
      // { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      // { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      // { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other variants
  // that perform worse, but could lead to lower VGPR usage.
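  // (200 is presumably chosen because it leaves little slack below the
  // 256-VGPR hardware limit, so spilling becomes likely and trading schedule
  // quality for lower pressure is worth it.)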
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      // { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
      // { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
      // { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
       E = ScheduledSUnits.end(); I != E; ++I) {
    SUnit *SU = &SUnits[*I];

    scheduleMI(SU, true);

    LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                      << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  LLVM_DEBUG({
    dbgs() << "*** Final schedule for "
           << printMBBReference(*begin()->getParent()) << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}
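// Usage note: this file does not select the scheduler itself. A minimal
// sketch of how a target would hook it up, assuming the usual
// MachineSchedRegistry mechanism (the actual registration lives in the
// target's TargetMachine code; the names below are illustrative and should
// be checked against it):
//
//   static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
//     return new SIScheduleDAGMI(C);
//   }
//
//   static MachineSchedRegistry
//   SISchedRegistry("si-scheduler", "Run SI's custom scheduler",
//                   createSIMachineScheduler);
//
// Once registered under that name, the scheduler should be selectable via
// the generic -misched=<name> command line option.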