//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
// for all the SGPR load instructions before your current instruction to
// have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
// for all the VGPR load instructions previous to the VGPR load instruction
// you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover, some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler behave
// unpredictably. For example, when register pressure becomes high, it can
// either manage to prevent register pressure from going too high, or it can
// increase register pressure even more than if it hadn't taken register
// pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in VGPR you won't need until the end of the shader.
//
// The scheduling problem for SI can be divided into three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
// performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts if of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps control keeping register
// usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader.
// Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. It will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second, the order of the instructions inside the blocks is chosen.
// At that step we only take into account register usage and hiding
// low latency instructions.
//
// Third, the block order is chosen. There we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency for the cache (or global mem) data to get to the
// register, and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why, even if the wavefront count is
// high, we have to try to have as many instructions hiding high latencies
// as possible.
// The OpenCL doc says for example latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 byte
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work
// items, instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated with the 3 other
// VALUs of the CU do texture sampling too. (Don't take these figures too
// seriously, as I'm not 100% sure of the computation.)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not take that into account specially when
// scheduling, as we expect the driver to be able to preload the constants
// soon.
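//
// A worked example of the latency arithmetic above (a sketch based on the
// documented figures, not on measurements): a wavefront issues one
// instruction every 4 cycles, so a 400 cycle global memory access spans
// 400 / 4 = 100 instruction slots on a single wavefront. With 10 resident
// wavefronts the hardware interleaves them, so each wavefront only needs
// 400 / (4 * 10) = 10 independent instructions after the load to cover the
// latency, which is where the "hidden by 10 instructions" figure comes from.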

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:    return "NOCAND";
  case RegUsage:  return "REGUSAGE";
  case Latency:   return "LATENCY";
  case Successor: return "SUCCESSOR";
  case Depth:     return "DEPTH";
  case NodeOrder: return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

namespace llvm::SISched {
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
} // end namespace llvm::SISched

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as close to the top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  // - (possibly some more low latency instructions)
  // - instructions that depend on the first low latency instructions.
  // If the block contains a lot of constant loads, the SGPR usage
  // could go quite high; thus, above the arbitrary limit of 60, we
  // encourage using the already loaded constants (in order to release
  // some SGPRs) before loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
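  // (NodeNum reflects the original instruction order inside the region, so
  // preferring the lower NodeNum keeps the pick deterministic and preserves
  // the source order when no other heuristic applies.)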
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
    TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}


// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns whether Reg was defined between First and Last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the
  // SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (RegMaskPair.RegUnit.isVirtual())
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7.
  // Since we made the MIs of a block be packed all together before
  // scheduling, the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, and 2 vs 3.
  // (Note: This is not sufficient for the RPTracker to avoid mistakes for
  // case 4.)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
  // The use of isDefBetween removes case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    Register Reg = RegMaskPair.RegUnit;
    if (Reg.isVirtual() &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128-bit registers
  // for example, but they actually correspond to 4 actual 32-bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionalPressure.
  InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block
/// (InOrOutBlock == true) or outside it (InOrOutBlock == false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert(!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already a predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already a successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.emplace_back(Succ, Kind);

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure "
           << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
    dbgs() << "LiveOutPressure "
           << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : DAG(DAG) {}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  }
  return B->second;
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] =
        NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that is currently not
      // enough because the high latency order is
      // enforced there (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        // Check the type of dependency
        for (unsigned k : SubGraph) {
          // If in the path to join the two instructions,
          // there is another high latency instruction,
          // or instructions colored for another block,
          // abort the merge.
          if (DAG->IsHighLatencySU[k] || (CurrentColoring[k] != ProposedColor &&
                                          CurrentColoring[k] != 0)) {
            CompatibleGroup = false;
            break;
          }
          // If one of the SUs in the subgraph depends on the result of SU j,
          // there'll be a data dependency.
          if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
        }
        if (!CompatibleGroup)
          break;
        // Same check for the SU
        if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
          CompatibleGroup = false;
          break;
        }
        // Add all the required instructions to the block.
        // These cannot live in another block (because they
        // depend (order dependency) on one of the
        // instructions in the block, and are required for the
        // high latency instruction we add).
        llvm::append_range(AdditionalElements, SubGraph);
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Same color as children.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down
  // and bottom up reserved node dependency colorings
  // gets its own color.

  for (const SUnit &SU : DAG->SUnits) {
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU.NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU.NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU.NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*llvm::max_element(CurrentBottomUpReservedDependencyColoring) == 0 &&
      *llvm::max_element(CurrentTopDownReservedDependencyColoring) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum :
       DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, that the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register as used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // SU is an export instruction. Check whether one of its successor
      // dependencies is a non-export, in which case we skip export grouping.
      for (const SDep &SuccDep : SU.Succs) {
        const SUnit *SuccSU = SuccDep.getSUnit();
        if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) {
          // Ignore these dependencies.
          continue;
        }
        assert(SuccSU->isInstr() &&
               "SUnit unexpectedly not representing an instruction!");

        if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) {
          // A non-export depends on us. Skip export grouping.
          // Note that this is a bit pessimistic: We could still group all
          // other exports that are not depended on by non-exports, directly
          // or indirectly. Simply skipping this particular export but
          // grouping all others would not account for indirect dependencies.
          return;
        }
      }
      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links that a previous scheduling variant may have overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of the same color into the same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free roots and leaves of all blocks to enable scheduling inside them.
  for (SIScheduleBlock *Block : CurrentBlocks)
    Block->finalizeUnits();
  LLVM_DEBUG({
    dbgs() << "Blocks created:\n\n";
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

// Function taken from Codegen/MachineScheduler.cpp

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We do schedule a valid scheduling such that a Block corresponds
  // to a range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring the previous position,
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndex = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndex];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG({
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndex = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndex];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndex = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndex];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B consumes x and produces x'.
  // C consumes x' and y. The register coalescer may have assigned the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
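  // (A concrete reading of the warning above, assuming the coalescing case
  // it describes: if A (topological index 0) produces x, B (index 1)
  // consumes x and produces x' with x and x' coalesced into one vreg, and
  // C consumes x', then C's use of that vreg is credited to B, the producer
  // with the highest topological index, rather than to A.)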
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (SIScheduleBlock *Block : Blocks) {
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal.
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (SIScheduleBlock *Block : Blocks) {
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
                                            : BlocksScheduled) {
    dbgs() << ' ' << Block->getID();
  } dbgs() << '\n';);
}

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
    return true;
  return false;
}

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
             for (SIScheduleBlock *Block
                  : ReadyBlocks) dbgs()
             << Block->getID() << ' ';
             dbgs() << "\nCurrent Live:\n";
             for (unsigned Reg
                  : LiveRegs) dbgs()
             << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
             dbgs() << '\n';
             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
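    // (Likely rationale for the 120 cutoff, noted here as an assumption:
    // each GCN SIMD has 256 VGPRs, so a wavefront using more than 128 VGPRs
    // leaves room for only one wavefront per SIMD and latency hiding
    // degrades sharply; 120 keeps a small margin below that point.)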
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                        << getReasonStr(Cand.Reason) << '\n');
    }
  }

  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
             dbgs() << "Is a block with high latency instruction: "
                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
             dbgs() << "Position of last high latency dependency: "
                    << Cand.LastPosHighLatParentScheduled << '\n';
             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
             dbgs() << '\n';);

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}

// Tracking of currently alive registers to determine VGPR Usage.

void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
  for (Register Reg : Regs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    assert(Pos != LiveRegs.end() && // Reg must be live.
           LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
           LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}

void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (const auto &Block : Parent->getSuccs()) {
    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
      ReadyBlocks.push_back(Block.first);

    if (Parent->isHighLatencyBlock() &&
        Block.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
  }
}

void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) {
    // We produce this register, thus it must not be previously alive.
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
      (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (Register Reg : InRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (Register Reg : OutRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}

// SIScheduler //

SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (SIScheduleBlock *Block : ScheduledBlocks) {
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit *SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);
}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from ScheduleDAG.cpp: does a topological sort over the SUs,
// recording both the top-down and bottom-up orders.
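// For illustration (not taken from the sources): with SUs {0, 1, 2} and
// edges 0->1 and 0->2, one valid outcome is TopDownIndex2SU = {0, 1, 2}
// and BottomUpIndex2SU = {2, 1, 0}; the exact order depends on how the
// topological sort breaks ties.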
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}

// Move low latency instructions further from their users without increasing
// SGPR usage (in general).
// This is to be replaced by a better pass that would take SGPR usage into
// account (based on VGPR usage and the corresponding wavefront count), and
// that would try to merge groups of loads when it makes sense, etc.
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    for (SDep &PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u - 1]];
          ScheduledSUnits[u] = ScheduledSUnits[u - 1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also move the COPY instructions that the low latency
    // instructions depend on.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep &SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
          continue;
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u - 1]];
          ScheduledSUnits[u] = ScheduledSUnits[u - 1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}

void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the VGPR and SGPR usage corresponding to some virtual registers.
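// The counts are weighted by the target's pressure set tables; for instance
// a 128-bit VGPR tuple typically contributes 4 VGPR_32 units (an
// illustrative figure, not guaranteed for every register class).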
template <typename Iterator>
void SIScheduleDAGMI::fillVgprSgprCost(Iterator First, Iterator End,
                                       unsigned &VgprUsage,
                                       unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (Iterator RegI = First; RegI != End; ++RegI) {
    Register Reg = *RegI;
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = MRI.getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
        SgprUsage += PSetI.getWeight();
    }
  }
}

void SIScheduleDAGMI::schedule() {
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  postProcessDAG();

  LLVM_DEBUG(dump());
  if (PrintDAGs)
    dump();
  if (ViewMISchedDAGs)
    viewGraph();

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive functions, but to
  // make them happy we must initialize the default scheduler implementation
  // (even though we do not run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    const MachineOperand *BaseLatOp;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      bool OffsetIsScalable;
      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
                                         OffsetIsScalable, TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(
      SISchedulerBlockCreatorVariant::LatenciesAlone,
      SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // If VGPR usage is extremely high, try other well-performing variants
  // that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
            {LatenciesAlone, BlockRegUsageLatency},
            // {LatenciesAlone, BlockRegUsage},
            {LatenciesGrouped, BlockLatencyRegUsage},
            // {LatenciesGrouped, BlockRegUsageLatency},
            // {LatenciesGrouped, BlockRegUsage},
            {LatenciesAlonePlusConsecutive, BlockLatencyRegUsage},
            // {LatenciesAlonePlusConsecutive, BlockRegUsageLatency},
            // {LatenciesAlonePlusConsecutive, BlockRegUsage}
        };
    for (const auto &V : Variants) {
      Temp = Scheduler.scheduleVariant(V.first, V.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other variants
  // that perform worse but could lead to lower VGPR usage.
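  // Beyond 200 VGPRs only a single wavefront fits in the 256-register file
  // of a SIMD, and crossing 256 forces spills, so the cutoff below leaves a
  // heuristic safety margin.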
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
            // {LatenciesAlone, BlockRegUsageLatency},
            {LatenciesAlone, BlockRegUsage},
            // {LatenciesGrouped, BlockLatencyRegUsage},
            {LatenciesGrouped, BlockRegUsageLatency},
            {LatenciesGrouped, BlockRegUsage},
            // {LatenciesAlonePlusConsecutive, BlockLatencyRegUsage},
            {LatenciesAlonePlusConsecutive, BlockRegUsageLatency},
            {LatenciesAlonePlusConsecutive, BlockRegUsage}
        };
    for (const auto &V : Variants) {
      Temp = Scheduler.scheduleVariant(V.first, V.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (unsigned I : ScheduledSUnits) {
    SUnit *SU = &SUnits[I];

    scheduleMI(SU, true);

    LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                      << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  LLVM_DEBUG({
    dbgs() << "*** Final schedule for "
           << printMBBReference(*begin()->getParent()) << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}