1 //===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI Machine Scheduler interface 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H 15 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H 16 17 #include "llvm/CodeGen/MachineScheduler.h" 18 #include "llvm/CodeGen/RegisterPressure.h" 19 #include "llvm/CodeGen/ScheduleDAG.h" 20 #include <cstdint> 21 #include <set> 22 #include <vector> 23 24 namespace llvm { 25 26 class SIInstrInfo; 27 class SIRegisterInfo; 28 class SIScheduleDAGMI; 29 class SIScheduleBlockCreator; 30 31 enum SIScheduleCandReason { 32 NoCand, 33 RegUsage, 34 Latency, 35 Successor, 36 Depth, 37 NodeOrder 38 }; 39 40 struct SISchedulerCandidate { 41 // The reason for this candidate. 42 SIScheduleCandReason Reason = NoCand; 43 44 // Set of reasons that apply to multiple candidates. 45 uint32_t RepeatReasonSet = 0; 46 47 SISchedulerCandidate() = default; 48 49 bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); } 50 void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); } 51 }; 52 53 enum SIScheduleBlockLinkKind { 54 NoData, 55 Data 56 }; 57 58 class SIScheduleBlock { 59 SIScheduleDAGMI *DAG; 60 SIScheduleBlockCreator *BC; 61 62 std::vector<SUnit*> SUnits; 63 std::map<unsigned, unsigned> NodeNum2Index; 64 std::vector<SUnit*> TopReadySUs; 65 std::vector<SUnit*> ScheduledSUnits; 66 67 /// The top of the unscheduled zone. 68 IntervalPressure TopPressure; 69 RegPressureTracker TopRPTracker; 70 71 // Pressure: number of said class of registers needed to 72 // store the live virtual and real registers. 73 // We do care only of SGPR32 and VGPR32 and do track only virtual registers. 74 // Pressure of additional registers required inside the block. 75 std::vector<unsigned> InternalAdditionalPressure; 76 // Pressure of input and output registers 77 std::vector<unsigned> LiveInPressure; 78 std::vector<unsigned> LiveOutPressure; 79 // Registers required by the block, and outputs. 80 // We do track only virtual registers. 81 // Note that some registers are not 32 bits, 82 // and thus the pressure is not equal 83 // to the number of live registers. 84 std::set<unsigned> LiveInRegs; 85 std::set<unsigned> LiveOutRegs; 86 87 bool Scheduled = false; 88 bool HighLatencyBlock = false; 89 90 std::vector<unsigned> HasLowLatencyNonWaitedParent; 91 92 // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table. 93 unsigned ID; 94 95 std::vector<SIScheduleBlock*> Preds; // All blocks predecessors. 96 // All blocks successors, and the kind of link 97 std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs; 98 unsigned NumHighLatencySuccessors = 0; 99 100 public: 101 SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC, 102 unsigned ID): 103 DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {} 104 105 ~SIScheduleBlock() = default; 106 107 unsigned getID() const { return ID; } 108 109 /// Functions for Block construction. 110 void addUnit(SUnit *SU); 111 112 // When all SUs have been added. 113 void finalizeUnits(); 114 115 // Add block pred, which has instruction predecessor of SU. 116 void addPred(SIScheduleBlock *Pred); 117 void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind); 118 119 const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; } 120 ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> 121 getSuccs() const { return Succs; } 122 123 unsigned Height; // Maximum topdown path length to block without outputs 124 unsigned Depth; // Maximum bottomup path length to block without inputs 125 126 unsigned getNumHighLatencySuccessors() const { 127 return NumHighLatencySuccessors; 128 } 129 130 bool isHighLatencyBlock() { return HighLatencyBlock; } 131 132 // This is approximative. 133 // Ideally should take into accounts some instructions (rcp, etc) 134 // are 4 times slower. 135 int getCost() { return SUnits.size(); } 136 137 // The block Predecessors and Successors must be all registered 138 // before fastSchedule(). 139 // Fast schedule with no particular requirement. 140 void fastSchedule(); 141 142 std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; } 143 144 // Complete schedule that will try to minimize reg pressure and 145 // low latencies, and will fill liveins and liveouts. 146 // Needs all MIs to be grouped between BeginBlock and EndBlock. 147 // The MIs can be moved after the scheduling, 148 // it is just used to allow correct track of live registers. 149 void schedule(MachineBasicBlock::iterator BeginBlock, 150 MachineBasicBlock::iterator EndBlock); 151 152 bool isScheduled() { return Scheduled; } 153 154 // Needs the block to be scheduled inside 155 // TODO: find a way to compute it. 156 std::vector<unsigned> &getInternalAdditionalRegUsage() { 157 return InternalAdditionalPressure; 158 } 159 160 std::set<unsigned> &getInRegs() { return LiveInRegs; } 161 std::set<unsigned> &getOutRegs() { return LiveOutRegs; } 162 163 void printDebug(bool Full); 164 165 private: 166 struct SISchedCandidate : SISchedulerCandidate { 167 // The best SUnit candidate. 168 SUnit *SU = nullptr; 169 170 unsigned SGPRUsage; 171 unsigned VGPRUsage; 172 bool IsLowLatency; 173 unsigned LowLatencyOffset; 174 bool HasLowLatencyNonWaitedParent; 175 176 SISchedCandidate() = default; 177 178 bool isValid() const { return SU; } 179 180 // Copy the status of another candidate without changing policy. 181 void setBest(SISchedCandidate &Best) { 182 assert(Best.Reason != NoCand && "uninitialized Sched candidate"); 183 SU = Best.SU; 184 Reason = Best.Reason; 185 SGPRUsage = Best.SGPRUsage; 186 VGPRUsage = Best.VGPRUsage; 187 IsLowLatency = Best.IsLowLatency; 188 LowLatencyOffset = Best.LowLatencyOffset; 189 HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent; 190 } 191 }; 192 193 void undoSchedule(); 194 195 void undoReleaseSucc(SUnit *SU, SDep *SuccEdge); 196 void releaseSucc(SUnit *SU, SDep *SuccEdge); 197 // InOrOutBlock: restrict to links pointing inside the block (true), 198 // or restrict to links pointing outside the block (false). 199 void releaseSuccessors(SUnit *SU, bool InOrOutBlock); 200 201 void nodeScheduled(SUnit *SU); 202 void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand); 203 void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand); 204 SUnit* pickNode(); 205 void traceCandidate(const SISchedCandidate &Cand); 206 void initRegPressure(MachineBasicBlock::iterator BeginBlock, 207 MachineBasicBlock::iterator EndBlock); 208 }; 209 210 struct SIScheduleBlocks { 211 std::vector<SIScheduleBlock*> Blocks; 212 std::vector<int> TopDownIndex2Block; 213 std::vector<int> TopDownBlock2Index; 214 }; 215 216 enum SISchedulerBlockCreatorVariant { 217 LatenciesAlone, 218 LatenciesGrouped, 219 LatenciesAlonePlusConsecutive 220 }; 221 222 class SIScheduleBlockCreator { 223 SIScheduleDAGMI *DAG; 224 // unique_ptr handles freeing memory for us. 225 std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs; 226 std::map<SISchedulerBlockCreatorVariant, 227 SIScheduleBlocks> Blocks; 228 std::vector<SIScheduleBlock*> CurrentBlocks; 229 std::vector<int> Node2CurrentBlock; 230 231 // Topological sort 232 // Maps topological index to the node number. 233 std::vector<int> TopDownIndex2Block; 234 std::vector<int> TopDownBlock2Index; 235 std::vector<int> BottomUpIndex2Block; 236 237 // 0 -> Color not given. 238 // 1 to SUnits.size() -> Reserved group (you should only add elements to them). 239 // Above -> Other groups. 240 int NextReservedID; 241 int NextNonReservedID; 242 std::vector<int> CurrentColoring; 243 std::vector<int> CurrentTopDownReservedDependencyColoring; 244 std::vector<int> CurrentBottomUpReservedDependencyColoring; 245 246 public: 247 SIScheduleBlockCreator(SIScheduleDAGMI *DAG); 248 249 SIScheduleBlocks 250 getBlocks(SISchedulerBlockCreatorVariant BlockVariant); 251 252 bool isSUInBlock(SUnit *SU, unsigned ID); 253 254 private: 255 // Give a Reserved color to every high latency. 256 void colorHighLatenciesAlone(); 257 258 // Create groups of high latencies with a Reserved color. 259 void colorHighLatenciesGroups(); 260 261 // Compute coloring for topdown and bottom traversals with 262 // different colors depending on dependencies on Reserved colors. 263 void colorComputeReservedDependencies(); 264 265 // Give color to all non-colored SUs according to Reserved groups dependencies. 266 void colorAccordingToReservedDependencies(); 267 268 // Divides Blocks having no bottom up or top down dependencies on Reserved groups. 269 // The new colors are computed according to the dependencies on the other blocks 270 // formed with colorAccordingToReservedDependencies. 271 void colorEndsAccordingToDependencies(); 272 273 // Cut groups into groups with SUs in consecutive order (except for Reserved groups). 274 void colorForceConsecutiveOrderInGroup(); 275 276 // Merge Constant loads that have all their users into another group to the group. 277 // (TODO: else if all their users depend on the same group, put them there) 278 void colorMergeConstantLoadsNextGroup(); 279 280 // Merge SUs that have all their users into another group to the group 281 void colorMergeIfPossibleNextGroup(); 282 283 // Merge SUs that have all their users into another group to the group, 284 // but only for Reserved groups. 285 void colorMergeIfPossibleNextGroupOnlyForReserved(); 286 287 // Merge SUs that have all their users into another group to the group, 288 // but only if the group is no more than a few SUs. 289 void colorMergeIfPossibleSmallGroupsToNextGroup(); 290 291 // Divides Blocks with important size. 292 // Idea of implementation: attribute new colors depending on topdown and 293 // bottom up links to other blocks. 294 void cutHugeBlocks(); 295 296 // Put in one group all instructions with no users in this scheduling region 297 // (we'd want these groups be at the end). 298 void regroupNoUserInstructions(); 299 300 // Give Reserved color to export instructions 301 void colorExports(); 302 303 void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant); 304 305 void topologicalSort(); 306 307 void scheduleInsideBlocks(); 308 309 void fillStats(); 310 }; 311 312 enum SISchedulerBlockSchedulerVariant { 313 BlockLatencyRegUsage, 314 BlockRegUsageLatency, 315 BlockRegUsage 316 }; 317 318 class SIScheduleBlockScheduler { 319 SIScheduleDAGMI *DAG; 320 SISchedulerBlockSchedulerVariant Variant; 321 std::vector<SIScheduleBlock*> Blocks; 322 323 std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages; 324 std::set<unsigned> LiveRegs; 325 // Num of schedulable unscheduled blocks reading the register. 326 std::map<unsigned, unsigned> LiveRegsConsumers; 327 328 std::vector<unsigned> LastPosHighLatencyParentScheduled; 329 int LastPosWaitedHighLatency; 330 331 std::vector<SIScheduleBlock*> BlocksScheduled; 332 unsigned NumBlockScheduled; 333 std::vector<SIScheduleBlock*> ReadyBlocks; 334 335 unsigned VregCurrentUsage; 336 unsigned SregCurrentUsage; 337 338 // Currently is only approximation. 339 unsigned maxVregUsage; 340 unsigned maxSregUsage; 341 342 std::vector<unsigned> BlockNumPredsLeft; 343 std::vector<unsigned> BlockNumSuccsLeft; 344 345 public: 346 SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, 347 SISchedulerBlockSchedulerVariant Variant, 348 SIScheduleBlocks BlocksStruct); 349 ~SIScheduleBlockScheduler() = default; 350 351 std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; } 352 353 unsigned getVGPRUsage() { return maxVregUsage; } 354 unsigned getSGPRUsage() { return maxSregUsage; } 355 356 private: 357 struct SIBlockSchedCandidate : SISchedulerCandidate { 358 // The best Block candidate. 359 SIScheduleBlock *Block = nullptr; 360 361 bool IsHighLatency; 362 int VGPRUsageDiff; 363 unsigned NumSuccessors; 364 unsigned NumHighLatencySuccessors; 365 unsigned LastPosHighLatParentScheduled; 366 unsigned Height; 367 368 SIBlockSchedCandidate() = default; 369 370 bool isValid() const { return Block; } 371 372 // Copy the status of another candidate without changing policy. 373 void setBest(SIBlockSchedCandidate &Best) { 374 assert(Best.Reason != NoCand && "uninitialized Sched candidate"); 375 Block = Best.Block; 376 Reason = Best.Reason; 377 IsHighLatency = Best.IsHighLatency; 378 VGPRUsageDiff = Best.VGPRUsageDiff; 379 NumSuccessors = Best.NumSuccessors; 380 NumHighLatencySuccessors = Best.NumHighLatencySuccessors; 381 LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled; 382 Height = Best.Height; 383 } 384 }; 385 386 bool tryCandidateLatency(SIBlockSchedCandidate &Cand, 387 SIBlockSchedCandidate &TryCand); 388 bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand, 389 SIBlockSchedCandidate &TryCand); 390 SIScheduleBlock *pickBlock(); 391 392 void addLiveRegs(std::set<unsigned> &Regs); 393 void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs); 394 void releaseBlockSuccs(SIScheduleBlock *Parent); 395 void blockScheduled(SIScheduleBlock *Block); 396 397 // Check register pressure change 398 // by scheduling a block with these LiveIn and LiveOut. 399 std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs, 400 std::set<unsigned> &OutRegs); 401 402 void schedule(); 403 }; 404 405 struct SIScheduleBlockResult { 406 std::vector<unsigned> SUs; 407 unsigned MaxSGPRUsage; 408 unsigned MaxVGPRUsage; 409 }; 410 411 class SIScheduler { 412 SIScheduleDAGMI *DAG; 413 SIScheduleBlockCreator BlockCreator; 414 415 public: 416 SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {} 417 418 ~SIScheduler() = default; 419 420 struct SIScheduleBlockResult 421 scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, 422 SISchedulerBlockSchedulerVariant ScheduleVariant); 423 }; 424 425 class SIScheduleDAGMI final : public ScheduleDAGMILive { 426 const SIInstrInfo *SITII; 427 const SIRegisterInfo *SITRI; 428 429 std::vector<SUnit> SUnitsLinksBackup; 430 431 // For moveLowLatencies. After all Scheduling variants are tested. 432 std::vector<unsigned> ScheduledSUnits; 433 std::vector<unsigned> ScheduledSUnitsInv; 434 435 public: 436 SIScheduleDAGMI(MachineSchedContext *C); 437 438 ~SIScheduleDAGMI() override; 439 440 // Entry point for the schedule. 441 void schedule() override; 442 443 // To init Block's RPTracker. 444 void initRPTracker(RegPressureTracker &RPTracker) { 445 RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false); 446 } 447 448 MachineBasicBlock *getBB() { return BB; } 449 MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; } 450 MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; } 451 LiveIntervals *getLIS() { return LIS; } 452 MachineRegisterInfo *getMRI() { return &MRI; } 453 const TargetRegisterInfo *getTRI() { return TRI; } 454 ScheduleDAGTopologicalSort *GetTopo() { return &Topo; } 455 SUnit &getEntrySU() { return EntrySU; } 456 SUnit& getExitSU() { return ExitSU; } 457 458 void restoreSULinksLeft(); 459 460 template<typename _Iterator> void fillVgprSgprCost(_Iterator First, 461 _Iterator End, 462 unsigned &VgprUsage, 463 unsigned &SgprUsage); 464 465 std::set<unsigned> getInRegs() { 466 std::set<unsigned> InRegs; 467 for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { 468 InRegs.insert(RegMaskPair.RegUnit); 469 } 470 return InRegs; 471 } 472 473 std::set<unsigned> getOutRegs() { 474 std::set<unsigned> OutRegs; 475 for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { 476 OutRegs.insert(RegMaskPair.RegUnit); 477 } 478 return OutRegs; 479 }; 480 481 private: 482 void topologicalSort(); 483 // After scheduling is done, improve low latency placements. 484 void moveLowLatencies(); 485 486 public: 487 // Some stats for scheduling inside blocks. 488 std::vector<unsigned> IsLowLatencySU; 489 std::vector<unsigned> LowLatencyOffset; 490 std::vector<unsigned> IsHighLatencySU; 491 // Topological sort 492 // Maps topological index to the node number. 493 std::vector<int> TopDownIndex2SU; 494 std::vector<int> BottomUpIndex2SU; 495 }; 496 497 } // end namespace llvm 498 499 #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H 500