//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file This file defines a set of schedule DAG mutations that can be used to
// override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimum
// instruction scheduling is important enough to warrant manual intervention.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUIGroupLP.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

#define DEBUG_TYPE "igrouplp"

namespace {

static cl::opt<bool> EnableExactSolver(
    "amdgpu-igrouplp-exact-solver", cl::Hidden,
    cl::desc("Whether to use the exponential time solver to fit "
             "the instructions to the pipeline as closely as "
             "possible."),
    cl::init(false));

static cl::opt<unsigned> CutoffForExact(
    "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
    cl::desc("The maximum number of scheduling group conflicts "
             "which we attempt to solve with the exponential time "
             "exact solver. Problem sizes greater than this will "
             "be solved by the less accurate greedy algorithm. Selecting "
             "solver by size is superseded by manually selecting "
             "the solver (e.g. by amdgpu-igrouplp-exact-solver)."));

static cl::opt<uint64_t> MaxBranchesExplored(
    "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
    cl::desc("The number of branches that we are willing to explore with "
             "the exact algorithm before giving up."));

static cl::opt<bool> UseCostHeur(
    "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
    cl::desc("Whether to use the cost heuristic to make choices as we "
             "traverse the search space using the exact solver. Defaulted "
             "to on, and if turned off, we will use the node order -- "
             "attempting to put the later nodes in the later sched groups. "
             "Experimentally, results are mixed, so this should be set on a "
             "case-by-case basis."));

// Components of the mask that determines which instruction types may be
// classified into a SchedGroup.
enum class SchedGroupMask {
  NONE = 0u,
  ALU = 1u << 0,
  VALU = 1u << 1,
  SALU = 1u << 2,
  MFMA = 1u << 3,
  VMEM = 1u << 4,
  VMEM_READ = 1u << 5,
  VMEM_WRITE = 1u << 6,
  DS = 1u << 7,
  DS_READ = 1u << 8,
  DS_WRITE = 1u << 9,
  TRANS = 1u << 10,
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE | TRANS,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
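
// For reference, kernels typically request these groups through the scheduling
// intrinsics that use the same mask encoding, e.g.
//   __builtin_amdgcn_sched_group_barrier(/*Mask=*/0x8, /*Size=*/1, /*SyncID=*/0)
// asks for a group of one MFMA in sync group 0 (0x8 == SchedGroupMask::MFMA).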

class SchedGroup;

// InstructionRule class is used to enact a filter which determines whether or
// not an SU maps to a given SchedGroup. It contains complementary data
// structures (e.g. Cache) to help those filters.
class InstructionRule {
protected:
  const SIInstrInfo *TII;
  unsigned SGID;
  // A cache made available to the Filter to store SUnits for subsequent
  // invocations of the Filter
  std::optional<SmallVector<SUnit *, 4>> Cache;

public:
  virtual bool
  apply(const SUnit *, const ArrayRef<SUnit *>,
        SmallVectorImpl<SchedGroup> &) {
    return true;
  };

  InstructionRule(const SIInstrInfo *TII, unsigned SGID,
                  bool NeedsCache = false)
      : TII(TII), SGID(SGID) {
    if (NeedsCache) {
      Cache = SmallVector<SUnit *, 4>();
    }
  }

  virtual ~InstructionRule() = default;
};

using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>;

// Classify instructions into groups to enable fine-tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Mask that defines which instruction types can be classified into this
  // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
  // and SCHED_GROUP_BARRIER.
  SchedGroupMask SGMask;

  // Maximum number of SUnits that can be added to this group.
  std::optional<unsigned> MaxSize;

  // SchedGroups will only synchronize with other SchedGroups that have the same
  // SyncID.
  int SyncID = 0;

  // SGID is used to map instructions to candidate SchedGroups
  unsigned SGID;

  // The different rules each instruction in this SchedGroup must conform to
  SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;

  // Count of the number of created SchedGroups, used to initialize SGID.
  static unsigned NumSchedGroups;

  // Try to add an edge from SU A to SU B.
  bool tryAddEdge(SUnit *A, SUnit *B);

  // Use SGMask to determine whether we can classify MI as a member of this
  // SchedGroup object.
  bool canAddMI(const MachineInstr &MI) const;

public:
  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  ScheduleDAGInstrs *DAG;
  const SIInstrInfo *TII;

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU) const;

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
  // MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false);

  // Add DAG dependencies and track which edges are added, and the count of
  // missed edges
  int link(SUnit &SU, bool MakePred,
           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
  // Use the predicate to determine whether SU should be a predecessor (P =
  // true) or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup);

  // Returns true if no more instructions may be added to this group.
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

  // Append a constraint that SUs must meet in order to fit into this
  // SchedGroup. Since many rules involve the relationship between a SchedGroup
  // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
  // time (rather than SchedGroup init time.)
  void addRule(std::shared_ptr<InstructionRule> NewRule) {
    Rules.push_back(NewRule);
  }

  // Returns true if the SU matches all rules
  bool allowedByRules(const SUnit *SU,
                      SmallVectorImpl<SchedGroup> &SyncPipe) const {
    for (auto &Rule : Rules) {
      if (!Rule.get()->apply(SU, Collection, SyncPipe))
        return false;
    }
    return true;
  }

  // Add SU to the SchedGroup.
  void add(SUnit &SU) {
    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
                      << format_hex((int)SGMask, 10, true) << " adding "
                      << *SU.getInstr());
    Collection.push_back(&SU);
  }

  // Remove last element in the SchedGroup
  void pop() { Collection.pop_back(); }

  // Identify and add all relevant SUs from the DAG to this SchedGroup.
  void initSchedGroup();

  // Add instructions to the SchedGroup bottom up starting from RIter.
  // PipelineInstrs is a set of instructions that should not be added to the
  // SchedGroup even when the other conditions for adding it are satisfied.
  // RIter will be added to the SchedGroup as well, and dependencies will be
  // added so that RIter will always be scheduled at the end of the group.
  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                      SUnitsToCandidateSGsMap &SyncedInstrs);

  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);

  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }
};

// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
         SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
         SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);

  while (!SU.Preds.empty())
    for (auto &P : SU.Preds)
      SU.removePred(P);

  while (!SU.Succs.empty())
    for (auto &S : SU.Succs)
      for (auto &SP : S.getSUnit()->Preds)
        if (SP.getSUnit() == &SU)
          S.getSUnit()->removePred(SP);
}

using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;

// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
// in non-trivial cases. For example, if the requested pipeline is
// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
// in the DAG, then we will have an instruction that can not be trivially
// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
// to find a good solution to the pipeline -- a greedy algorithm and an exact
// algorithm. The exact algorithm has an exponential time complexity and should
// only be used for small problems, or for medium sized problems where an
// exact solution is highly desired.
class PipelineSolver {
  ScheduleDAGMI *DAG;

  // Instructions that can be assigned to multiple SchedGroups
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
  SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
  // The current working pipeline
  SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
  // The pipeline that has the best solution found so far
  SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;

  // Whether or not we actually have any SyncedInstrs to try to solve.
  bool NeedsSolver = false;

  // Compute an estimate of the size of the search tree -- the true size is
  // the product of each conflictedInst.Matches.size() across all SyncPipelines
  unsigned computeProblemSize();

  // The cost penalty of not assigning a SU to a SchedGroup
  int MissPenalty = 0;

  // Costs in terms of the number of edges we are unable to add
  int BestCost = -1;
  int CurrCost = 0;

  // Index pointing to the conflicting instruction that is currently being
  // fitted
  int CurrConflInstNo = 0;
  // Index to the pipeline that is currently being fitted
  int CurrSyncGroupIdx = 0;
  // The first non-trivial pipeline
  int BeginSyncGroupIdx = 0;

  // How many branches we have explored
  uint64_t BranchesExplored = 0;

  // The direction in which we process the candidate SchedGroups per SU
  bool IsBottomUp = true;

  // Update indices to fit next conflicting instruction
  void advancePosition();
  // Recede indices to attempt to find better fit for previous conflicting
  // instruction
  void retreatPosition();

  // The exponential time algorithm which finds the provably best fit
  bool solveExact();
  // The polynomial time algorithm which attempts to find a good fit
  bool solveGreedy();
  // Find the best SchedGroup for the current SU using the heuristic given all
  // current information. One step in the greedy algorithm. Templated against
  // the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
                  T E);
  // Whether or not the current solution is optimal
  bool checkOptimal();
  // Populate the ready list, prioritizing fewest missed edges first.
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
                         T E);
  // Add edges corresponding to the SchedGroups as assigned by solver
  void makePipeline();
  // Link the SchedGroups in the best found pipeline.
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T> void linkSchedGroups(T I, T E);
  // Add the edges from the SU to the other SchedGroups in pipeline, and
  // return the number of edges missed.
  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID.
It 345 /// returns the cost (in terms of missed pipeline edges), and tracks the edges 346 /// added in \p AddedEdges 347 template <typename T> 348 int linkSUnit(SUnit *SU, int SGID, 349 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E); 350 /// Remove the edges passed via \p AddedEdges 351 void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); 352 // Convert the passed in maps to arrays for bidirectional iterators 353 void convertSyncMapsToArrays(); 354 355 void reset(); 356 357 public: 358 // Invoke the solver to map instructions to instruction groups. Heuristic && 359 // command-line-option determines to use exact or greedy algorithm. 360 void solve(); 361 362 PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 363 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 364 ScheduleDAGMI *DAG, bool IsBottomUp = true) 365 : DAG(DAG), SyncedInstrs(SyncedInstrs), 366 SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) { 367 368 for (auto &PipelineInstrs : SyncedInstrs) { 369 if (PipelineInstrs.second.size() > 0) { 370 NeedsSolver = true; 371 break; 372 } 373 } 374 375 if (!NeedsSolver) 376 return; 377 378 convertSyncMapsToArrays(); 379 380 CurrPipeline = BestPipeline; 381 382 while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() && 383 PipelineInstrs[BeginSyncGroupIdx].size() == 0) 384 ++BeginSyncGroupIdx; 385 386 if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size()) 387 return; 388 } 389 }; 390 391 void PipelineSolver::reset() { 392 393 for (auto &SyncPipeline : CurrPipeline) { 394 for (auto &SG : SyncPipeline) { 395 SmallVector<SUnit *, 32> TempCollection = SG.Collection; 396 SG.Collection.clear(); 397 auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) { 398 return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER; 399 }); 400 if (SchedBarr != TempCollection.end()) 401 SG.Collection.push_back(*SchedBarr); 402 } 403 } 404 405 CurrSyncGroupIdx = BeginSyncGroupIdx; 406 CurrConflInstNo = 0; 407 CurrCost = 0; 408 } 409 410 void PipelineSolver::convertSyncMapsToArrays() { 411 for (auto &SyncPipe : SyncedSchedGroups) { 412 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second); 413 } 414 415 int PipelineIDx = SyncedInstrs.size() - 1; 416 PipelineInstrs.resize(SyncedInstrs.size()); 417 for (auto &SyncInstrMap : SyncedInstrs) { 418 for (auto &SUsToCandSGs : SyncInstrMap.second) { 419 if (PipelineInstrs[PipelineIDx].size() == 0) { 420 PipelineInstrs[PipelineIDx].push_back( 421 std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); 422 continue; 423 } 424 auto SortPosition = PipelineInstrs[PipelineIDx].begin(); 425 // Insert them in sorted order -- this allows for good parsing order in 426 // the greedy algorithm 427 while (SortPosition != PipelineInstrs[PipelineIDx].end() && 428 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum) 429 ++SortPosition; 430 PipelineInstrs[PipelineIDx].insert( 431 SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); 432 } 433 --PipelineIDx; 434 } 435 } 436 437 template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) { 438 for (; I != E; ++I) { 439 auto &GroupA = *I; 440 for (auto J = std::next(I); J != E; ++J) { 441 auto &GroupB = *J; 442 GroupA.link(GroupB); 443 } 444 } 445 } 446 447 void PipelineSolver::makePipeline() { 448 // Preserve the order of barrier for subsequent SchedGroupBarrier mutations 449 for (auto &SyncPipeline : BestPipeline) { 450 LLVM_DEBUG(dbgs() << "Printing SchedGroups\n"); 451 for (auto &SG : 
SyncPipeline) { 452 LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID() 453 << " has: \n"); 454 SUnit *SGBarr = nullptr; 455 for (auto &SU : SG.Collection) { 456 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 457 SGBarr = SU; 458 LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n"); 459 } 460 // Command line requested IGroupLP doesn't have SGBarr 461 if (!SGBarr) 462 continue; 463 resetEdges(*SGBarr, DAG); 464 SG.link(*SGBarr, false); 465 } 466 } 467 468 for (auto &SyncPipeline : BestPipeline) { 469 IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend()) 470 : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end()); 471 } 472 } 473 474 template <typename T> 475 int PipelineSolver::linkSUnit( 476 SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, 477 T I, T E) { 478 bool MakePred = false; 479 int AddedCost = 0; 480 for (; I < E; ++I) { 481 if (I->getSGID() == SGID) { 482 MakePred = true; 483 continue; 484 } 485 auto Group = *I; 486 AddedCost += Group.link(*SU, MakePred, AddedEdges); 487 assert(AddedCost >= 0); 488 } 489 return AddedCost; 490 } 491 492 int PipelineSolver::addEdges( 493 SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, 494 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { 495 496 // For IsBottomUp, the first SchedGroup in SyncPipeline contains the 497 // instructions that are the ultimate successors in the resultant mutation. 498 // Therefore, in such a configuration, the SchedGroups occurring before the 499 // candidate SGID are successors of the candidate SchedGroup, thus the current 500 // SU should be linked as a predecessor to SUs in those SchedGroups. The 501 // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple 502 // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using 503 // IsBottomUp (in reverse). 504 return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(), 505 SyncPipeline.rend()) 506 : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(), 507 SyncPipeline.end()); 508 } 509 510 void PipelineSolver::removeEdges( 511 const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) { 512 // Only remove the edges that we have added when testing 513 // the fit. 
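  // Each entry is a (Pred, Succ) pair recorded while the fit was being tested;
  // only the matching artificial edge is erased, so the original DAG
  // dependencies are preserved.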
514 for (auto &PredSuccPair : EdgesToRemove) { 515 SUnit *Pred = PredSuccPair.first; 516 SUnit *Succ = PredSuccPair.second; 517 518 auto Match = llvm::find_if( 519 Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; }); 520 if (Match != Succ->Preds.end()) { 521 assert(Match->isArtificial()); 522 Succ->removePred(*Match); 523 } 524 } 525 } 526 527 void PipelineSolver::advancePosition() { 528 ++CurrConflInstNo; 529 530 if (static_cast<size_t>(CurrConflInstNo) >= 531 PipelineInstrs[CurrSyncGroupIdx].size()) { 532 CurrConflInstNo = 0; 533 ++CurrSyncGroupIdx; 534 // Advance to next non-trivial pipeline 535 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() && 536 PipelineInstrs[CurrSyncGroupIdx].size() == 0) 537 ++CurrSyncGroupIdx; 538 } 539 } 540 541 void PipelineSolver::retreatPosition() { 542 assert(CurrConflInstNo >= 0); 543 assert(CurrSyncGroupIdx >= 0); 544 545 if (CurrConflInstNo > 0) { 546 --CurrConflInstNo; 547 return; 548 } 549 550 if (CurrConflInstNo == 0) { 551 // If we return to the starting position, we have explored 552 // the entire tree 553 if (CurrSyncGroupIdx == BeginSyncGroupIdx) 554 return; 555 556 --CurrSyncGroupIdx; 557 // Go to previous non-trivial pipeline 558 while (PipelineInstrs[CurrSyncGroupIdx].size() == 0) 559 --CurrSyncGroupIdx; 560 561 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1; 562 } 563 } 564 565 bool PipelineSolver::checkOptimal() { 566 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) { 567 if (BestCost == -1 || CurrCost < BestCost) { 568 BestPipeline = CurrPipeline; 569 BestCost = CurrCost; 570 LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n"); 571 } 572 assert(BestCost >= 0); 573 } 574 575 bool DoneExploring = false; 576 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored) 577 DoneExploring = true; 578 579 return (DoneExploring || BestCost == 0); 580 } 581 582 template <typename T> 583 void PipelineSolver::populateReadyList( 584 SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) { 585 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 586 auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 587 assert(CurrSU.second.size() >= 1); 588 589 for (; I != E; ++I) { 590 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 591 int CandSGID = *I; 592 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { 593 return SG.getSGID() == CandSGID; 594 }); 595 assert(Match); 596 597 if (UseCostHeur) { 598 if (Match->isFull()) { 599 ReadyList.push_back(std::pair(*I, MissPenalty)); 600 continue; 601 } 602 603 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 604 ReadyList.push_back(std::pair(*I, TempCost)); 605 removeEdges(AddedEdges); 606 } else 607 ReadyList.push_back(std::pair(*I, -1)); 608 } 609 610 if (UseCostHeur) { 611 std::sort(ReadyList.begin(), ReadyList.end(), 612 [](std::pair<int, int> A, std::pair<int, int> B) { 613 return A.second < B.second; 614 }); 615 } 616 617 assert(ReadyList.size() == CurrSU.second.size()); 618 } 619 620 bool PipelineSolver::solveExact() { 621 if (checkOptimal()) 622 return true; 623 624 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) 625 return false; 626 627 assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()); 628 assert(static_cast<size_t>(CurrConflInstNo) < 629 PipelineInstrs[CurrSyncGroupIdx].size()); 630 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 631 LLVM_DEBUG(dbgs() << "Fitting SU(" << 
CurrSU.first->NodeNum 632 << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); 633 634 // SchedGroup -> Cost pairs 635 SmallVector<std::pair<int, int>, 4> ReadyList; 636 // Prioritize the candidate sched groups in terms of lowest cost first 637 IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(), 638 CurrSU.second.rend()) 639 : populateReadyList(ReadyList, CurrSU.second.begin(), 640 CurrSU.second.end()); 641 642 auto I = ReadyList.begin(); 643 auto E = ReadyList.end(); 644 for (; I != E; ++I) { 645 // If we are trying SGs in least cost order, and the current SG is cost 646 // infeasible, then all subsequent SGs will also be cost infeasible, so we 647 // can prune. 648 if (BestCost != -1 && (CurrCost + I->second > BestCost)) 649 return false; 650 651 int CandSGID = I->first; 652 int AddedCost = 0; 653 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 654 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 655 SchedGroup *Match; 656 for (auto &SG : SyncPipeline) { 657 if (SG.getSGID() == CandSGID) 658 Match = &SG; 659 } 660 661 if (Match->isFull()) 662 continue; 663 664 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) 665 continue; 666 667 LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask " 668 << (int)Match->getMask() << "and ID " << CandSGID 669 << "\n"); 670 Match->add(*CurrSU.first); 671 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 672 LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n"); 673 CurrCost += AddedCost; 674 advancePosition(); 675 ++BranchesExplored; 676 bool FinishedExploring = false; 677 // If the Cost after adding edges is greater than a known solution, 678 // backtrack 679 if (CurrCost < BestCost || BestCost == -1) { 680 if (solveExact()) { 681 FinishedExploring = BestCost != 0; 682 if (!FinishedExploring) 683 return true; 684 } 685 } 686 687 retreatPosition(); 688 CurrCost -= AddedCost; 689 removeEdges(AddedEdges); 690 Match->pop(); 691 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; 692 if (FinishedExploring) 693 return true; 694 } 695 696 // Try the pipeline where the current instruction is omitted 697 // Potentially if we omit a problematic instruction from the pipeline, 698 // all the other instructions can nicely fit. 699 CurrCost += MissPenalty; 700 advancePosition(); 701 702 LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n"); 703 704 bool FinishedExploring = false; 705 if (CurrCost < BestCost || BestCost == -1) { 706 if (solveExact()) { 707 bool FinishedExploring = BestCost != 0; 708 if (!FinishedExploring) 709 return true; 710 } 711 } 712 713 retreatPosition(); 714 CurrCost -= MissPenalty; 715 return FinishedExploring; 716 } 717 718 template <typename T> 719 void PipelineSolver::greedyFind( 720 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) { 721 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 722 int BestNodeCost = -1; 723 int TempCost; 724 SchedGroup *BestGroup = nullptr; 725 int BestGroupID = -1; 726 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 727 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum 728 << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); 729 730 // Since we have added the potential SchedGroups from bottom up, but 731 // traversed the DAG from top down, parse over the groups from last to 732 // first. If we fail to do this for the greedy algorithm, the solution will 733 // likely not be good in more complex cases. 
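  // For each candidate SchedGroup: tentatively add the edges, record the cost
  // (number of missed edges), then undo the edges; the lowest-cost candidate
  // that is not full and passes its rules is selected.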
734 for (; I != E; ++I) { 735 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 736 int CandSGID = *I; 737 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { 738 return SG.getSGID() == CandSGID; 739 }); 740 assert(Match); 741 742 LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " 743 << (int)Match->getMask() << "\n"); 744 745 if (Match->isFull()) { 746 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n"); 747 continue; 748 } 749 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) { 750 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n"); 751 continue; 752 } 753 TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 754 LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n"); 755 if (TempCost < BestNodeCost || BestNodeCost == -1) { 756 BestGroup = Match; 757 BestNodeCost = TempCost; 758 BestGroupID = CandSGID; 759 } 760 removeEdges(AddedEdges); 761 if (BestNodeCost == 0) 762 break; 763 } 764 765 if (BestGroupID != -1) { 766 BestGroup->add(*CurrSU.first); 767 addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); 768 LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" 769 << (int)BestGroup->getMask() << "\n"); 770 BestCost += TempCost; 771 } else 772 BestCost += MissPenalty; 773 774 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; 775 } 776 777 bool PipelineSolver::solveGreedy() { 778 BestCost = 0; 779 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 780 781 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) { 782 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 783 IsBottomUp 784 ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend()) 785 : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end()); 786 advancePosition(); 787 } 788 BestPipeline = CurrPipeline; 789 removeEdges(AddedEdges); 790 return false; 791 } 792 793 unsigned PipelineSolver::computeProblemSize() { 794 unsigned ProblemSize = 0; 795 for (auto &PipeConflicts : PipelineInstrs) { 796 ProblemSize += PipeConflicts.size(); 797 } 798 799 return ProblemSize; 800 } 801 802 void PipelineSolver::solve() { 803 if (!NeedsSolver) 804 return; 805 806 unsigned ProblemSize = computeProblemSize(); 807 assert(ProblemSize > 0); 808 809 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact; 810 MissPenalty = (ProblemSize / 2) + 1; 811 812 LLVM_DEBUG(DAG->dump()); 813 if (EnableExactSolver || BelowCutoff) { 814 LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n"); 815 solveGreedy(); 816 reset(); 817 LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n"); 818 if (BestCost > 0) { 819 LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n"); 820 solveExact(); 821 LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n"); 822 } 823 } else { // Use the Greedy Algorithm by default 824 LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n"); 825 solveGreedy(); 826 } 827 828 makePipeline(); 829 LLVM_DEBUG(dbgs() << "After applying mutation\n"); 830 LLVM_DEBUG(DAG->dump()); 831 } 832 833 enum IGLPStrategyID : int { 834 MFMASmallGemmOptID = 0, 835 MFMASmallGemmSingleWaveOptID = 1, 836 MFMAExpInterleave = 2 837 }; 838 839 // Implement a IGLP scheduling strategy. 840 class IGLPStrategy { 841 protected: 842 ScheduleDAGInstrs *DAG; 843 844 const SIInstrInfo *TII; 845 846 public: 847 /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy. 
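  /// Returns true if the strategy was applied (i.e. SchedGroups were added).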
848 virtual bool applyIGLPStrategy( 849 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 850 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 851 AMDGPU::SchedulingPhase Phase) = 0; 852 853 // Returns true if this strategy should be applied to a ScheduleDAG. 854 virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, 855 AMDGPU::SchedulingPhase Phase) = 0; 856 857 bool IsBottomUp = true; 858 859 IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 860 : DAG(DAG), TII(TII) {} 861 862 virtual ~IGLPStrategy() = default; 863 }; 864 865 class MFMASmallGemmOpt final : public IGLPStrategy { 866 private: 867 public: 868 bool applyIGLPStrategy( 869 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 870 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 871 AMDGPU::SchedulingPhase Phase) override; 872 873 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, 874 AMDGPU::SchedulingPhase Phase) override { 875 return true; 876 } 877 878 MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 879 : IGLPStrategy(DAG, TII) { 880 IsBottomUp = true; 881 } 882 }; 883 884 bool MFMASmallGemmOpt::applyIGLPStrategy( 885 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 886 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 887 AMDGPU::SchedulingPhase Phase) { 888 // Count the number of MFMA instructions. 889 unsigned MFMACount = 0; 890 for (const MachineInstr &I : *DAG) 891 if (TII->isMFMAorWMMA(I)) 892 ++MFMACount; 893 894 const unsigned PipelineSyncID = 0; 895 SchedGroup *SG = nullptr; 896 for (unsigned I = 0; I < MFMACount * 3; ++I) { 897 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 898 SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII); 899 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 900 901 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 902 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 903 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 904 } 905 906 return true; 907 } 908 909 class MFMAExpInterleaveOpt final : public IGLPStrategy { 910 private: 911 // The count of TRANS SUs involved in the interleaved pipeline 912 static unsigned TransPipeCount; 913 // The count of MFMA SUs involved in the interleaved pipeline 914 static unsigned MFMAPipeCount; 915 // The count of Add SUs involved in the interleaved pipeline 916 static unsigned AddPipeCount; 917 // The number of transitive MFMA successors for each TRANS SU 918 static unsigned MFMAEnablement; 919 // The number of transitive TRANS predecessors for each MFMA SU 920 static unsigned ExpRequirement; 921 // The count of independent "chains" of MFMA instructions in the pipeline 922 static unsigned MFMAChains; 923 // The length of each independent "chain" of MFMA instructions 924 static unsigned MFMAChainLength; 925 // Whether or not the pipeline has V_CVT instructions 926 static bool HasCvt; 927 // Whether or not there are instructions between the TRANS instruction and 928 // V_CVT 929 static bool HasChainBetweenCvt; 930 // The first occuring DS_READ which feeds an MFMA chain 931 static std::optional<unsigned> FirstPipeDSR; 932 // The MFMAPipe SUs with no MFMA predecessors 933 SmallVector<SUnit *, 4> MFMAChainSeeds; 934 // Compute the heuristics for the pipeline, returning whether or not the DAG 935 // is well formatted for the mutation 936 bool analyzeDAG(const SIInstrInfo *TII); 937 938 /// Whether or not the instruction is a transitive predecessor of an MFMA 939 /// instruction 940 class IsPipeExp final : public InstructionRule { 941 public: 942 bool apply(const SUnit *SU, 
const ArrayRef<SUnit *> Collection, 943 SmallVectorImpl<SchedGroup> &SyncPipe) override { 944 945 auto DAG = SyncPipe[0].DAG; 946 947 if (Cache->empty()) { 948 auto I = DAG->SUnits.rbegin(); 949 auto E = DAG->SUnits.rend(); 950 for (; I != E; I++) { 951 if (TII->isMFMAorWMMA(*I->getInstr())) 952 Cache->push_back(&*I); 953 } 954 if (Cache->empty()) 955 return false; 956 } 957 958 auto Reaches = (std::any_of( 959 Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) { 960 return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU)); 961 })); 962 963 return Reaches; 964 } 965 IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 966 : InstructionRule(TII, SGID, NeedsCache) {} 967 }; 968 969 /// Whether or not the instruction is a transitive predecessor of the 970 /// \p Number th MFMA of the MFMAs occuring after a TRANS instruction 971 class EnablesNthMFMA final : public InstructionRule { 972 private: 973 unsigned Number = 1; 974 975 public: 976 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 977 SmallVectorImpl<SchedGroup> &SyncPipe) override { 978 bool FoundTrans = false; 979 unsigned Counter = 1; 980 auto DAG = SyncPipe[0].DAG; 981 982 if (Cache->empty()) { 983 SmallVector<SUnit *, 8> Worklist; 984 985 auto I = DAG->SUnits.begin(); 986 auto E = DAG->SUnits.end(); 987 for (; I != E; I++) { 988 if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) { 989 if (Counter == Number) { 990 Cache->push_back(&*I); 991 break; 992 } 993 ++Counter; 994 } 995 if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode())) 996 FoundTrans = true; 997 } 998 if (Cache->empty()) 999 return false; 1000 } 1001 1002 return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU)); 1003 } 1004 1005 EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID, 1006 bool NeedsCache = false) 1007 : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} 1008 }; 1009 1010 /// Whether or not the instruction enables the exact MFMA that is the \p 1011 /// Number th MFMA in the chain starting with \p ChainSeed 1012 class EnablesNthMFMAInChain final : public InstructionRule { 1013 private: 1014 unsigned Number = 1; 1015 SUnit *ChainSeed; 1016 1017 public: 1018 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1019 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1020 auto DAG = SyncPipe[0].DAG; 1021 1022 if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr())) 1023 return false; 1024 1025 if (Cache->empty()) { 1026 auto TempSU = ChainSeed; 1027 auto Depth = Number; 1028 while (Depth > 0) { 1029 --Depth; 1030 bool Found = false; 1031 for (auto &Succ : TempSU->Succs) { 1032 if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) { 1033 TempSU = Succ.getSUnit(); 1034 Found = true; 1035 break; 1036 } 1037 } 1038 if (!Found) 1039 return false; 1040 } 1041 1042 Cache->push_back(TempSU); 1043 } 1044 // If we failed to find the instruction to be placed into the cache, we 1045 // would have already exited. 1046 assert(!Cache->empty()); 1047 1048 return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU)); 1049 } 1050 1051 EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed, 1052 const SIInstrInfo *TII, unsigned SGID, 1053 bool NeedsCache = false) 1054 : InstructionRule(TII, SGID, NeedsCache), Number(Number), 1055 ChainSeed(ChainSeed) {} 1056 }; 1057 1058 /// Whether or not the instruction has less than \p Size immediate successors. 1059 /// If \p HasIntermediary is true, this tests also whether all successors of 1060 /// the SUnit have less than \p Size successors. 
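  /// Heuristically (with \p Size = 8) this separates the pipelined EXPs from
  /// a "bonus" EXP on which a large number of MFMAs depend.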
1061 class LessThanNSuccs final : public InstructionRule { 1062 private: 1063 unsigned Size = 1; 1064 bool HasIntermediary = false; 1065 1066 public: 1067 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1068 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1069 if (!SyncPipe.size()) 1070 return false; 1071 1072 auto SuccSize = std::count_if( 1073 SU->Succs.begin(), SU->Succs.end(), 1074 [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); 1075 if (SuccSize >= Size) 1076 return false; 1077 1078 if (HasIntermediary) { 1079 for (auto Succ : SU->Succs) { 1080 auto SuccSize = std::count_if( 1081 Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), 1082 [](const SDep &SuccSucc) { 1083 return SuccSucc.getKind() == SDep::Data; 1084 }); 1085 if (SuccSize >= Size) 1086 return false; 1087 } 1088 } 1089 1090 return true; 1091 } 1092 LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID, 1093 bool HasIntermediary = false, bool NeedsCache = false) 1094 : InstructionRule(TII, SGID, NeedsCache), Size(Size), 1095 HasIntermediary(HasIntermediary) {} 1096 }; 1097 1098 /// Whether or not the instruction has greater than or equal to \p Size 1099 /// immediate successors. If \p HasIntermediary is true, this tests also 1100 /// whether all successors of the SUnit have greater than or equal to \p Size 1101 /// successors. 1102 class GreaterThanOrEqualToNSuccs final : public InstructionRule { 1103 private: 1104 unsigned Size = 1; 1105 bool HasIntermediary = false; 1106 1107 public: 1108 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1109 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1110 if (!SyncPipe.size()) 1111 return false; 1112 1113 auto SuccSize = std::count_if( 1114 SU->Succs.begin(), SU->Succs.end(), 1115 [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); 1116 if (SuccSize >= Size) 1117 return true; 1118 1119 if (HasIntermediary) { 1120 for (auto Succ : SU->Succs) { 1121 auto SuccSize = std::count_if( 1122 Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), 1123 [](const SDep &SuccSucc) { 1124 return SuccSucc.getKind() == SDep::Data; 1125 }); 1126 if (SuccSize >= Size) 1127 return true; 1128 } 1129 } 1130 1131 return false; 1132 } 1133 GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII, 1134 unsigned SGID, bool HasIntermediary = false, 1135 bool NeedsCache = false) 1136 : InstructionRule(TII, SGID, NeedsCache), Size(Size), 1137 HasIntermediary(HasIntermediary) {} 1138 }; 1139 1140 // Whether or not the instruction is a relevant V_CVT instruction. 1141 class IsCvt final : public InstructionRule { 1142 public: 1143 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1144 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1145 auto Opc = SU->getInstr()->getOpcode(); 1146 return Opc == AMDGPU::V_CVT_F16_F32_e32 || 1147 Opc == AMDGPU::V_CVT_I32_F32_e32; 1148 } 1149 IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 1150 : InstructionRule(TII, SGID, NeedsCache) {} 1151 }; 1152 1153 // Whether or not the instruction is FMA_F32. 
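  // Matches V_FMA_F32_e64 and V_PK_FMA_F32.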
  class IsFMA final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 ||
             SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;
    }
    IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  // Whether or not the instruction is a V_ADD_F32 instruction.
  class IsPipeAdd final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32;
    }
    IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  /// Whether or not the instruction is an immediate RAW successor
  /// of the SchedGroup \p Distance steps before.
  class IsSuccOfPrevNthGroup final : public InstructionRule {
  private:
    unsigned Distance = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())
        return false;

      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;
      }

      if (!OtherGroup)
        return false;
      if (!OtherGroup->Collection.size())
        return true;

      for (auto &OtherEle : OtherGroup->Collection) {
        for (auto &Succ : OtherEle->Succs) {
          if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
            return true;
        }
      }

      return false;
    }
    IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                         unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  };

  /// Whether or not the instruction is a transitive successor of any
  /// instruction in the SchedGroup \p Distance steps before.
1215 class IsReachableFromPrevNthGroup final : public InstructionRule { 1216 private: 1217 unsigned Distance = 1; 1218 1219 public: 1220 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1221 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1222 SchedGroup *OtherGroup = nullptr; 1223 if (!SyncPipe.size()) 1224 return false; 1225 1226 for (auto &PipeSG : SyncPipe) { 1227 if ((unsigned)PipeSG.getSGID() == SGID - Distance) 1228 OtherGroup = &PipeSG; 1229 } 1230 1231 if (!OtherGroup) 1232 return false; 1233 if (!OtherGroup->Collection.size()) 1234 return true; 1235 1236 auto DAG = SyncPipe[0].DAG; 1237 1238 for (auto &OtherEle : OtherGroup->Collection) 1239 if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle)) 1240 return true; 1241 1242 return false; 1243 } 1244 IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, 1245 unsigned SGID, bool NeedsCache = false) 1246 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} 1247 }; 1248 1249 /// Whether or not the instruction occurs after the SU with NodeNUm \p Number 1250 class OccursAtOrAfterNode final : public InstructionRule { 1251 private: 1252 unsigned Number = 1; 1253 1254 public: 1255 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1256 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1257 1258 return SU->NodeNum >= Number; 1259 } 1260 OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID, 1261 bool NeedsCache = false) 1262 : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} 1263 }; 1264 1265 /// Whether or not the SU is exactly the \p Number th MFMA in the chain 1266 /// starting with \p ChainSeed 1267 class IsExactMFMA final : public InstructionRule { 1268 private: 1269 unsigned Number = 1; 1270 SUnit *ChainSeed; 1271 1272 public: 1273 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1274 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1275 if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr())) 1276 return false; 1277 1278 if (Cache->empty()) { 1279 auto TempSU = ChainSeed; 1280 auto Depth = Number; 1281 while (Depth > 0) { 1282 --Depth; 1283 bool Found = false; 1284 for (auto &Succ : TempSU->Succs) { 1285 if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) { 1286 TempSU = Succ.getSUnit(); 1287 Found = true; 1288 break; 1289 } 1290 } 1291 if (!Found) { 1292 return false; 1293 } 1294 } 1295 Cache->push_back(TempSU); 1296 } 1297 // If we failed to find the instruction to be placed into the cache, we 1298 // would have already exited. 1299 assert(!Cache->empty()); 1300 1301 return (*Cache)[0] == SU; 1302 } 1303 1304 IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII, 1305 unsigned SGID, bool NeedsCache = false) 1306 : InstructionRule(TII, SGID, NeedsCache), Number(Number), 1307 ChainSeed(ChainSeed) {} 1308 }; 1309 1310 // Whether the instruction occurs after the first TRANS instruction. 
  // This implies the instruction cannot be a predecessor of the first TRANS
  // instruction.
  class OccursAfterExp final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {

      SmallVector<SUnit *, 12> Worklist;
      auto DAG = SyncPipe[0].DAG;
      if (Cache->empty()) {
        for (auto &SU : DAG->SUnits)
          if (TII->isTRANS(SU.getInstr()->getOpcode())) {
            Cache->push_back(&SU);
            break;
          }
        if (Cache->empty())
          return false;
      }

      return SU->NodeNum > (*Cache)[0]->NodeNum;
    }

    OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
                   bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

public:
  bool applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      AMDGPU::SchedulingPhase Phase) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                           AMDGPU::SchedulingPhase Phase) override;

  MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = false;
  }
};

unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
bool MFMAExpInterleaveOpt::HasCvt = false;
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;

bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
  SmallVector<SUnit *, 10> ExpPipeCands;
  SmallVector<SUnit *, 10> MFMAPipeCands;
  SmallVector<SUnit *, 10> MFMAPipeSUs;
  SmallVector<SUnit *, 10> PackSUs;
  SmallVector<SUnit *, 10> CvtSUs;

  auto isBitPack = [](unsigned Opc) {
    return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
  };

  auto isCvt = [](unsigned Opc) {
    return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
  };

  auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };

  AddPipeCount = 0;
  for (SUnit &SU : DAG->SUnits) {
    auto Opc = SU.getInstr()->getOpcode();
    if (TII->isTRANS(Opc)) {
      // Avoid counting a potential bonus V_EXP which all the MFMA depend on
      if (SU.Succs.size() >= 7)
        continue;
      for (auto &Succ : SU.Succs) {
        if (Succ.getSUnit()->Succs.size() >= 7)
          continue;
      }
      ExpPipeCands.push_back(&SU);
    }

    if (TII->isMFMAorWMMA(*SU.getInstr()))
      MFMAPipeCands.push_back(&SU);

    if (isBitPack(Opc))
      PackSUs.push_back(&SU);

    if (isCvt(Opc))
      CvtSUs.push_back(&SU);

    if (isAdd(Opc))
      ++AddPipeCount;
  }

  if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
    return false;

  TransPipeCount = 0;

  std::optional<SUnit *> TempMFMA;
  std::optional<SUnit *> TempExp;
  // Count the number of EXPs that reach an MFMA
  for (auto &PredSU : ExpPipeCands) {
    for (auto &SuccSU : MFMAPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {
        if (!TempExp) {
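          // Remember one representative EXP/MFMA pair; it is reused below to
          // derive MFMAEnablement and ExpRequirement.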
TempExp = PredSU; 1421 TempMFMA = SuccSU; 1422 } 1423 MFMAPipeSUs.push_back(SuccSU); 1424 ++TransPipeCount; 1425 break; 1426 } 1427 } 1428 } 1429 1430 if (!(TempExp && TempMFMA)) 1431 return false; 1432 1433 HasChainBetweenCvt = 1434 std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(), 1435 [&isCvt](SDep &Succ) { 1436 return isCvt(Succ.getSUnit()->getInstr()->getOpcode()); 1437 }) == (*TempExp)->Succs.end(); 1438 1439 // Count the number of MFMAs that are reached by an EXP 1440 for (auto &SuccSU : MFMAPipeCands) { 1441 if (MFMAPipeSUs.size() && 1442 std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(), 1443 [&SuccSU](SUnit *PotentialMatch) { 1444 return PotentialMatch->NodeNum == SuccSU->NodeNum; 1445 }) != MFMAPipeSUs.end()) 1446 continue; 1447 1448 for (auto &PredSU : ExpPipeCands) { 1449 if (DAG->IsReachable(SuccSU, PredSU)) { 1450 MFMAPipeSUs.push_back(SuccSU); 1451 break; 1452 } 1453 } 1454 } 1455 1456 MFMAPipeCount = MFMAPipeSUs.size(); 1457 1458 assert(TempExp && TempMFMA); 1459 assert(MFMAPipeCount > 0); 1460 1461 std::optional<SUnit *> TempCvt; 1462 for (auto &SuccSU : CvtSUs) { 1463 if (DAG->IsReachable(SuccSU, *TempExp)) { 1464 TempCvt = SuccSU; 1465 break; 1466 } 1467 } 1468 1469 HasCvt = false; 1470 if (TempCvt.has_value()) { 1471 for (auto &SuccSU : MFMAPipeSUs) { 1472 if (DAG->IsReachable(SuccSU, *TempCvt)) { 1473 HasCvt = true; 1474 break; 1475 } 1476 } 1477 } 1478 1479 MFMAChains = 0; 1480 for (auto &MFMAPipeSU : MFMAPipeSUs) { 1481 if (is_contained(MFMAChainSeeds, MFMAPipeSU)) 1482 continue; 1483 if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(), 1484 [&TII](SDep &Succ) { 1485 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr()); 1486 })) { 1487 MFMAChainSeeds.push_back(MFMAPipeSU); 1488 ++MFMAChains; 1489 } 1490 } 1491 1492 if (!MFMAChains) 1493 return false; 1494 1495 for (auto Pred : MFMAChainSeeds[0]->Preds) { 1496 if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) && 1497 Pred.getSUnit()->getInstr()->mayLoad()) 1498 FirstPipeDSR = Pred.getSUnit()->NodeNum; 1499 } 1500 1501 MFMAChainLength = MFMAPipeCount / MFMAChains; 1502 1503 // The number of bit pack operations that depend on a single V_EXP 1504 unsigned PackSuccCount = std::count_if( 1505 PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) { 1506 return DAG->IsReachable(VPack, *TempExp); 1507 }); 1508 1509 // The number of bit pack operations an MFMA depends on 1510 unsigned PackPredCount = 1511 std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), 1512 [&isBitPack](SDep &Pred) { 1513 auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); 1514 return isBitPack(Opc); 1515 }); 1516 1517 auto PackPred = 1518 std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), 1519 [&isBitPack](SDep &Pred) { 1520 auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); 1521 return isBitPack(Opc); 1522 }); 1523 1524 if (PackPred == (*TempMFMA)->Preds.end()) 1525 return false; 1526 1527 MFMAEnablement = 0; 1528 ExpRequirement = 0; 1529 // How many MFMAs depend on a single bit pack operation 1530 MFMAEnablement = 1531 std::count_if(PackPred->getSUnit()->Succs.begin(), 1532 PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) { 1533 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr()); 1534 }); 1535 1536 // The number of MFMAs that depend on a single V_EXP 1537 MFMAEnablement *= PackSuccCount; 1538 1539 // The number of V_EXPs required to resolve all dependencies for an MFMA 1540 ExpRequirement = 1541 std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(), 1542 
[this, &PackPred](SUnit *ExpBase) { 1543 return DAG->IsReachable(PackPred->getSUnit(), ExpBase); 1544 }); 1545 1546 ExpRequirement *= PackPredCount; 1547 return true; 1548 } 1549 1550 bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG, 1551 AMDGPU::SchedulingPhase Phase) { 1552 const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); 1553 const SIInstrInfo *TII = ST.getInstrInfo(); 1554 1555 if (Phase != AMDGPU::SchedulingPhase::PostRA) 1556 MFMAChainSeeds.clear(); 1557 if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII)) 1558 return false; 1559 1560 return true; 1561 } 1562 1563 bool MFMAExpInterleaveOpt::applyIGLPStrategy( 1564 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 1565 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 1566 AMDGPU::SchedulingPhase Phase) { 1567 1568 bool IsSmallKernelType = 1569 MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32; 1570 bool IsLargeKernelType = 1571 MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64; 1572 1573 if (!(IsSmallKernelType || IsLargeKernelType)) 1574 return false; 1575 1576 const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); 1577 const SIInstrInfo *TII = ST.getInstrInfo(); 1578 1579 unsigned PipelineSyncID = 0; 1580 SchedGroup *SG = nullptr; 1581 1582 unsigned MFMAChain = 0; 1583 unsigned PositionInChain = 0; 1584 unsigned CurrMFMAForTransPosition = 0; 1585 1586 auto incrementTransPosition = [&MFMAChain, &PositionInChain, 1587 &CurrMFMAForTransPosition]() { 1588 CurrMFMAForTransPosition += MFMAEnablement; 1589 PositionInChain = (CurrMFMAForTransPosition / MFMAChains); 1590 MFMAChain = CurrMFMAForTransPosition % MFMAChains; 1591 }; 1592 1593 auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() { 1594 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; 1595 return (TempMFMAForTrans / MFMAChains); 1596 }; 1597 1598 auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() { 1599 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; 1600 return TempMFMAForTrans % MFMAChains; 1601 }; 1602 1603 unsigned CurrMFMAPosition = 0; 1604 unsigned MFMAChainForMFMA = 0; 1605 unsigned PositionInChainForMFMA = 0; 1606 1607 auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA, 1608 &PositionInChainForMFMA]() { 1609 ++CurrMFMAPosition; 1610 MFMAChainForMFMA = CurrMFMAPosition % MFMAChains; 1611 PositionInChainForMFMA = CurrMFMAPosition / MFMAChains; 1612 }; 1613 1614 bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA; 1615 assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains); 1616 1617 bool UsesFMA = IsSmallKernelType || !IsPostRA; 1618 bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR; 1619 bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA); 1620 bool UsesVALU = IsSmallKernelType; 1621 1622 // PHASE 1: "Prefetch" 1623 if (UsesFMA) { 1624 // First Round FMA 1625 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1626 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII); 1627 if (!IsPostRA && MFMAChains) { 1628 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1629 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), 1630 true)); 1631 } else 1632 SG->addRule( 1633 std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true)); 1634 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); 1635 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1636 1637 // Second Round FMA 1638 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1639 
SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII); 1640 if (!IsPostRA && MFMAChains) { 1641 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1642 getNextTransPositionInChain(), 1643 MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true)); 1644 } else 1645 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII, 1646 SG->getSGID(), true)); 1647 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); 1648 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1649 } 1650 1651 if (UsesDSRead) { 1652 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1653 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); 1654 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII, 1655 SG->getSGID())); 1656 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1657 } 1658 1659 // First Round EXP 1660 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1661 SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII); 1662 if (!IsPostRA && MFMAChains) 1663 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1664 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true)); 1665 else 1666 SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true)); 1667 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); 1668 SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), 1669 HasChainBetweenCvt)); 1670 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1671 1672 incrementTransPosition(); 1673 1674 // First Round CVT, Third Round FMA, Second Round EXP; interleaved 1675 for (unsigned I = 0; I < ExpRequirement; I++) { 1676 // First Round CVT 1677 if (UsesCvt) { 1678 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1679 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); 1680 SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID())); 1681 if (HasChainBetweenCvt) 1682 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>( 1683 1 + (2 + UsesFMA) * I, TII, SG->getSGID())); 1684 else 1685 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>( 1686 1 + (2 + UsesFMA) * I, TII, SG->getSGID())); 1687 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1688 } 1689 1690 // Third Round FMA 1691 if (UsesFMA) { 1692 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1693 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); 1694 if (!IsPostRA && MFMAChains) { 1695 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1696 getNextTransPositionInChain(), 1697 MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true)); 1698 } else 1699 SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1, 1700 TII, SG->getSGID(), true)); 1701 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); 1702 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1703 } 1704 1705 // Second Round EXP 1706 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1707 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); 1708 if (!IsPostRA && MFMAChains) 1709 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1710 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), 1711 true)); 1712 else 1713 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII, 1714 SG->getSGID(), true)); 1715 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); 1716 SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), 1717 HasChainBetweenCvt)); 1718 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1719 } 1720 1721 // The "extra" EXP which enables all MFMA 1722 // TODO: UsesExtraExp 1723 SG = 
&SyncedSchedGroups[PipelineSyncID].emplace_back(
1724       SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1725   SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1726   SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
1727       8, TII, SG->getSGID(), HasChainBetweenCvt));
1728   SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1729 
1730   // PHASE 2: Main Interleave Loop
1731 
1732   // The number of MFMAs per iteration
1733   unsigned MFMARatio =
1734       MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
1735   // The number of Exps per iteration
1736   unsigned ExpRatio =
1737       MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
1738   // The remaining Exps
1739   unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
1740                               ? TransPipeCount - (2 * ExpRequirement)
1741                               : 0;
1742   unsigned ExpLoopCount = RemainingExp / ExpRatio;
1743   // In-loop MFMAs
1744   unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
1745                             ? MFMAPipeCount - (MFMAEnablement * 2)
1746                             : 0;
1747   unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
1748   unsigned VALUOps =
1749       AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
1750   unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);
1751 
1752   for (unsigned I = 0; I < LoopSize; I++) {
1753     if (!(I * ExpRatio % ExpRequirement))
1754       incrementTransPosition();
1755 
1756     // Round N MFMA
1757     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1758         SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
1759     if (!IsPostRA && MFMAChains)
1760       SG->addRule(std::make_shared<IsExactMFMA>(
1761           PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
1762           SG->getSGID(), true));
1763     else
1764       SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
1765     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1766     incrementMFMAPosition();
1767 
1768     if (UsesVALU) {
1769       SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1770           SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII);
1771       SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID()));
1772       SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1773     }
1774 
1775     if (UsesDSRead && !(I % 4)) {
1776       SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1777           SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
1778       SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
1779                                                         SG->getSGID()));
1780       SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1781     }
1782 
1783     // CVT, EXP, FMA Interleaving
1784     for (unsigned J = 0; J < ExpRatio; J++) {
1785       auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1);
1786       auto MaxMFMAOffset =
1787           (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;
1788 
1789       // Round N + 1 CVT
1790       if (UsesCvt) {
1791         SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1792             SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1793         SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
1794         auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
1795         auto DSROffset = I / 4 + 1;
1796         auto MaxDSROffset = MaxMFMAOffset / 4;
1797         // TODO: UsesExtraExp
1798         auto ExpOffset = I * ExpRatio + J >= ExpRequirement ?
0 : 1; 1799 auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) + 1800 std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff + 1801 ExpOffset; 1802 if (HasChainBetweenCvt) 1803 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>( 1804 CurrentOffset, TII, SG->getSGID())); 1805 else 1806 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII, 1807 SG->getSGID())); 1808 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1809 } 1810 1811 // Round N + 3 FMA 1812 if (UsesFMA) { 1813 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1814 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); 1815 if (!IsPostRA && MFMAChains) 1816 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1817 getNextTransPositionInChain(), 1818 MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), 1819 true)); 1820 else 1821 SG->addRule(std::make_shared<EnablesNthMFMA>( 1822 (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1, 1823 TII, SG->getSGID(), true)); 1824 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); 1825 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1826 } 1827 1828 // Round N + 2 Exp 1829 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1830 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); 1831 if (!IsPostRA && MFMAChains) 1832 SG->addRule(std::make_shared<EnablesNthMFMAInChain>( 1833 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), 1834 true)); 1835 else 1836 SG->addRule(std::make_shared<EnablesNthMFMA>( 1837 (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1, 1838 TII, SG->getSGID(), true)); 1839 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); 1840 SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), 1841 HasChainBetweenCvt)); 1842 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1843 } 1844 } 1845 1846 // PHASE 3: Remaining MFMAs 1847 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1848 SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII); 1849 SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true)); 1850 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1851 return true; 1852 } 1853 1854 class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { 1855 private: 1856 // Whether the DS_READ is a predecessor of first four MFMA in region 1857 class EnablesInitialMFMA final : public InstructionRule { 1858 public: 1859 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1860 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1861 if (!SyncPipe.size()) 1862 return false; 1863 int MFMAsFound = 0; 1864 if (!Cache->size()) { 1865 for (auto &Elt : SyncPipe[0].DAG->SUnits) { 1866 if (TII->isMFMAorWMMA(*Elt.getInstr())) { 1867 ++MFMAsFound; 1868 if (MFMAsFound > 4) 1869 break; 1870 Cache->push_back(&Elt); 1871 } 1872 } 1873 } 1874 1875 assert(Cache->size()); 1876 auto DAG = SyncPipe[0].DAG; 1877 for (auto &Elt : *Cache) { 1878 if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU))) 1879 return true; 1880 } 1881 return false; 1882 } 1883 1884 EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID, 1885 bool NeedsCache = false) 1886 : InstructionRule(TII, SGID, NeedsCache) {} 1887 }; 1888 1889 // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE 1890 class IsPermForDSW final : public InstructionRule { 1891 public: 1892 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1893 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1894 auto MI = SU->getInstr(); 1895 if (MI->getOpcode() != 
AMDGPU::V_PERM_B32_e64) 1896 return false; 1897 1898 bool FitsInGroup = false; 1899 // Does the VALU have a DS_WRITE successor 1900 if (!Collection.size()) { 1901 for (auto &Succ : SU->Succs) { 1902 SUnit *SuccUnit = Succ.getSUnit(); 1903 if (TII->isDS(*SuccUnit->getInstr()) && 1904 SuccUnit->getInstr()->mayStore()) { 1905 Cache->push_back(SuccUnit); 1906 FitsInGroup = true; 1907 } 1908 } 1909 return FitsInGroup; 1910 } 1911 1912 assert(Cache->size()); 1913 1914 // Does the VALU have a DS_WRITE successor that is the same as other 1915 // VALU already in the group. The V_PERMs will all share 1 DS_W succ 1916 return llvm::any_of(*Cache, [&SU](SUnit *Elt) { 1917 return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) { 1918 return ThisSucc.getSUnit() == Elt; 1919 }); 1920 }); 1921 } 1922 1923 IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 1924 : InstructionRule(TII, SGID, NeedsCache) {} 1925 }; 1926 1927 // Whether the SU is a successor of any element in previous SchedGroup 1928 class IsSuccOfPrevGroup final : public InstructionRule { 1929 public: 1930 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1931 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1932 SchedGroup *OtherGroup = nullptr; 1933 for (auto &PipeSG : SyncPipe) { 1934 if ((unsigned)PipeSG.getSGID() == SGID - 1) { 1935 OtherGroup = &PipeSG; 1936 } 1937 } 1938 1939 if (!OtherGroup) 1940 return false; 1941 if (!OtherGroup->Collection.size()) 1942 return true; 1943 1944 // Does the previous VALU have this DS_Write as a successor 1945 return (std::any_of(OtherGroup->Collection.begin(), 1946 OtherGroup->Collection.end(), [&SU](SUnit *Elt) { 1947 return std::any_of(Elt->Succs.begin(), 1948 Elt->Succs.end(), 1949 [&SU](SDep &Succ) { 1950 return Succ.getSUnit() == SU; 1951 }); 1952 })); 1953 } 1954 IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID, 1955 bool NeedsCache = false) 1956 : InstructionRule(TII, SGID, NeedsCache) {} 1957 }; 1958 1959 // Whether the combined load width of group is 128 bits 1960 class VMEMSize final : public InstructionRule { 1961 public: 1962 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1963 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1964 auto MI = SU->getInstr(); 1965 if (MI->getOpcode() == TargetOpcode::BUNDLE) 1966 return false; 1967 if (!Collection.size()) 1968 return true; 1969 1970 int NumBits = 0; 1971 1972 auto TRI = TII->getRegisterInfo(); 1973 auto &MRI = MI->getParent()->getParent()->getRegInfo(); 1974 for (auto &Elt : Collection) { 1975 auto Op = Elt->getInstr()->getOperand(0); 1976 auto Size = 1977 TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op)); 1978 NumBits += Size; 1979 } 1980 1981 if (NumBits < 128) { 1982 assert(TII->isVMEM(*MI) && MI->mayLoad()); 1983 if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg( 1984 MRI, MI->getOperand(0))) <= 1985 128) 1986 return true; 1987 } 1988 1989 return false; 1990 } 1991 1992 VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 1993 : InstructionRule(TII, SGID, NeedsCache) {} 1994 }; 1995 1996 /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup 1997 /// that is \p Distance steps away 1998 class SharesPredWithPrevNthGroup final : public InstructionRule { 1999 private: 2000 unsigned Distance = 1; 2001 2002 public: 2003 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 2004 SmallVectorImpl<SchedGroup> &SyncPipe) override { 2005 SchedGroup *OtherGroup = nullptr; 2006 if (!SyncPipe.size()) 2007 
return false; 2008 2009 if (!Cache->size()) { 2010 2011 for (auto &PipeSG : SyncPipe) { 2012 if ((unsigned)PipeSG.getSGID() == SGID - Distance) { 2013 OtherGroup = &PipeSG; 2014 } 2015 } 2016 2017 if (!OtherGroup) 2018 return false; 2019 if (!OtherGroup->Collection.size()) 2020 return true; 2021 2022 for (auto &OtherEle : OtherGroup->Collection) { 2023 for (auto &Pred : OtherEle->Preds) { 2024 if (Pred.getSUnit()->getInstr()->getOpcode() == 2025 AMDGPU::V_PERM_B32_e64) 2026 Cache->push_back(Pred.getSUnit()); 2027 } 2028 } 2029 2030 // If the other group has no PERM preds, then this group won't share any 2031 if (!Cache->size()) 2032 return false; 2033 } 2034 2035 auto DAG = SyncPipe[0].DAG; 2036 // Does the previous DS_WRITE share a V_PERM predecessor with this 2037 // VMEM_READ 2038 return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) { 2039 return DAG->IsReachable(const_cast<SUnit *>(SU), Elt); 2040 }); 2041 } 2042 SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, 2043 unsigned SGID, bool NeedsCache = false) 2044 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} 2045 }; 2046 2047 public: 2048 bool applyIGLPStrategy( 2049 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 2050 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 2051 AMDGPU::SchedulingPhase Phase) override; 2052 2053 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, 2054 AMDGPU::SchedulingPhase Phase) override { 2055 return true; 2056 } 2057 2058 MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 2059 : IGLPStrategy(DAG, TII) { 2060 IsBottomUp = false; 2061 } 2062 }; 2063 2064 static unsigned DSWCount = 0; 2065 static unsigned DSWWithPermCount = 0; 2066 static unsigned DSWWithSharedVMEMCount = 0; 2067 2068 bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( 2069 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 2070 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 2071 AMDGPU::SchedulingPhase Phase) { 2072 unsigned MFMACount = 0; 2073 unsigned DSRCount = 0; 2074 2075 bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial; 2076 2077 assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 && 2078 DSWWithSharedVMEMCount == 0)) && 2079 "DSWCounters should be zero in pre-RA scheduling!"); 2080 SmallVector<SUnit *, 6> DSWithPerms; 2081 for (auto &SU : DAG->SUnits) { 2082 auto I = SU.getInstr(); 2083 if (TII->isMFMAorWMMA(*I)) 2084 ++MFMACount; 2085 else if (TII->isDS(*I)) { 2086 if (I->mayLoad()) 2087 ++DSRCount; 2088 else if (I->mayStore() && IsInitial) { 2089 ++DSWCount; 2090 for (auto Pred : SU.Preds) { 2091 if (Pred.getSUnit()->getInstr()->getOpcode() == 2092 AMDGPU::V_PERM_B32_e64) { 2093 DSWithPerms.push_back(&SU); 2094 break; 2095 } 2096 } 2097 } 2098 } 2099 } 2100 2101 if (IsInitial) { 2102 DSWWithPermCount = DSWithPerms.size(); 2103 auto I = DSWithPerms.begin(); 2104 auto E = DSWithPerms.end(); 2105 2106 // Get the count of DS_WRITES with V_PERM predecessors which 2107 // have loop carried dependencies (WAR) on the same VMEM_READs. 2108 // We consider partial overlap as a miss -- in other words, 2109 // for a given DS_W, we only consider another DS_W as matching 2110 // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred 2111 // for every V_PERM pred of this DS_W. 
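  // Roughly, the matching loop below works as follows: VMEMLookup records,
  // for each VMEM_READ reached through a V_PERM predecessor, the first
  // DS_WRITE seen using it. A later DS_WRITE completes a pair only if every
  // VMEM_READ reached through its own V_PERM predecessors is already in the
  // table and the recorded partner has not been counted yet; any partial
  // overlap sets MissedAny and the candidate pair is dropped.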
2112 DenseMap<MachineInstr *, SUnit *> VMEMLookup; 2113 SmallVector<SUnit *, 6> Counted; 2114 for (; I != E; I++) { 2115 SUnit *Cand = nullptr; 2116 bool MissedAny = false; 2117 for (auto &Pred : (*I)->Preds) { 2118 if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) 2119 continue; 2120 2121 if (Cand && llvm::is_contained(Counted, Cand)) 2122 break; 2123 2124 for (auto &Succ : Pred.getSUnit()->Succs) { 2125 auto MI = Succ.getSUnit()->getInstr(); 2126 if (!TII->isVMEM(*MI) || !MI->mayLoad()) 2127 continue; 2128 2129 if (MissedAny || !VMEMLookup.size()) { 2130 MissedAny = true; 2131 VMEMLookup[MI] = *I; 2132 continue; 2133 } 2134 2135 if (!VMEMLookup.contains(MI)) { 2136 MissedAny = true; 2137 VMEMLookup[MI] = *I; 2138 continue; 2139 } 2140 2141 Cand = VMEMLookup[MI]; 2142 if (llvm::is_contained(Counted, Cand)) { 2143 MissedAny = true; 2144 break; 2145 } 2146 } 2147 } 2148 if (!MissedAny && Cand) { 2149 DSWWithSharedVMEMCount += 2; 2150 Counted.push_back(Cand); 2151 Counted.push_back(*I); 2152 } 2153 } 2154 } 2155 2156 assert(DSWWithSharedVMEMCount <= DSWWithPermCount); 2157 SchedGroup *SG; 2158 unsigned PipelineSyncID = 0; 2159 // For kernels with V_PERM, there are enough VALU to mix in between MFMAs 2160 if (DSWWithPermCount) { 2161 for (unsigned I = 0; I < MFMACount; I++) { 2162 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2163 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2164 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2165 2166 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2167 SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); 2168 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2169 } 2170 } 2171 2172 PipelineSyncID = 1; 2173 // Phase 1: Break up DS_READ and MFMA clusters. 2174 // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ 2175 // prefetch 2176 2177 // Make ready initial MFMA 2178 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2179 SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII); 2180 SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true)); 2181 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2182 2183 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2184 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2185 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2186 2187 // Interleave MFMA with DS_READ prefetch 2188 for (unsigned I = 0; I < DSRCount - 4; ++I) { 2189 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2190 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); 2191 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2192 2193 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2194 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2195 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2196 } 2197 2198 // Phase 2a: Loop carried dependency with V_PERM 2199 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they 2200 // depend on. Interleave MFMA to keep XDL unit busy throughout. 
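  // Within each iteration below, both VMEM_READ groups refer back to the
  // iteration's DS_WRITE group (SharesPredWithPrevNthGroup with distances 1
  // and 3), keeping the loads near the store they have the loop-carried WAR
  // dependence with, while the VMEMSize rule caps each load group at a
  // combined width of 128 bits.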
2201 for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { 2202 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2203 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 2204 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 2205 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2206 2207 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2208 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 2209 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID())); 2210 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2211 2212 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2213 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 2214 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 2215 1, TII, SG->getSGID(), true)); 2216 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); 2217 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2218 2219 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2220 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2221 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2222 2223 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2224 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 2225 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 2226 3, TII, SG->getSGID(), true)); 2227 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); 2228 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2229 2230 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2231 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2232 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2233 } 2234 2235 // Phase 2b: Loop carried dependency without V_PERM 2236 // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. 2237 // Interleave MFMA to keep XDL unit busy throughout. 2238 for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { 2239 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2240 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 2241 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2242 2243 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2244 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 2245 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); 2246 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2247 2248 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 2249 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 2250 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 2251 } 2252 2253 // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are 2254 // ultimately used by two DS_WRITE 2255 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they 2256 // depend on. Interleave MFMA to keep XDL unit busy throughout. 
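  // Each iteration below issues two V_PERM (VALU) / DS_WRITE pairs before the
  // VMEM_READ groups, mirroring the fact that DSWWithSharedVMEMCount is
  // incremented by two for every matched pair found above; an MFMA is
  // interleaved after every DS_WRITE and VMEM_READ group to keep the XDL unit
  // busy.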
2257 
2258   for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
2259     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2260         SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2261     SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2262     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2263 
2264     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2265         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2266     SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2267     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2268 
2269     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2270         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2271     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2272 
2273     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2274         SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2275     SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2276     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2277 
2278     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2279         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2280     SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2281     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2282 
2283     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2284         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2285     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2286 
2287     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2288         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2289     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2290         2, TII, SG->getSGID(), true));
2291     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2292     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2293 
2294     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2295         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2296     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2297 
2298     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2299         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2300     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2301         4, TII, SG->getSGID(), true));
2302     SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2303     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2304 
2305     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2306         SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2307     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2308   }
2309 
2310   return true;
2311 }
2312 
2313 static std::unique_ptr<IGLPStrategy>
2314 createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
2315                    const SIInstrInfo *TII) {
2316   switch (ID) {
2317   case MFMASmallGemmOptID:
2318     return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
2319   case MFMASmallGemmSingleWaveOptID:
2320     return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
2321   case MFMAExpInterleave:
2322     return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
2323   }
2324 
2325   llvm_unreachable("Unknown IGLPStrategyID");
2326 }
2327 
2328 class IGroupLPDAGMutation : public ScheduleDAGMutation {
2329 private:
2330   const SIInstrInfo *TII;
2331 
2332   ScheduleDAGMI *DAG;
2333 
2334   // Organize lists of SchedGroups by their SyncID. SchedGroups /
2335   // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
2336   // between them.
2337   DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
2338 
2339   // Used to track instructions that can be mapped to multiple sched groups.
2340   DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
2341 
2342   // Add DAG edges that enforce SCHED_BARRIER ordering.
2343   void addSchedBarrierEdges(SUnit &SU);
2344 
2345   // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
2346   // not be reordered across the SCHED_BARRIER. This is used for the base
2347   // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
2348   // SCHED_BARRIER will always block all instructions that can be classified
2349   // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
2350   // and may only synchronize with some SchedGroups. Returns the inverse of
2351   // Mask: SCHED_BARRIER's mask describes which instruction types should be
2352   // allowed to be scheduled across it, so inverting it yields the
2353   // SchedGroupMask of instructions that should be barred.
2354   SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
2355 
2356   // Create SchedGroups for a SCHED_GROUP_BARRIER.
2357   void initSchedGroupBarrierPipelineStage(
2358       std::vector<SUnit>::reverse_iterator RIter);
2359 
2360   bool initIGLPOpt(SUnit &SU);
2361 
2362 public:
2363   void apply(ScheduleDAGInstrs *DAGInstrs) override;
2364 
2365   // The order in which the PipelineSolver should process the candidate
2366   // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
2367   // created SchedGroup first, and will consider that as the ultimate
2368   // predecessor group when linking. TOP_DOWN instead links and processes the
2369   // first created SchedGroup first.
2370   bool IsBottomUp = true;
2371 
2372   // The scheduling phase this application of IGLP corresponds to.
2373 AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial; 2374 2375 IGroupLPDAGMutation() = default; 2376 IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {} 2377 }; 2378 2379 unsigned SchedGroup::NumSchedGroups = 0; 2380 2381 bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { 2382 if (A != B && DAG->canAddEdge(B, A)) { 2383 DAG->addEdge(B, SDep(A, SDep::Artificial)); 2384 return true; 2385 } 2386 return false; 2387 } 2388 2389 bool SchedGroup::canAddMI(const MachineInstr &MI) const { 2390 bool Result = false; 2391 if (MI.isMetaInstruction()) 2392 Result = false; 2393 2394 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && 2395 (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) || 2396 TII->isTRANS(MI))) 2397 Result = true; 2398 2399 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && 2400 TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) 2401 Result = true; 2402 2403 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && 2404 TII->isSALU(MI)) 2405 Result = true; 2406 2407 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && 2408 TII->isMFMAorWMMA(MI)) 2409 Result = true; 2410 2411 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) && 2412 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 2413 Result = true; 2414 2415 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) && 2416 MI.mayLoad() && 2417 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 2418 Result = true; 2419 2420 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) && 2421 MI.mayStore() && 2422 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 2423 Result = true; 2424 2425 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) && 2426 TII->isDS(MI)) 2427 Result = true; 2428 2429 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) && 2430 MI.mayLoad() && TII->isDS(MI)) 2431 Result = true; 2432 2433 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) && 2434 MI.mayStore() && TII->isDS(MI)) 2435 Result = true; 2436 2437 else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) && 2438 TII->isTRANS(MI)) 2439 Result = true; 2440 2441 LLVM_DEBUG( 2442 dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) 2443 << (Result ? 
" could classify " : " unable to classify ") << MI); 2444 2445 return Result; 2446 } 2447 2448 int SchedGroup::link(SUnit &SU, bool MakePred, 2449 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { 2450 int MissedEdges = 0; 2451 for (auto *A : Collection) { 2452 SUnit *B = &SU; 2453 if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 2454 continue; 2455 if (MakePred) 2456 std::swap(A, B); 2457 2458 if (DAG->IsReachable(B, A)) 2459 continue; 2460 2461 // tryAddEdge returns false if there is a dependency that makes adding 2462 // the A->B edge impossible, otherwise it returns true; 2463 bool Added = tryAddEdge(A, B); 2464 if (Added) 2465 AddedEdges.emplace_back(A, B); 2466 else 2467 ++MissedEdges; 2468 } 2469 2470 return MissedEdges; 2471 } 2472 2473 void SchedGroup::link(SUnit &SU, bool MakePred) { 2474 for (auto *A : Collection) { 2475 SUnit *B = &SU; 2476 if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 2477 continue; 2478 if (MakePred) 2479 std::swap(A, B); 2480 2481 tryAddEdge(A, B); 2482 } 2483 } 2484 2485 void SchedGroup::link(SUnit &SU, 2486 function_ref<bool(const SUnit *A, const SUnit *B)> P) { 2487 for (auto *A : Collection) { 2488 SUnit *B = &SU; 2489 if (P(A, B)) 2490 std::swap(A, B); 2491 2492 tryAddEdge(A, B); 2493 } 2494 } 2495 2496 void SchedGroup::link(SchedGroup &OtherGroup) { 2497 for (auto *B : OtherGroup.Collection) 2498 link(*B); 2499 } 2500 2501 bool SchedGroup::canAddSU(SUnit &SU) const { 2502 MachineInstr &MI = *SU.getInstr(); 2503 if (MI.getOpcode() != TargetOpcode::BUNDLE) 2504 return canAddMI(MI); 2505 2506 // Special case for bundled MIs. 2507 const MachineBasicBlock *MBB = MI.getParent(); 2508 MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; 2509 while (E != MBB->end() && E->isBundledWithPred()) 2510 ++E; 2511 2512 // Return true if all of the bundled MIs can be added to this group. 
2513 return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); }); 2514 } 2515 2516 void SchedGroup::initSchedGroup() { 2517 for (auto &SU : DAG->SUnits) { 2518 if (isFull()) 2519 break; 2520 2521 if (canAddSU(SU)) 2522 add(SU); 2523 } 2524 } 2525 2526 void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, 2527 SUnitsToCandidateSGsMap &SyncedInstrs) { 2528 SUnit &InitSU = *RIter; 2529 for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { 2530 auto &SU = *RIter; 2531 if (isFull()) 2532 break; 2533 2534 if (canAddSU(SU)) 2535 SyncedInstrs[&SU].push_back(SGID); 2536 } 2537 2538 add(InitSU); 2539 assert(MaxSize); 2540 (*MaxSize)++; 2541 } 2542 2543 void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { 2544 auto I = DAG->SUnits.rbegin(); 2545 auto E = DAG->SUnits.rend(); 2546 for (; I != E; ++I) { 2547 auto &SU = *I; 2548 if (isFull()) 2549 break; 2550 if (canAddSU(SU)) 2551 SyncedInstrs[&SU].push_back(SGID); 2552 } 2553 } 2554 2555 void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { 2556 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); 2557 if (!TSchedModel || DAGInstrs->SUnits.empty()) 2558 return; 2559 2560 LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); 2561 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); 2562 TII = ST.getInstrInfo(); 2563 DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); 2564 SyncedSchedGroups.clear(); 2565 SyncedInstrs.clear(); 2566 bool FoundSB = false; 2567 bool FoundIGLP = false; 2568 bool ShouldApplyIGLP = false; 2569 for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { 2570 unsigned Opc = R->getInstr()->getOpcode(); 2571 // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive. 2572 if (Opc == AMDGPU::SCHED_BARRIER) { 2573 addSchedBarrierEdges(*R); 2574 FoundSB = true; 2575 } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) { 2576 initSchedGroupBarrierPipelineStage(R); 2577 FoundSB = true; 2578 } else if (Opc == AMDGPU::IGLP_OPT) { 2579 resetEdges(*R, DAG); 2580 if (!FoundSB && !FoundIGLP) { 2581 FoundIGLP = true; 2582 ShouldApplyIGLP = initIGLPOpt(*R); 2583 } 2584 } 2585 } 2586 2587 if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) { 2588 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp); 2589 // PipelineSolver performs the mutation by adding the edges it 2590 // determined as the best 2591 PS.solve(); 2592 return; 2593 } 2594 } 2595 2596 void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { 2597 MachineInstr &MI = *SchedBarrier.getInstr(); 2598 assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); 2599 // Remove all existing edges from the SCHED_BARRIER that were added due to the 2600 // instruction having side effects. 2601 resetEdges(SchedBarrier, DAG); 2602 LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: " 2603 << MI.getOperand(0).getImm() << "\n"); 2604 auto InvertedMask = 2605 invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm()); 2606 SchedGroup SG(InvertedMask, std::nullopt, DAG, TII); 2607 SG.initSchedGroup(); 2608 2609 // Preserve original instruction ordering relative to the SCHED_BARRIER. 
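  // The predicate below compares NodeNum, which reflects the original
  // instruction order in the region: SUnits numbered after the SCHED_BARRIER
  // become its successors, and earlier ones its predecessors.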
2610   SG.link(
2611       SchedBarrier,
2612       (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
2613           const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
2614 }
2615 
2616 SchedGroupMask
2617 IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
2618   // Invert mask and erase bits for types of instructions that are implied to be
2619   // allowed past the SCHED_BARRIER.
2620   SchedGroupMask InvertedMask = ~Mask;
2621 
2622   // ALU implies VALU, SALU, MFMA, TRANS.
2623   if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
2624     InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
2625                     ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
2626   // VALU, SALU, MFMA, TRANS imply ALU.
2627   else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
2628            (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
2629            (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
2630            (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
2631     InvertedMask &= ~SchedGroupMask::ALU;
2632 
2633   // VMEM implies VMEM_READ, VMEM_WRITE.
2634   if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
2635     InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
2636   // VMEM_READ, VMEM_WRITE imply VMEM.
2637   else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
2638            (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
2639     InvertedMask &= ~SchedGroupMask::VMEM;
2640 
2641   // DS implies DS_READ, DS_WRITE.
2642   if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
2643     InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
2644   // DS_READ, DS_WRITE imply DS.
2645   else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
2646            (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
2647     InvertedMask &= ~SchedGroupMask::DS;
2648 
2649   LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
2650                     << "\n");
2651 
2652   return InvertedMask;
2653 }
2654 
2655 void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
2656     std::vector<SUnit>::reverse_iterator RIter) {
2657   // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
2658   // to the instruction having side effects.
2659   resetEdges(*RIter, DAG);
2660   MachineInstr &SGB = *RIter->getInstr();
2661   assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
2662   int32_t SGMask = SGB.getOperand(0).getImm();
2663   int32_t Size = SGB.getOperand(1).getImm();
2664   int32_t SyncID = SGB.getOperand(2).getImm();
2665 
2666   auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
2667                                                     Size, SyncID, DAG, TII);
2668 
2669   SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
2670 }
2671 
2672 bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
2673   IGLPStrategyID StrategyID =
2674       (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
2675   auto S = createIGLPStrategy(StrategyID, DAG, TII);
2676   if (!S->shouldApplyStrategy(DAG, Phase))
2677     return false;
2678 
2679   IsBottomUp = S->IsBottomUp;
2680   return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
2681 }
2682 
2683 } // namespace
2684 
2685 namespace llvm {
2686 
2687 /// \p Phase specifies whether or not this is a reentry into the
2688 /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
2689 /// same scheduling region (e.g. pre- and post-RA scheduling / multiple
2690 /// scheduling "phases"), we can reenter this mutation framework more than once
2691 /// for a given region.
2692 std::unique_ptr<ScheduleDAGMutation>
2693 createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
2694   return std::make_unique<IGroupLPDAGMutation>(Phase);
2695 }
2696 
2697 } // end namespace llvm
2698 
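// Illustrative usage sketch (not part of this file's interface beyond the
// factory above): a scheduler pass that builds a ScheduleDAGMI would typically
// attach this mutation when it constructs the DAG, and attach it again for a
// later pass so the strategies above can distinguish reentry via Phase, e.g.:
//   DAG->addMutation(
//       createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
//   ...
//   DAG->addMutation(
//       createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));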