1 //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file This file defines a set of schedule DAG mutations that can be used to
10 // override default scheduler behavior to enforce specific scheduling patterns.
11 // They should be used in cases where runtime performance considerations such as
12 // inter-wavefront interactions mean that compile-time heuristics cannot
13 // predict the optimal instruction ordering, or in kernels where optimum
14 // instruction scheduling is important enough to warrant manual intervention.
15 //
16 //===----------------------------------------------------------------------===//
17
18 #include "AMDGPUIGroupLP.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIInstrInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "llvm/ADT/BitmaskEnum.h"
24 #include "llvm/ADT/DenseMap.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetOpcodes.h"
27
28 using namespace llvm;
29
30 #define DEBUG_TYPE "igrouplp"
31
32 namespace {
33
34 static cl::opt<bool> EnableExactSolver(
35 "amdgpu-igrouplp-exact-solver", cl::Hidden,
36 cl::desc("Whether to use the exponential time solver to fit "
37 "the instructions to the pipeline as closely as "
38 "possible."),
39 cl::init(false));
40
41 static cl::opt<unsigned> CutoffForExact(
42 "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
43 cl::desc("The maximum number of scheduling group conflicts "
44 "which we attempt to solve with the exponential time "
45 "exact solver. Problem sizes greater than this will "
46 "be solved by the less accurate greedy algorithm. Selecting "
47 "solver by size is superseded by manually selecting "
48 "the solver (e.g. by amdgpu-igrouplp-exact-solver)."));
49
50 static cl::opt<uint64_t> MaxBranchesExplored(
51 "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
52 cl::desc("The number of branches that we are willing to explore with "
53 "the exact algorithm before giving up."));
54
55 static cl::opt<bool> UseCostHeur(
56 "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
57 cl::desc("Whether to use the cost heuristic to make choices as we "
58 "traverse the search space using the exact solver. Defaulted "
59 "to on, and if turned off, we will use the node order -- "
60 "attempting to put the later nodes in the later sched groups. "
61 "Experimentally, results are mixed, so this should be set on a "
62 "case-by-case basis."));
63
64 // Components of the mask that determines which instruction types may be
65 // classified into a SchedGroup.
66 enum class SchedGroupMask {
67 NONE = 0u,
68 ALU = 1u << 0,
69 VALU = 1u << 1,
70 SALU = 1u << 2,
71 MFMA = 1u << 3,
72 VMEM = 1u << 4,
73 VMEM_READ = 1u << 5,
74 VMEM_WRITE = 1u << 6,
75 DS = 1u << 7,
76 DS_READ = 1u << 8,
77 DS_WRITE = 1u << 9,
78 ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
79 DS_READ | DS_WRITE,
80 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
81 };
82
83 class SchedGroup;
84
85 // InstructionRule class is used to enact a filter which determines whether or
86 // not an SU maps to a given SchedGroup.
It contains complementary data 87 // structures (e.g Cache) to help those filters. 88 class InstructionRule { 89 protected: 90 const SIInstrInfo *TII; 91 unsigned SGID; 92 // A cache made available to the Filter to store SUnits for subsequent 93 // invocations of the Filter 94 std::optional<SmallVector<SUnit *, 4>> Cache; 95 96 public: 97 virtual bool 98 apply(const SUnit *, const ArrayRef<SUnit *>, 99 SmallVectorImpl<SchedGroup> &) { 100 return true; 101 }; 102 103 InstructionRule(const SIInstrInfo *TII, unsigned SGID, 104 bool NeedsCache = false) 105 : TII(TII), SGID(SGID) { 106 if (NeedsCache) { 107 Cache = SmallVector<SUnit *, 4>(); 108 } 109 } 110 111 virtual ~InstructionRule() = default; 112 }; 113 114 typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap; 115 116 // Classify instructions into groups to enable fine tuned control over the 117 // scheduler. These groups may be more specific than current SchedModel 118 // instruction classes. 119 class SchedGroup { 120 private: 121 // Mask that defines which instruction types can be classified into this 122 // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER 123 // and SCHED_GROUP_BARRIER. 124 SchedGroupMask SGMask; 125 126 // Maximum number of SUnits that can be added to this group. 127 std::optional<unsigned> MaxSize; 128 129 // SchedGroups will only synchronize with other SchedGroups that have the same 130 // SyncID. 131 int SyncID = 0; 132 133 // SGID is used to map instructions to candidate SchedGroups 134 unsigned SGID; 135 136 // The different rules each instruction in this SchedGroup must conform to 137 SmallVector<std::shared_ptr<InstructionRule>, 4> Rules; 138 139 // Count of the number of created SchedGroups, used to initialize SGID. 140 static unsigned NumSchedGroups; 141 142 const SIInstrInfo *TII; 143 144 // Try to add and edge from SU A to SU B. 145 bool tryAddEdge(SUnit *A, SUnit *B); 146 147 // Use SGMask to determine whether we can classify MI as a member of this 148 // SchedGroup object. 149 bool canAddMI(const MachineInstr &MI) const; 150 151 public: 152 // Collection of SUnits that are classified as members of this group. 153 SmallVector<SUnit *, 32> Collection; 154 155 ScheduleDAGInstrs *DAG; 156 157 // Returns true if SU can be added to this SchedGroup. 158 bool canAddSU(SUnit &SU) const; 159 160 // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If 161 // MakePred is true, SU will be a predecessor of the SUnits in this 162 // SchedGroup, otherwise SU will be a successor. 163 void link(SUnit &SU, bool MakePred = false); 164 165 // Add DAG dependencies and track which edges are added, and the count of 166 // missed edges 167 int link(SUnit &SU, bool MakePred, 168 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); 169 170 // Add DAG dependencies from all SUnits in this SchedGroup and this SU. 171 // Use the predicate to determine whether SU should be a predecessor (P = 172 // true) or a successor (P = false) of this SchedGroup. 173 void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P); 174 175 // Add DAG dependencies such that SUnits in this group shall be ordered 176 // before SUnits in OtherGroup. 177 void link(SchedGroup &OtherGroup); 178 179 // Returns true if no more instructions may be added to this group. 180 bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; } 181 182 // Append a constraint that SUs must meet in order to fit into this 183 // SchedGroup. 
Since many rules involve the relationship between a SchedGroup 184 // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve 185 // time (rather than SchedGroup init time.) 186 void addRule(std::shared_ptr<InstructionRule> NewRule) { 187 Rules.push_back(NewRule); 188 } 189 190 // Returns true if the SU matches all rules 191 bool allowedByRules(const SUnit *SU, 192 SmallVectorImpl<SchedGroup> &SyncPipe) const { 193 if (Rules.empty()) 194 return true; 195 for (size_t I = 0; I < Rules.size(); I++) { 196 auto TheRule = Rules[I].get(); 197 if (!TheRule->apply(SU, Collection, SyncPipe)) { 198 return false; 199 } 200 } 201 return true; 202 } 203 204 // Add SU to the SchedGroup. 205 void add(SUnit &SU) { 206 LLVM_DEBUG(dbgs() << "For SchedGroup with mask " 207 << format_hex((int)SGMask, 10, true) << " adding " 208 << *SU.getInstr()); 209 Collection.push_back(&SU); 210 } 211 212 // Remove last element in the SchedGroup 213 void pop() { Collection.pop_back(); } 214 215 // Identify and add all relevant SUs from the DAG to this SchedGroup. 216 void initSchedGroup(); 217 218 // Add instructions to the SchedGroup bottom up starting from RIter. 219 // PipelineInstrs is a set of instructions that should not be added to the 220 // SchedGroup even when the other conditions for adding it are satisfied. 221 // RIter will be added to the SchedGroup as well, and dependencies will be 222 // added so that RIter will always be scheduled at the end of the group. 223 void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, 224 SUnitsToCandidateSGsMap &SyncedInstrs); 225 226 void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs); 227 228 int getSyncID() { return SyncID; } 229 230 int getSGID() { return SGID; } 231 232 SchedGroupMask getMask() { return SGMask; } 233 234 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, 235 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 236 : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) { 237 SGID = NumSchedGroups++; 238 } 239 240 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID, 241 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 242 : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) { 243 SGID = NumSchedGroups++; 244 } 245 }; 246 247 // Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER. 248 static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) { 249 assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER || 250 SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || 251 SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT); 252 253 while (!SU.Preds.empty()) 254 for (auto &P : SU.Preds) 255 SU.removePred(P); 256 257 while (!SU.Succs.empty()) 258 for (auto &S : SU.Succs) 259 for (auto &SP : S.getSUnit()->Preds) 260 if (SP.getSUnit() == &SU) 261 S.getSUnit()->removePred(SP); 262 } 263 264 typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair; 265 typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec; 266 267 // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline 268 // in non-trivial cases. For example, if the requested pipeline is 269 // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction 270 // in the DAG, then we will have an instruction that can not be trivially 271 // assigned to a SchedGroup. The PipelineSolver class implements two algorithms 272 // to find a good solution to the pipeline -- a greedy algorithm and an exact 273 // algorithm. 
The exact algorithm has an exponential time complexity and should
274 // only be used for small problems, or for medium-sized problems where an exact
275 // solution is highly desired.
276 class PipelineSolver {
277 ScheduleDAGMI *DAG;
278
279 // Instructions that can be assigned to multiple SchedGroups
280 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
281 SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
282 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
283 // The current working pipeline
284 SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
285 // The pipeline that has the best solution found so far
286 SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;
287
288 // Whether or not we actually have any SyncedInstrs to try to solve.
289 bool NeedsSolver = false;
290
291 // Compute an estimate of the size of the search tree -- the true size is
292 // the product of each conflictedInst.Matches.size() across all SyncPipelines
293 unsigned computeProblemSize();
294
295 // The cost penalty of not assigning a SU to a SchedGroup
296 int MissPenalty = 0;
297
298 // Costs in terms of the number of edges we are unable to add
299 int BestCost = -1;
300 int CurrCost = 0;
301
302 // Index pointing to the conflicting instruction that is currently being
303 // fitted
304 int CurrConflInstNo = 0;
305 // Index to the pipeline that is currently being fitted
306 int CurrSyncGroupIdx = 0;
307 // The first non-trivial pipeline
308 int BeginSyncGroupIdx = 0;
309
310 // How many branches we have explored
311 uint64_t BranchesExplored = 0;
312
313 // The direction in which we process the candidate SchedGroups per SU
314 bool IsBottomUp = 1;
315
316 // Update indices to fit the next conflicting instruction
317 void advancePosition();
318 // Recede indices to attempt to find a better fit for the previous conflicting
319 // instruction
320 void retreatPosition();
321
322 // The exponential time algorithm which finds the provably best fit
323 bool solveExact();
324 // The polynomial time algorithm which attempts to find a good fit
325 bool solveGreedy();
326 // Find the best SchedGroup for the current SU using the heuristic given all
327 // current information. One step in the greedy algorithm. Templated against
328 // the SchedGroup iterator (either reverse or forward).
329 template <typename T>
330 void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
331 T E);
332 // Whether or not the current solution is optimal
333 bool checkOptimal();
334 // Populate the ready list, prioritizing fewest missed edges first.
335 // Templated against the SchedGroup iterator (either reverse or forward).
336 template <typename T>
337 void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
338 T E);
339 // Add edges corresponding to the SchedGroups as assigned by the solver
340 void makePipeline();
341 // Link the SchedGroups in the best found pipeline.
342 // Templated against the SchedGroup iterator (either reverse or forward).
343 template <typename T> void linkSchedGroups(T I, T E);
344 // Add the edges from the SU to the other SchedGroups in the pipeline, and
345 // return the number of edges missed.
346 int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
347 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
348 // Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID.
It 349 // returns the cost (in terms of missed pipeline edges), and tracks the edges 350 // added in \p AddedEdges 351 template <typename T> 352 int linkSUnit(SUnit *SU, int SGID, 353 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E); 354 // Remove the edges passed via \p AddedEdges 355 void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); 356 // Convert the passed in maps to arrays for bidirectional iterators 357 void convertSyncMapsToArrays(); 358 359 void reset(); 360 361 public: 362 // Invoke the solver to map instructions to instruction groups. Heuristic && 363 // command-line-option determines to use exact or greedy algorithm. 364 void solve(); 365 366 PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, 367 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 368 ScheduleDAGMI *DAG, bool IsBottomUp = 1) 369 : DAG(DAG), SyncedInstrs(SyncedInstrs), 370 SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) { 371 372 for (auto &PipelineInstrs : SyncedInstrs) { 373 if (PipelineInstrs.second.size() > 0) { 374 NeedsSolver = true; 375 break; 376 } 377 } 378 379 if (!NeedsSolver) 380 return; 381 382 convertSyncMapsToArrays(); 383 384 CurrPipeline = BestPipeline; 385 386 while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() && 387 PipelineInstrs[BeginSyncGroupIdx].size() == 0) 388 ++BeginSyncGroupIdx; 389 390 if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size()) 391 return; 392 } 393 }; 394 395 void PipelineSolver::reset() { 396 397 for (auto &SyncPipeline : CurrPipeline) { 398 for (auto &SG : SyncPipeline) { 399 SmallVector<SUnit *, 32> TempCollection = SG.Collection; 400 SG.Collection.clear(); 401 auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) { 402 return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER; 403 }); 404 if (SchedBarr != TempCollection.end()) 405 SG.Collection.push_back(*SchedBarr); 406 } 407 } 408 409 CurrSyncGroupIdx = BeginSyncGroupIdx; 410 CurrConflInstNo = 0; 411 CurrCost = 0; 412 } 413 414 void PipelineSolver::convertSyncMapsToArrays() { 415 for (auto &SyncPipe : SyncedSchedGroups) { 416 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second); 417 } 418 419 int PipelineIDx = SyncedInstrs.size() - 1; 420 PipelineInstrs.resize(SyncedInstrs.size()); 421 for (auto &SyncInstrMap : SyncedInstrs) { 422 for (auto &SUsToCandSGs : SyncInstrMap.second) { 423 if (PipelineInstrs[PipelineIDx].size() == 0) { 424 PipelineInstrs[PipelineIDx].push_back( 425 std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); 426 continue; 427 } 428 auto SortPosition = PipelineInstrs[PipelineIDx].begin(); 429 // Insert them in sorted order -- this allows for good parsing order in 430 // the greedy algorithm 431 while (SortPosition != PipelineInstrs[PipelineIDx].end() && 432 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum) 433 ++SortPosition; 434 PipelineInstrs[PipelineIDx].insert( 435 SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); 436 } 437 --PipelineIDx; 438 } 439 } 440 441 template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) { 442 for (; I != E; ++I) { 443 auto &GroupA = *I; 444 for (auto J = std::next(I); J != E; ++J) { 445 auto &GroupB = *J; 446 GroupA.link(GroupB); 447 } 448 } 449 } 450 451 void PipelineSolver::makePipeline() { 452 // Preserve the order of barrier for subsequent SchedGroupBarrier mutations 453 for (auto &SyncPipeline : BestPipeline) { 454 LLVM_DEBUG(dbgs() << "Printing SchedGroups\n"); 455 for (auto &SG : 
SyncPipeline) { 456 LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID() 457 << " has: \n"); 458 SUnit *SGBarr = nullptr; 459 for (auto &SU : SG.Collection) { 460 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 461 SGBarr = SU; 462 LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n"); 463 } 464 // Command line requested IGroupLP doesn't have SGBarr 465 if (!SGBarr) 466 continue; 467 resetEdges(*SGBarr, DAG); 468 SG.link(*SGBarr, false); 469 } 470 } 471 472 for (auto &SyncPipeline : BestPipeline) { 473 IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend()) 474 : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end()); 475 } 476 } 477 478 template <typename T> 479 int PipelineSolver::linkSUnit( 480 SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, 481 T I, T E) { 482 bool MakePred = false; 483 int AddedCost = 0; 484 for (; I < E; ++I) { 485 if (I->getSGID() == SGID) { 486 MakePred = true; 487 continue; 488 } 489 auto Group = *I; 490 AddedCost += Group.link(*SU, MakePred, AddedEdges); 491 assert(AddedCost >= 0); 492 } 493 return AddedCost; 494 } 495 496 int PipelineSolver::addEdges( 497 SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, 498 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { 499 500 // For IsBottomUp, the first SchedGroup in SyncPipeline contains the 501 // instructions that are the ultimate successors in the resultant mutation. 502 // Therefore, in such a configuration, the SchedGroups occurring before the 503 // candidate SGID are successors of the candidate SchedGroup, thus the current 504 // SU should be linked as a predecessor to SUs in those SchedGroups. The 505 // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple 506 // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using 507 // IsBottomUp (in reverse). 508 return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(), 509 SyncPipeline.rend()) 510 : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(), 511 SyncPipeline.end()); 512 } 513 514 void PipelineSolver::removeEdges( 515 const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) { 516 // Only remove the edges that we have added when testing 517 // the fit. 
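// Each (Pred, Succ) pair in \p EdgesToRemove was recorded while a tentative
// fit was being scored, so it should name an artificial edge added by this
// solver; the assert below guards against stripping a genuine data dependence.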
518 for (auto &PredSuccPair : EdgesToRemove) { 519 SUnit *Pred = PredSuccPair.first; 520 SUnit *Succ = PredSuccPair.second; 521 522 auto Match = llvm::find_if( 523 Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; }); 524 if (Match != Succ->Preds.end()) { 525 assert(Match->isArtificial()); 526 Succ->removePred(*Match); 527 } 528 } 529 } 530 531 void PipelineSolver::advancePosition() { 532 ++CurrConflInstNo; 533 534 if (static_cast<size_t>(CurrConflInstNo) >= 535 PipelineInstrs[CurrSyncGroupIdx].size()) { 536 CurrConflInstNo = 0; 537 ++CurrSyncGroupIdx; 538 // Advance to next non-trivial pipeline 539 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() && 540 PipelineInstrs[CurrSyncGroupIdx].size() == 0) 541 ++CurrSyncGroupIdx; 542 } 543 } 544 545 void PipelineSolver::retreatPosition() { 546 assert(CurrConflInstNo >= 0); 547 assert(CurrSyncGroupIdx >= 0); 548 549 if (CurrConflInstNo > 0) { 550 --CurrConflInstNo; 551 return; 552 } 553 554 if (CurrConflInstNo == 0) { 555 // If we return to the starting position, we have explored 556 // the entire tree 557 if (CurrSyncGroupIdx == BeginSyncGroupIdx) 558 return; 559 560 --CurrSyncGroupIdx; 561 // Go to previous non-trivial pipeline 562 while (PipelineInstrs[CurrSyncGroupIdx].size() == 0) 563 --CurrSyncGroupIdx; 564 565 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1; 566 } 567 } 568 569 bool PipelineSolver::checkOptimal() { 570 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) { 571 if (BestCost == -1 || CurrCost < BestCost) { 572 BestPipeline = CurrPipeline; 573 BestCost = CurrCost; 574 LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n"); 575 } 576 assert(BestCost >= 0); 577 } 578 579 bool DoneExploring = false; 580 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored) 581 DoneExploring = true; 582 583 return (DoneExploring || BestCost == 0); 584 } 585 586 template <typename T> 587 void PipelineSolver::populateReadyList( 588 SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) { 589 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 590 auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 591 assert(CurrSU.second.size() >= 1); 592 593 for (; I != E; ++I) { 594 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 595 int CandSGID = *I; 596 SchedGroup *Match; 597 for (auto &SG : SyncPipeline) { 598 if (SG.getSGID() == CandSGID) 599 Match = &SG; 600 } 601 602 if (UseCostHeur) { 603 if (Match->isFull()) { 604 ReadyList.push_back(std::pair(*I, MissPenalty)); 605 continue; 606 } 607 608 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 609 ReadyList.push_back(std::pair(*I, TempCost)); 610 removeEdges(AddedEdges); 611 } else 612 ReadyList.push_back(std::pair(*I, -1)); 613 } 614 615 if (UseCostHeur) { 616 std::sort(ReadyList.begin(), ReadyList.end(), 617 [](std::pair<int, int> A, std::pair<int, int> B) { 618 return A.second < B.second; 619 }); 620 } 621 622 assert(ReadyList.size() == CurrSU.second.size()); 623 } 624 625 bool PipelineSolver::solveExact() { 626 if (checkOptimal()) 627 return true; 628 629 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) 630 return false; 631 632 assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()); 633 assert(static_cast<size_t>(CurrConflInstNo) < 634 PipelineInstrs[CurrSyncGroupIdx].size()); 635 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 636 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum 637 << ") 
in Pipeline # " << CurrSyncGroupIdx << "\n"); 638 639 // SchedGroup -> Cost pairs 640 SmallVector<std::pair<int, int>, 4> ReadyList; 641 // Prioritize the candidate sched groups in terms of lowest cost first 642 IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(), 643 CurrSU.second.rend()) 644 : populateReadyList(ReadyList, CurrSU.second.begin(), 645 CurrSU.second.end()); 646 647 auto I = ReadyList.begin(); 648 auto E = ReadyList.end(); 649 for (; I != E; ++I) { 650 // If we are trying SGs in least cost order, and the current SG is cost 651 // infeasible, then all subsequent SGs will also be cost infeasible, so we 652 // can prune. 653 if (BestCost != -1 && (CurrCost + I->second > BestCost)) 654 return false; 655 656 int CandSGID = I->first; 657 int AddedCost = 0; 658 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 659 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 660 SchedGroup *Match; 661 for (auto &SG : SyncPipeline) { 662 if (SG.getSGID() == CandSGID) 663 Match = &SG; 664 } 665 666 if (Match->isFull()) 667 continue; 668 669 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) 670 continue; 671 672 LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask " 673 << (int)Match->getMask() << "and ID " << CandSGID 674 << "\n"); 675 Match->add(*CurrSU.first); 676 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 677 LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n"); 678 CurrCost += AddedCost; 679 advancePosition(); 680 ++BranchesExplored; 681 bool FinishedExploring = false; 682 // If the Cost after adding edges is greater than a known solution, 683 // backtrack 684 if (CurrCost < BestCost || BestCost == -1) { 685 if (solveExact()) { 686 FinishedExploring = BestCost != 0; 687 if (!FinishedExploring) 688 return true; 689 } 690 } 691 692 retreatPosition(); 693 CurrCost -= AddedCost; 694 removeEdges(AddedEdges); 695 Match->pop(); 696 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; 697 if (FinishedExploring) 698 return true; 699 } 700 701 // Try the pipeline where the current instruction is omitted 702 // Potentially if we omit a problematic instruction from the pipeline, 703 // all the other instructions can nicely fit. 704 CurrCost += MissPenalty; 705 advancePosition(); 706 707 LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n"); 708 709 bool FinishedExploring = false; 710 if (CurrCost < BestCost || BestCost == -1) { 711 if (solveExact()) { 712 bool FinishedExploring = BestCost != 0; 713 if (!FinishedExploring) 714 return true; 715 } 716 } 717 718 retreatPosition(); 719 CurrCost -= MissPenalty; 720 return FinishedExploring; 721 } 722 723 template <typename T> 724 void PipelineSolver::greedyFind( 725 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) { 726 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 727 int BestNodeCost = -1; 728 int TempCost; 729 SchedGroup *BestGroup = nullptr; 730 int BestGroupID = -1; 731 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; 732 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum 733 << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); 734 735 // Since we have added the potential SchedGroups from bottom up, but 736 // traversed the DAG from top down, parse over the groups from last to 737 // first. If we fail to do this for the greedy algorithm, the solution will 738 // likely not be good in more complex cases. 
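// For each candidate SchedGroup in the order given by [I, E): skip it if it
// is full or fails its rules; otherwise tentatively add the pipeline edges to
// score the cost, then roll them back. The lowest-cost candidate receives the
// SU, and a zero-cost match ends the search early.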
739 for (; I != E; ++I) { 740 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 741 int CandSGID = *I; 742 SchedGroup *Match; 743 for (auto &SG : SyncPipeline) { 744 if (SG.getSGID() == CandSGID) 745 Match = &SG; 746 } 747 748 LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " 749 << (int)Match->getMask() << "\n"); 750 751 if (Match->isFull()) { 752 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n"); 753 continue; 754 } 755 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) { 756 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n"); 757 continue; 758 } 759 TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); 760 LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n"); 761 if (TempCost < BestNodeCost || BestNodeCost == -1) { 762 BestGroup = Match; 763 BestNodeCost = TempCost; 764 BestGroupID = CandSGID; 765 } 766 removeEdges(AddedEdges); 767 if (BestNodeCost == 0) 768 break; 769 } 770 771 if (BestGroupID != -1) { 772 BestGroup->add(*CurrSU.first); 773 addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); 774 LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" 775 << (int)BestGroup->getMask() << "\n"); 776 BestCost += TempCost; 777 } else 778 BestCost += MissPenalty; 779 780 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; 781 } 782 783 bool PipelineSolver::solveGreedy() { 784 BestCost = 0; 785 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; 786 787 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) { 788 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; 789 IsBottomUp 790 ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend()) 791 : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end()); 792 advancePosition(); 793 } 794 BestPipeline = CurrPipeline; 795 removeEdges(AddedEdges); 796 return false; 797 } 798 799 unsigned PipelineSolver::computeProblemSize() { 800 unsigned ProblemSize = 0; 801 for (auto &PipeConflicts : PipelineInstrs) { 802 ProblemSize += PipeConflicts.size(); 803 } 804 805 return ProblemSize; 806 } 807 808 void PipelineSolver::solve() { 809 if (!NeedsSolver) 810 return; 811 812 unsigned ProblemSize = computeProblemSize(); 813 assert(ProblemSize > 0); 814 815 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact; 816 MissPenalty = (ProblemSize / 2) + 1; 817 818 LLVM_DEBUG(DAG->dump()); 819 if (EnableExactSolver || BelowCutoff) { 820 LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n"); 821 solveGreedy(); 822 reset(); 823 LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n"); 824 if (BestCost > 0) { 825 LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n"); 826 solveExact(); 827 LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n"); 828 } 829 } else { // Use the Greedy Algorithm by default 830 LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n"); 831 solveGreedy(); 832 } 833 834 makePipeline(); 835 LLVM_DEBUG(dbgs() << "After applying mutation\n"); 836 LLVM_DEBUG(DAG->dump()); 837 } 838 839 enum IGLPStrategyID : int { 840 MFMASmallGemmOptID = 0, 841 MFMASmallGemmSingleWaveOptID = 1, 842 }; 843 844 // Implement a IGLP scheduling strategy. 845 class IGLPStrategy { 846 protected: 847 ScheduleDAGInstrs *DAG; 848 849 const SIInstrInfo *TII; 850 851 public: 852 // Add SchedGroups to \p Pipeline to implement this Strategy. 
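// A strategy populates \p SyncedSchedGroups (keyed by SyncID) with the groups
// that make up its pipeline and records, per SU, the candidate groups in
// \p SyncedInstrs; the PipelineSolver then assigns each recorded SU to one of
// its candidate groups.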
853 virtual void applyIGLPStrategy( 854 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 855 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0; 856 857 // Returns true if this strategy should be applied to a ScheduleDAG. 858 virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0; 859 860 bool IsBottomUp = 1; 861 862 IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 863 : DAG(DAG), TII(TII) {} 864 865 virtual ~IGLPStrategy() = default; 866 }; 867 868 class MFMASmallGemmOpt final : public IGLPStrategy { 869 private: 870 public: 871 void applyIGLPStrategy( 872 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 873 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override; 874 875 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } 876 877 MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 878 : IGLPStrategy(DAG, TII) { 879 IsBottomUp = 1; 880 } 881 }; 882 883 void MFMASmallGemmOpt::applyIGLPStrategy( 884 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 885 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) { 886 // Count the number of MFMA instructions. 887 unsigned MFMACount = 0; 888 for (const MachineInstr &I : *DAG) 889 if (TII->isMFMAorWMMA(I)) 890 ++MFMACount; 891 892 const unsigned PipelineSyncID = 0; 893 SchedGroup *SG = nullptr; 894 for (unsigned I = 0; I < MFMACount * 3; ++I) { 895 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 896 SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII); 897 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 898 899 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 900 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 901 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 902 } 903 } 904 905 class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { 906 private: 907 // Whether the DS_READ is a predecessor of first four MFMA in region 908 class EnablesInitialMFMA final : public InstructionRule { 909 public: 910 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 911 SmallVectorImpl<SchedGroup> &SyncPipe) override { 912 if (!SyncPipe.size()) 913 return false; 914 int MFMAsFound = 0; 915 if (!Cache->size()) { 916 for (auto &Elt : SyncPipe[0].DAG->SUnits) { 917 if (TII->isMFMAorWMMA(*Elt.getInstr())) { 918 ++MFMAsFound; 919 if (MFMAsFound > 4) 920 break; 921 Cache->push_back(&Elt); 922 } 923 } 924 } 925 926 assert(Cache->size()); 927 auto DAG = SyncPipe[0].DAG; 928 for (auto &Elt : *Cache) { 929 if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU))) 930 return true; 931 } 932 return false; 933 } 934 935 EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID, 936 bool NeedsCache = false) 937 : InstructionRule(TII, SGID, NeedsCache) {} 938 }; 939 940 // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE 941 class IsPermForDSW final : public InstructionRule { 942 public: 943 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 944 SmallVectorImpl<SchedGroup> &SyncPipe) override { 945 auto MI = SU->getInstr(); 946 if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64) 947 return false; 948 949 bool FitsInGroup = false; 950 // Does the VALU have a DS_WRITE successor 951 if (!Collection.size()) { 952 for (auto &Succ : SU->Succs) { 953 SUnit *SuccUnit = Succ.getSUnit(); 954 if (TII->isDS(*SuccUnit->getInstr()) && 955 SuccUnit->getInstr()->mayStore()) { 956 Cache->push_back(SuccUnit); 957 FitsInGroup = true; 958 } 959 } 960 return FitsInGroup; 961 } 962 963 assert(Cache->size()); 964 965 // Does the VALU have a 
DS_WRITE successor that is the same as other 966 // VALU already in the group. The V_PERMs will all share 1 DS_W succ 967 return std::any_of(Cache->begin(), Cache->end(), [&SU](SUnit *Elt) { 968 return std::any_of(SU->Succs.begin(), SU->Succs.end(), 969 [&Elt](const SDep &ThisSucc) { 970 return ThisSucc.getSUnit() == Elt; 971 }); 972 }); 973 } 974 975 IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 976 : InstructionRule(TII, SGID, NeedsCache) {} 977 }; 978 979 // Whether the SU is a successor of any element in previous SchedGroup 980 class IsSuccOfPrevGroup final : public InstructionRule { 981 public: 982 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 983 SmallVectorImpl<SchedGroup> &SyncPipe) override { 984 SchedGroup *OtherGroup = nullptr; 985 for (auto &PipeSG : SyncPipe) { 986 if ((unsigned)PipeSG.getSGID() == SGID - 1) { 987 OtherGroup = &PipeSG; 988 } 989 } 990 991 if (!OtherGroup) 992 return false; 993 if (!OtherGroup->Collection.size()) 994 return true; 995 996 // Does the previous VALU have this DS_Write as a successor 997 return (std::any_of(OtherGroup->Collection.begin(), 998 OtherGroup->Collection.end(), [&SU](SUnit *Elt) { 999 return std::any_of(Elt->Succs.begin(), 1000 Elt->Succs.end(), 1001 [&SU](SDep &Succ) { 1002 return Succ.getSUnit() == SU; 1003 }); 1004 })); 1005 } 1006 IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID, 1007 bool NeedsCache = false) 1008 : InstructionRule(TII, SGID, NeedsCache) {} 1009 }; 1010 1011 // Whether the combined load width of group is 128 bits 1012 class VMEMSize final : public InstructionRule { 1013 public: 1014 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1015 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1016 auto MI = SU->getInstr(); 1017 if (MI->getOpcode() == TargetOpcode::BUNDLE) 1018 return false; 1019 if (!Collection.size()) 1020 return true; 1021 1022 int NumBits = 0; 1023 1024 auto TRI = TII->getRegisterInfo(); 1025 auto &MRI = MI->getParent()->getParent()->getRegInfo(); 1026 for (auto &Elt : Collection) { 1027 auto Op = Elt->getInstr()->getOperand(0); 1028 auto Size = 1029 TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op)); 1030 NumBits += Size; 1031 } 1032 1033 if (NumBits < 128) { 1034 assert(TII->isVMEM(*MI) && MI->mayLoad()); 1035 if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg( 1036 MRI, MI->getOperand(0))) <= 1037 128) 1038 return true; 1039 } 1040 1041 return false; 1042 } 1043 1044 VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) 1045 : InstructionRule(TII, SGID, NeedsCache) {} 1046 }; 1047 1048 // Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup 1049 // that is /p Distance steps away 1050 class SharesPredWithPrevNthGroup final : public InstructionRule { 1051 private: 1052 unsigned Distance = 1; 1053 1054 public: 1055 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, 1056 SmallVectorImpl<SchedGroup> &SyncPipe) override { 1057 SchedGroup *OtherGroup = nullptr; 1058 if (!SyncPipe.size()) 1059 return false; 1060 1061 if (!Cache->size()) { 1062 1063 for (auto &PipeSG : SyncPipe) { 1064 if ((unsigned)PipeSG.getSGID() == SGID - Distance) { 1065 OtherGroup = &PipeSG; 1066 } 1067 } 1068 1069 if (!OtherGroup) 1070 return false; 1071 if (!OtherGroup->Collection.size()) 1072 return true; 1073 1074 for (auto &OtherEle : OtherGroup->Collection) { 1075 for (auto &Pred : OtherEle->Preds) { 1076 if (Pred.getSUnit()->getInstr()->getOpcode() == 1077 AMDGPU::V_PERM_B32_e64) 1078 
Cache->push_back(Pred.getSUnit()); 1079 } 1080 } 1081 } 1082 1083 assert(Cache->size()); 1084 auto DAG = SyncPipe[0].DAG; 1085 // Does the previous DS_WRITE share a V_PERM predecessor with this 1086 // VMEM_READ 1087 return ( 1088 std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *Elt) { 1089 return DAG->IsReachable(const_cast<SUnit *>(SU), Elt); 1090 })); 1091 } 1092 SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, 1093 unsigned SGID, bool NeedsCache = false) 1094 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} 1095 }; 1096 1097 public: 1098 void applyIGLPStrategy( 1099 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 1100 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override; 1101 1102 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } 1103 1104 MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) 1105 : IGLPStrategy(DAG, TII) { 1106 IsBottomUp = 0; 1107 } 1108 }; 1109 1110 void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( 1111 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, 1112 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) { 1113 unsigned MFMACount = 0; 1114 unsigned DSWCount = 0; 1115 unsigned DSWWithPermCount = 0; 1116 unsigned DSWWithSharedVMEMCount = 0; 1117 unsigned DSRCount = 0; 1118 SmallVector<SUnit *, 6> DSWithPerms; 1119 for (auto &SU : DAG->SUnits) { 1120 auto I = SU.getInstr(); 1121 if (TII->isMFMAorWMMA(*I)) 1122 ++MFMACount; 1123 else if (TII->isDS(*I)) { 1124 if (I->mayLoad()) 1125 ++DSRCount; 1126 else if (I->mayStore()) { 1127 ++DSWCount; 1128 for (auto Pred : SU.Preds) { 1129 if (Pred.getSUnit()->getInstr()->getOpcode() == 1130 AMDGPU::V_PERM_B32_e64) { 1131 DSWithPerms.push_back(&SU); 1132 break; 1133 } 1134 } 1135 } 1136 } 1137 } 1138 DSWWithPermCount = DSWithPerms.size(); 1139 auto I = DSWithPerms.begin(); 1140 auto E = DSWithPerms.end(); 1141 1142 // Get the count of DS_WRITES with V_PERM predecessors which 1143 // have loop carried dependencies (WAR) on the same VMEM_READs. 1144 // We consider partial overlap as a miss -- in other words, 1145 // for a given DS_W, we only consider another DS_W as matching 1146 // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred 1147 // for every V_PERM pred of this DS_W. 
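// Sketch of the matching below: VMEMLookup maps a VMEM_READ reached through a
// V_PERM pred to the first DS_WRITE seen with it. When a second,
// not-yet-counted DS_WRITE maps to the same VMEM_READ and none of its V_PERM
// preds were missed, the pair is counted (DSWWithSharedVMEMCount += 2).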
1148 DenseMap<MachineInstr *, SUnit *> VMEMLookup; 1149 SmallVector<SUnit *, 6> Counted; 1150 for (; I != E; I++) { 1151 SUnit *Cand = nullptr; 1152 bool MissedAny = false; 1153 for (auto &Pred : (*I)->Preds) { 1154 if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) 1155 continue; 1156 1157 if (Cand && 1158 std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) 1159 break; 1160 1161 for (auto &Succ : Pred.getSUnit()->Succs) { 1162 auto MI = Succ.getSUnit()->getInstr(); 1163 if (!TII->isVMEM(*MI) || !MI->mayLoad()) 1164 continue; 1165 1166 if (MissedAny || !VMEMLookup.size()) { 1167 MissedAny = true; 1168 VMEMLookup[MI] = *I; 1169 continue; 1170 } 1171 1172 if (!VMEMLookup.contains(MI)) { 1173 MissedAny = true; 1174 VMEMLookup[MI] = *I; 1175 continue; 1176 } 1177 1178 Cand = VMEMLookup[MI]; 1179 if (std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) { 1180 MissedAny = true; 1181 break; 1182 } 1183 } 1184 } 1185 if (!MissedAny && Cand) { 1186 DSWWithSharedVMEMCount += 2; 1187 Counted.push_back(Cand); 1188 Counted.push_back(*I); 1189 } 1190 } 1191 1192 assert(DSWWithSharedVMEMCount <= DSWWithPermCount); 1193 SchedGroup *SG; 1194 unsigned PipelineSyncID = 0; 1195 // For kernels with V_PERM, there are enough VALU to mix in between MFMAs 1196 if (DSWWithPermCount) { 1197 for (unsigned I = 0; I < MFMACount; I++) { 1198 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1199 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1200 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1201 1202 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1203 SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); 1204 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1205 } 1206 } 1207 1208 PipelineSyncID = 1; 1209 // Phase 1: Break up DS_READ and MFMA clusters. 1210 // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ 1211 // prefetch 1212 1213 // Make ready initial MFMA 1214 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1215 SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII); 1216 SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true)); 1217 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1218 1219 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1220 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1221 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1222 1223 // Interleave MFMA with DS_READ prefetch 1224 for (unsigned I = 0; I < DSRCount - 4; ++I) { 1225 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1226 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); 1227 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1228 1229 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1230 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1231 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1232 } 1233 1234 // Phase 2a: Loop carried dependency with V_PERM 1235 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they 1236 // depend on. Interleave MFMA to keep XDL unit busy throughout. 
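// Per iteration, the groups created below are, in order:
// VALU(4, IsPermForDSW), DS_WRITE(1, IsSuccOfPrevGroup),
// VMEM_READ(4, SharesPredWithPrevNthGroup(1) + VMEMSize), MFMA(1),
// VMEM_READ(4, SharesPredWithPrevNthGroup(3) + VMEMSize), MFMA(1).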
1237 for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { 1238 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1239 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 1240 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 1241 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1242 1243 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1244 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 1245 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); 1246 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1247 1248 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1249 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 1250 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 1251 1, TII, SG->getSGID(), true)); 1252 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 1253 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1254 1255 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1256 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1257 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1258 1259 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1260 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 1261 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 1262 3, TII, SG->getSGID(), true)); 1263 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 1264 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1265 1266 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1267 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1268 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1269 } 1270 1271 // Phase 2b: Loop carried dependency without V_PERM 1272 // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. 1273 // Interleave MFMA to keep XDL unit busy throughout. 1274 for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { 1275 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1276 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 1277 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1278 1279 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1280 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 1281 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 1282 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1283 1284 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1285 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1286 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1287 } 1288 1289 // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are 1290 // ultimately used by two DS_WRITE 1291 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they 1292 // depend on. Interleave MFMA to keep XDL unit busy throughout. 
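// Per iteration, the groups created below are, in order: two
// VALU(4, IsPermForDSW) / DS_WRITE(1, IsSuccOfPrevGroup) / MFMA(1) triplets,
// then VMEM_READ(4, SharesPredWithPrevNthGroup(2) + VMEMSize), MFMA(1),
// VMEM_READ(4, SharesPredWithPrevNthGroup(4) + VMEMSize), MFMA(1).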
1293 1294 for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) { 1295 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1296 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 1297 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 1298 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1299 1300 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1301 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 1302 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); 1303 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1304 1305 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1306 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1307 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1308 1309 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1310 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); 1311 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); 1312 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1313 1314 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1315 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); 1316 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); 1317 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1318 1319 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1320 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1321 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1322 1323 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1324 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 1325 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 1326 2, TII, SG->getSGID(), true)); 1327 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 1328 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1329 1330 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1331 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1332 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1333 1334 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1335 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); 1336 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( 1337 4, TII, SG->getSGID(), true)); 1338 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); 1339 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1340 1341 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 1342 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); 1343 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); 1344 } 1345 } 1346 1347 static std::unique_ptr<IGLPStrategy> 1348 createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, 1349 const SIInstrInfo *TII) { 1350 switch (ID) { 1351 case MFMASmallGemmOptID: 1352 return std::make_unique<MFMASmallGemmOpt>(DAG, TII); 1353 case MFMASmallGemmSingleWaveOptID: 1354 return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII); 1355 } 1356 1357 llvm_unreachable("Unknown IGLPStrategyID"); 1358 } 1359 1360 class IGroupLPDAGMutation : public ScheduleDAGMutation { 1361 private: 1362 const SIInstrInfo *TII; 1363 1364 ScheduleDAGMI *DAG; 1365 1366 // Organize lists of SchedGroups by their SyncID. SchedGroups / 1367 // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added 1368 // between then. 1369 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; 1370 1371 // Used to track instructions that can be mapped to multiple sched groups 1372 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; 1373 1374 // Add DAG edges that enforce SCHED_BARRIER ordering. 
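// Instructions whose type is masked off by the barrier are linked to the
// SCHED_BARRIER so they keep their position relative to it. As an illustrative
// sketch (assuming the mask encoding used by the clang builtin, e.g.
// __builtin_amdgcn_sched_barrier(0x0001)): a barrier whose mask allows only
// ALU lets ALU instructions cross, while VMEM and DS instructions are fenced
// by the edges added here.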
1375 void addSchedBarrierEdges(SUnit &SU); 1376 1377 // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should 1378 // not be reordered accross the SCHED_BARRIER. This is used for the base 1379 // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that 1380 // SCHED_BARRIER will always block all instructions that can be classified 1381 // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size 1382 // and may only synchronize with some SchedGroups. Returns the inverse of 1383 // Mask. SCHED_BARRIER's mask describes which instruction types should be 1384 // allowed to be scheduled across it. Invert the mask to get the 1385 // SchedGroupMask of instructions that should be barred. 1386 SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const; 1387 1388 // Create SchedGroups for a SCHED_GROUP_BARRIER. 1389 void initSchedGroupBarrierPipelineStage( 1390 std::vector<SUnit>::reverse_iterator RIter); 1391 1392 void initIGLPOpt(SUnit &SU); 1393 1394 public: 1395 void apply(ScheduleDAGInstrs *DAGInstrs) override; 1396 1397 // The order in which the PipelineSolver should process the candidate 1398 // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last 1399 // created SchedGroup first, and will consider that as the ultimate 1400 // predecessor group when linking. TOP_DOWN instead links and processes the 1401 // first created SchedGroup first. 1402 bool IsBottomUp = 1; 1403 1404 IGroupLPDAGMutation() = default; 1405 }; 1406 1407 unsigned SchedGroup::NumSchedGroups = 0; 1408 1409 bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { 1410 if (A != B && DAG->canAddEdge(B, A)) { 1411 DAG->addEdge(B, SDep(A, SDep::Artificial)); 1412 return true; 1413 } 1414 return false; 1415 } 1416 1417 bool SchedGroup::canAddMI(const MachineInstr &MI) const { 1418 bool Result = false; 1419 if (MI.isMetaInstruction()) 1420 Result = false; 1421 1422 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && 1423 (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI))) 1424 Result = true; 1425 1426 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && 1427 TII->isVALU(MI) && !TII->isMFMAorWMMA(MI)) 1428 Result = true; 1429 1430 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && 1431 TII->isSALU(MI)) 1432 Result = true; 1433 1434 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && 1435 TII->isMFMAorWMMA(MI)) 1436 Result = true; 1437 1438 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) && 1439 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 1440 Result = true; 1441 1442 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) && 1443 MI.mayLoad() && 1444 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 1445 Result = true; 1446 1447 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) && 1448 MI.mayStore() && 1449 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) 1450 Result = true; 1451 1452 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) && 1453 TII->isDS(MI)) 1454 Result = true; 1455 1456 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) && 1457 MI.mayLoad() && TII->isDS(MI)) 1458 Result = true; 1459 1460 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) && 1461 MI.mayStore() && TII->isDS(MI)) 1462 Result = true; 1463 1464 LLVM_DEBUG( 1465 dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) 1466 << (Result ? 
" could classify " : " unable to classify ") << MI); 1467 1468 return Result; 1469 } 1470 1471 int SchedGroup::link(SUnit &SU, bool MakePred, 1472 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { 1473 int MissedEdges = 0; 1474 for (auto *A : Collection) { 1475 SUnit *B = &SU; 1476 if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 1477 continue; 1478 if (MakePred) 1479 std::swap(A, B); 1480 1481 if (DAG->IsReachable(B, A)) 1482 continue; 1483 1484 // tryAddEdge returns false if there is a dependency that makes adding 1485 // the A->B edge impossible, otherwise it returns true; 1486 bool Added = tryAddEdge(A, B); 1487 if (Added) 1488 AddedEdges.push_back(std::pair(A, B)); 1489 else 1490 ++MissedEdges; 1491 } 1492 1493 return MissedEdges; 1494 } 1495 1496 void SchedGroup::link(SUnit &SU, bool MakePred) { 1497 for (auto *A : Collection) { 1498 SUnit *B = &SU; 1499 if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) 1500 continue; 1501 if (MakePred) 1502 std::swap(A, B); 1503 1504 tryAddEdge(A, B); 1505 } 1506 } 1507 1508 void SchedGroup::link(SUnit &SU, 1509 function_ref<bool(const SUnit *A, const SUnit *B)> P) { 1510 for (auto *A : Collection) { 1511 SUnit *B = &SU; 1512 if (P(A, B)) 1513 std::swap(A, B); 1514 1515 tryAddEdge(A, B); 1516 } 1517 } 1518 1519 void SchedGroup::link(SchedGroup &OtherGroup) { 1520 for (auto *B : OtherGroup.Collection) 1521 link(*B); 1522 } 1523 1524 bool SchedGroup::canAddSU(SUnit &SU) const { 1525 MachineInstr &MI = *SU.getInstr(); 1526 if (MI.getOpcode() != TargetOpcode::BUNDLE) 1527 return canAddMI(MI); 1528 1529 // Special case for bundled MIs. 1530 const MachineBasicBlock *MBB = MI.getParent(); 1531 MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; 1532 while (E != MBB->end() && E->isBundledWithPred()) 1533 ++E; 1534 1535 // Return true if all of the bundled MIs can be added to this group. 
1536 return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); }); 1537 } 1538 1539 void SchedGroup::initSchedGroup() { 1540 for (auto &SU : DAG->SUnits) { 1541 if (isFull()) 1542 break; 1543 1544 if (canAddSU(SU)) 1545 add(SU); 1546 } 1547 } 1548 1549 void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, 1550 SUnitsToCandidateSGsMap &SyncedInstrs) { 1551 SUnit &InitSU = *RIter; 1552 for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { 1553 auto &SU = *RIter; 1554 if (isFull()) 1555 break; 1556 1557 if (canAddSU(SU)) 1558 SyncedInstrs[&SU].push_back(SGID); 1559 } 1560 1561 add(InitSU); 1562 assert(MaxSize); 1563 (*MaxSize)++; 1564 } 1565 1566 void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { 1567 auto I = DAG->SUnits.rbegin(); 1568 auto E = DAG->SUnits.rend(); 1569 for (; I != E; ++I) { 1570 auto &SU = *I; 1571 if (isFull()) 1572 break; 1573 1574 if (canAddSU(SU)) 1575 SyncedInstrs[&SU].push_back(SGID); 1576 } 1577 } 1578 1579 void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { 1580 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); 1581 if (!TSchedModel || DAGInstrs->SUnits.empty()) 1582 return; 1583 1584 LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); 1585 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); 1586 TII = ST.getInstrInfo(); 1587 DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); 1588 SyncedSchedGroups.clear(); 1589 SyncedInstrs.clear(); 1590 bool foundSB = false; 1591 bool foundIGLP = false; 1592 for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { 1593 unsigned Opc = R->getInstr()->getOpcode(); 1594 // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive. 1595 if (Opc == AMDGPU::SCHED_BARRIER) { 1596 addSchedBarrierEdges(*R); 1597 foundSB = true; 1598 } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) { 1599 initSchedGroupBarrierPipelineStage(R); 1600 foundSB = true; 1601 } else if (Opc == AMDGPU::IGLP_OPT) { 1602 resetEdges(*R, DAG); 1603 if (!foundSB && !foundIGLP) 1604 initIGLPOpt(*R); 1605 foundIGLP = true; 1606 } 1607 } 1608 1609 if (foundSB || foundIGLP) { 1610 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp); 1611 // PipelineSolver performs the mutation by adding the edges it 1612 // determined as the best 1613 PS.solve(); 1614 } 1615 } 1616 1617 void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { 1618 MachineInstr &MI = *SchedBarrier.getInstr(); 1619 assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); 1620 // Remove all existing edges from the SCHED_BARRIER that were added due to the 1621 // instruction having side effects. 1622 resetEdges(SchedBarrier, DAG); 1623 auto InvertedMask = 1624 invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm()); 1625 SchedGroup SG(InvertedMask, std::nullopt, DAG, TII); 1626 SG.initSchedGroup(); 1627 // Preserve original instruction ordering relative to the SCHED_BARRIER. 1628 SG.link( 1629 SchedBarrier, 1630 (function_ref<bool(const SUnit *A, const SUnit *B)>)[]( 1631 const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; }); 1632 } 1633 1634 SchedGroupMask 1635 IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { 1636 // Invert mask and erase bits for types of instructions that are implied to be 1637 // allowed past the SCHED_BARRIER. 1638 SchedGroupMask InvertedMask = ~Mask; 1639 1640 // ALU implies VALU, SALU, MFMA. 
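// E.g. if the barrier's mask has the ALU bit set (ALU may cross), then ~Mask
// would still have VALU, SALU and MFMA set; clear those bits from the barred
// set below so the subclasses implied by ALU may cross as well.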
1641 if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) 1642 InvertedMask &= 1643 ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA; 1644 // VALU, SALU, MFMA implies ALU. 1645 else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE || 1646 (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE || 1647 (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE) 1648 InvertedMask &= ~SchedGroupMask::ALU; 1649 1650 // VMEM implies VMEM_READ, VMEM_WRITE. 1651 if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE) 1652 InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE; 1653 // VMEM_READ, VMEM_WRITE implies VMEM. 1654 else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE || 1655 (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE) 1656 InvertedMask &= ~SchedGroupMask::VMEM; 1657 1658 // DS implies DS_READ, DS_WRITE. 1659 if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE) 1660 InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE; 1661 // DS_READ, DS_WRITE implies DS. 1662 else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE || 1663 (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE) 1664 InvertedMask &= ~SchedGroupMask::DS; 1665 1666 return InvertedMask; 1667 } 1668 1669 void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( 1670 std::vector<SUnit>::reverse_iterator RIter) { 1671 // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due 1672 // to the instruction having side effects. 1673 resetEdges(*RIter, DAG); 1674 MachineInstr &SGB = *RIter->getInstr(); 1675 assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); 1676 int32_t SGMask = SGB.getOperand(0).getImm(); 1677 int32_t Size = SGB.getOperand(1).getImm(); 1678 int32_t SyncID = SGB.getOperand(2).getImm(); 1679 1680 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask, 1681 Size, SyncID, DAG, TII); 1682 1683 SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]); 1684 } 1685 1686 void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { 1687 IGLPStrategyID StrategyID = 1688 (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); 1689 auto S = createIGLPStrategy(StrategyID, DAG, TII); 1690 if (S->shouldApplyStrategy(DAG)) { 1691 IsBottomUp = S->IsBottomUp; 1692 S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups); 1693 } 1694 } 1695 1696 } // namespace 1697 1698 namespace llvm { 1699 1700 std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() { 1701 return std::make_unique<IGroupLPDAGMutation>(); 1702 } 1703 1704 } // end namespace llvm 1705
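// Note: a ScheduleDAGMutation has no effect until it is attached to a
// scheduler; within the AMDGPU backend the mutation returned by
// createIGroupLPDAGMutation() is typically added via
// ScheduleDAGMI::addMutation() when the machine schedulers are constructed.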