1 //======----------- WindowScheduler.cpp - window scheduler -------------======// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // An implementation of the Window Scheduling software pipelining algorithm. 10 // 11 // The fundamental concept of the window scheduling algorithm involves folding 12 // the original MBB at a specific position, followed by list scheduling on the 13 // folded MIs. The optimal scheduling result is then chosen from various folding 14 // positions as the final scheduling outcome. 15 // 16 // The primary challenge in this algorithm lies in generating the folded MIs and 17 // establishing their dependencies. We have innovatively employed a new MBB, 18 // created by copying the original MBB three times, known as TripleMBB. This 19 // TripleMBB enables the convenient implementation of MI folding and dependency 20 // establishment. To facilitate the algorithm's implementation, we have also 21 // devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle. 22 // 23 // Another challenge in the algorithm is the scheduling of phis. Semantically, 24 // it is difficult to place the phis in the window and perform list scheduling. 25 // Therefore, we schedule these phis separately after each list scheduling. 26 // 27 // The provided implementation is designed for use before the Register Allocator 28 // (RA). If the target requires implementation after RA, it is recommended to 29 // reimplement analyseII(), schedulePhi(), and expand(). Additionally, 30 // target-specific logic can be added in initialize(), preProcess(), and 31 // postProcess(). 32 // 33 // Lastly, it is worth mentioning that getSearchIndexes() is an important 34 // function. We have experimented with more complex heuristics on downstream 35 // target and achieved favorable results. 36 // 37 //===----------------------------------------------------------------------===// 38 #include "llvm/CodeGen/WindowScheduler.h" 39 #include "llvm/ADT/Statistic.h" 40 #include "llvm/CodeGen/LiveIntervals.h" 41 #include "llvm/CodeGen/MachineLoopInfo.h" 42 #include "llvm/CodeGen/MachinePipeliner.h" 43 #include "llvm/CodeGen/ModuloSchedule.h" 44 #include "llvm/CodeGen/TargetPassConfig.h" 45 #include "llvm/Support/CommandLine.h" 46 #include "llvm/Support/Debug.h" 47 #include "llvm/Support/TimeProfiler.h" 48 49 using namespace llvm; 50 51 #define DEBUG_TYPE "pipeliner" 52 53 namespace { 54 STATISTIC(NumTryWindowSchedule, 55 "Number of loops that we attempt to use window scheduling"); 56 STATISTIC(NumTryWindowSearch, 57 "Number of times that we run list schedule in the window scheduling"); 58 STATISTIC(NumWindowSchedule, 59 "Number of loops that we successfully use window scheduling"); 60 STATISTIC(NumFailAnalyseII, 61 "Window scheduling abort due to the failure of the II analysis"); 62 63 cl::opt<unsigned> 64 WindowSearchNum("window-search-num", 65 cl::desc("The number of searches per loop in the window " 66 "algorithm. 0 means no search number limit."), 67 cl::Hidden, cl::init(6)); 68 69 cl::opt<unsigned> WindowSearchRatio( 70 "window-search-ratio", 71 cl::desc("The ratio of searches per loop in the window algorithm. 100 " 72 "means search all positions in the loop, while 0 means not " 73 "performing any search."), 74 cl::Hidden, cl::init(40)); 75 76 cl::opt<unsigned> WindowIICoeff( 77 "window-ii-coeff", 78 cl::desc( 79 "The coefficient used when initializing II in the window algorithm."), 80 cl::Hidden, cl::init(5)); 81 82 cl::opt<unsigned> WindowRegionLimit( 83 "window-region-limit", 84 cl::desc( 85 "The lower limit of the scheduling region in the window algorithm."), 86 cl::Hidden, cl::init(3)); 87 88 cl::opt<unsigned> WindowDiffLimit( 89 "window-diff-limit", 90 cl::desc("The lower limit of the difference between best II and base II in " 91 "the window algorithm. If the difference is smaller than " 92 "this lower limit, window scheduling will not be performed."), 93 cl::Hidden, cl::init(2)); 94 } // namespace 95 96 // WindowIILimit serves as an indicator of abnormal scheduling results and could 97 // potentially be referenced by the derived target window scheduler. 98 cl::opt<unsigned> 99 WindowIILimit("window-ii-limit", 100 cl::desc("The upper limit of II in the window algorithm."), 101 cl::Hidden, cl::init(1000)); 102 103 WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML) 104 : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML), 105 Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()), 106 TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) { 107 TripleDAG = std::unique_ptr<ScheduleDAGInstrs>( 108 createMachineScheduler(/*OnlyBuildGraph=*/true)); 109 } 110 111 bool WindowScheduler::run() { 112 if (!initialize()) { 113 LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n"); 114 return false; 115 } 116 // The window algorithm is time-consuming, and its compilation time should be 117 // taken into consideration. 118 TimeTraceScope Scope("WindowSearch"); 119 ++NumTryWindowSchedule; 120 // Performing the relevant processing before window scheduling. 121 preProcess(); 122 // The main window scheduling begins. 123 std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler()); 124 auto SearchIndexes = getSearchIndexes(WindowSearchNum, WindowSearchRatio); 125 for (unsigned Idx : SearchIndexes) { 126 OriToCycle.clear(); 127 ++NumTryWindowSearch; 128 // The scheduling starts with non-phi instruction, so SchedPhiNum needs to 129 // be added to Idx. 130 unsigned Offset = Idx + SchedPhiNum; 131 auto Range = getScheduleRange(Offset, SchedInstrNum); 132 SchedDAG->startBlock(MBB); 133 SchedDAG->enterRegion(MBB, Range.begin(), Range.end(), SchedInstrNum); 134 SchedDAG->schedule(); 135 LLVM_DEBUG(SchedDAG->dump()); 136 unsigned II = analyseII(*SchedDAG, Offset); 137 if (II == WindowIILimit) { 138 restoreTripleMBB(); 139 LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n"); 140 ++NumFailAnalyseII; 141 continue; 142 } 143 schedulePhi(Offset, II); 144 updateScheduleResult(Offset, II); 145 restoreTripleMBB(); 146 LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is " 147 << II << ".\n"); 148 } 149 // Performing the relevant processing after window scheduling. 150 postProcess(); 151 // Check whether the scheduling result is valid. 152 if (!isScheduleValid()) { 153 LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n"); 154 return false; 155 } 156 LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset 157 << " and Best II is " << BestII << ".\n"); 158 // Expand the scheduling result to prologue, kernel, and epilogue. 159 expand(); 160 ++NumWindowSchedule; 161 return true; 162 } 163 164 ScheduleDAGInstrs * 165 WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) { 166 return OnlyBuildGraph 167 ? new ScheduleDAGMI( 168 Context, std::make_unique<PostGenericScheduler>(Context), 169 true) 170 : Context->PassConfig->createMachineScheduler(Context); 171 } 172 173 bool WindowScheduler::initialize() { 174 if (!Subtarget->enableWindowScheduler()) { 175 LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n"); 176 return false; 177 } 178 // Initialized the member variables used by window algorithm. 179 OriMIs.clear(); 180 TriMIs.clear(); 181 TriToOri.clear(); 182 OriToCycle.clear(); 183 SchedResult.clear(); 184 SchedPhiNum = 0; 185 SchedInstrNum = 0; 186 BestII = UINT_MAX; 187 BestOffset = 0; 188 BaseII = 0; 189 // List scheduling used in the window algorithm depends on LiveIntervals. 190 if (!Context->LIS) { 191 LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n"); 192 return false; 193 } 194 // Check each MI in MBB. 195 SmallSet<Register, 8> PrevDefs; 196 SmallSet<Register, 8> PrevUses; 197 auto IsLoopCarried = [&](MachineInstr &Phi) { 198 // Two cases are checked here: (1)The virtual register defined by the 199 // preceding phi is used by the succeeding phi;(2)The preceding phi uses the 200 // virtual register defined by the succeeding phi. 201 if (PrevUses.count(Phi.getOperand(0).getReg())) 202 return true; 203 PrevDefs.insert(Phi.getOperand(0).getReg()); 204 for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { 205 if (PrevDefs.count(Phi.getOperand(I).getReg())) 206 return true; 207 PrevUses.insert(Phi.getOperand(I).getReg()); 208 } 209 return false; 210 }; 211 auto PLI = TII->analyzeLoopForPipelining(MBB); 212 for (auto &MI : *MBB) { 213 if (MI.isMetaInstruction() || MI.isTerminator()) 214 continue; 215 if (MI.isPHI()) { 216 if (IsLoopCarried(MI)) { 217 LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n"); 218 return false; 219 } 220 ++SchedPhiNum; 221 ++BestOffset; 222 } else 223 ++SchedInstrNum; 224 if (TII->isSchedulingBoundary(MI, MBB, *MF)) { 225 LLVM_DEBUG( 226 dbgs() << "Boundary MI is not allowed in window scheduling!\n"); 227 return false; 228 } 229 if (PLI->shouldIgnoreForPipelining(&MI)) { 230 LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in " 231 "window scheduling!\n"); 232 return false; 233 } 234 for (auto &Def : MI.all_defs()) 235 if (Def.isReg() && Def.getReg().isPhysical()) { 236 LLVM_DEBUG(dbgs() << "Physical registers are not supported in " 237 "window scheduling!\n"); 238 return false; 239 } 240 } 241 if (SchedInstrNum <= WindowRegionLimit) { 242 LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n"); 243 return false; 244 } 245 return true; 246 } 247 248 void WindowScheduler::preProcess() { 249 // Prior to window scheduling, it's necessary to backup the original MBB, 250 // generate a new TripleMBB, and build a TripleDAG based on the TripleMBB. 251 backupMBB(); 252 generateTripleMBB(); 253 TripleDAG->startBlock(MBB); 254 TripleDAG->enterRegion( 255 MBB, MBB->begin(), MBB->getFirstTerminator(), 256 std::distance(MBB->begin(), MBB->getFirstTerminator())); 257 TripleDAG->buildSchedGraph(Context->AA); 258 } 259 260 void WindowScheduler::postProcess() { 261 // After window scheduling, it's necessary to clear the TripleDAG and restore 262 // to the original MBB. 263 TripleDAG->exitRegion(); 264 TripleDAG->finishBlock(); 265 restoreMBB(); 266 } 267 268 void WindowScheduler::backupMBB() { 269 for (auto &MI : MBB->instrs()) 270 OriMIs.push_back(&MI); 271 // Remove MIs and the corresponding live intervals. 272 for (auto &MI : make_early_inc_range(*MBB)) { 273 Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true); 274 MBB->remove(&MI); 275 } 276 } 277 278 void WindowScheduler::restoreMBB() { 279 // Erase MIs and the corresponding live intervals. 280 for (auto &MI : make_early_inc_range(*MBB)) { 281 Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true); 282 MI.eraseFromParent(); 283 } 284 // Restore MBB to the state before window scheduling. 285 for (auto *MI : OriMIs) 286 MBB->push_back(MI); 287 updateLiveIntervals(); 288 } 289 290 void WindowScheduler::generateTripleMBB() { 291 const unsigned DuplicateNum = 3; 292 TriMIs.clear(); 293 TriToOri.clear(); 294 assert(OriMIs.size() > 0 && "The Original MIs were not backed up!"); 295 // Step 1: Performing the first copy of MBB instructions, excluding 296 // terminators. At the same time, we back up the anti-register of phis. 297 // DefPairs hold the old and new define register pairs. 298 DenseMap<Register, Register> DefPairs; 299 for (auto *MI : OriMIs) { 300 if (MI->isMetaInstruction() || MI->isTerminator()) 301 continue; 302 if (MI->isPHI()) 303 if (Register AntiReg = getAntiRegister(MI)) 304 DefPairs[MI->getOperand(0).getReg()] = AntiReg; 305 auto *NewMI = MF->CloneMachineInstr(MI); 306 MBB->push_back(NewMI); 307 TriMIs.push_back(NewMI); 308 TriToOri[NewMI] = MI; 309 } 310 // Step 2: Performing the remaining two copies of MBB instructions excluding 311 // phis, and the last one contains terminators. At the same time, registers 312 // are updated accordingly. 313 for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) { 314 for (auto *MI : OriMIs) { 315 if (MI->isPHI() || MI->isMetaInstruction() || 316 (MI->isTerminator() && Cnt < DuplicateNum - 1)) 317 continue; 318 auto *NewMI = MF->CloneMachineInstr(MI); 319 DenseMap<Register, Register> NewDefs; 320 // New defines are updated. 321 for (auto MO : NewMI->all_defs()) 322 if (MO.isReg() && MO.getReg().isVirtual()) { 323 Register NewDef = 324 MRI->createVirtualRegister(MRI->getRegClass(MO.getReg())); 325 NewMI->substituteRegister(MO.getReg(), NewDef, 0, *TRI); 326 NewDefs[MO.getReg()] = NewDef; 327 } 328 // New uses are updated. 329 for (auto DefRegPair : DefPairs) 330 if (NewMI->readsRegister(DefRegPair.first, TRI)) { 331 Register NewUse = DefRegPair.second; 332 // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3': 333 // 334 // BB.3: DefPairs 335 // ================================== 336 // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] (%1,%7) 337 // ... 338 // ================================== 339 // ... 340 // %4 = sub i32 %1, %3 341 // ... 342 // %7 = add i32 %5, %6 343 // ... 344 // ---------------------------------- 345 // ... 346 // %8 = sub i32 %7, %3 (%1,%7),(%4,%8) 347 // ... 348 // %9 = add i32 %5, %6 (%1,%7),(%4,%8),(%7,%9) 349 // ... 350 // ---------------------------------- 351 // ... 352 // %10 = sub i32 %9, %3 (%1,%7),(%4,%10),(%7,%9) 353 // ... ^ 354 // %11 = add i32 %5, %6 (%1,%7),(%4,%10),(%7,%11) 355 // ... 356 // ================================== 357 // < Terminators > 358 // ================================== 359 if (DefPairs.count(NewUse)) 360 NewUse = DefPairs[NewUse]; 361 NewMI->substituteRegister(DefRegPair.first, NewUse, 0, *TRI); 362 } 363 // DefPairs is updated at last. 364 for (auto &NewDef : NewDefs) 365 DefPairs[NewDef.first] = NewDef.second; 366 MBB->push_back(NewMI); 367 TriMIs.push_back(NewMI); 368 TriToOri[NewMI] = MI; 369 } 370 } 371 // Step 3: The registers used by phis are updated, and they are generated in 372 // the third copy of MBB. 373 // In the privious example, the old phi is: 374 // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] 375 // The new phi is: 376 // %1 = phi i32 [%2, %BB.1], [%11, %BB.3] 377 for (auto &Phi : MBB->phis()) { 378 for (auto DefRegPair : DefPairs) 379 if (Phi.readsRegister(DefRegPair.first, TRI)) 380 Phi.substituteRegister(DefRegPair.first, DefRegPair.second, 0, *TRI); 381 } 382 updateLiveIntervals(); 383 } 384 385 void WindowScheduler::restoreTripleMBB() { 386 // After list scheduling, the MBB is restored in one traversal. 387 for (size_t I = 0; I < TriMIs.size(); ++I) { 388 auto *MI = TriMIs[I]; 389 auto OldPos = MBB->begin(); 390 std::advance(OldPos, I); 391 auto CurPos = MI->getIterator(); 392 if (CurPos != OldPos) { 393 MBB->splice(OldPos, MBB, CurPos); 394 Context->LIS->handleMove(*MI, /*UpdateFlags=*/false); 395 } 396 } 397 } 398 399 SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum, 400 unsigned SearchRatio) { 401 // We use SearchRatio to get the index range, and then evenly get the indexes 402 // according to the SearchNum. This is a simple huristic. Depending on the 403 // characteristics of the target, more complex algorithms can be used for both 404 // performance and compilation time. 405 assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!"); 406 unsigned MaxIdx = SchedInstrNum * SearchRatio / 100; 407 unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? MaxIdx / SearchNum : 1; 408 SmallVector<unsigned> SearchIndexes; 409 for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step) 410 SearchIndexes.push_back(Idx); 411 return SearchIndexes; 412 } 413 414 int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) { 415 // Sometimes MaxDepth is 0, so it should be limited to the minimum of 1. 416 unsigned MaxDepth = 1; 417 for (auto &SU : DAG.SUnits) 418 MaxDepth = std::max(SU.getDepth() + SU.Latency, MaxDepth); 419 return MaxDepth * WindowIICoeff; 420 } 421 422 int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG, 423 unsigned Offset) { 424 int InitII = getEstimatedII(DAG); 425 ResourceManager RM(Subtarget, &DAG); 426 RM.init(InitII); 427 // ResourceManager and DAG are used to calculate the maximum cycle for the 428 // scheduled MIs. Since MIs in the Region have already been scheduled, the 429 // emit cycles can be estimated in order here. 430 int CurCycle = 0; 431 auto Range = getScheduleRange(Offset, SchedInstrNum); 432 for (auto &MI : Range) { 433 auto *SU = DAG.getSUnit(&MI); 434 int ExpectCycle = CurCycle; 435 // The predecessors of current MI determine its earliest issue cycle. 436 for (auto &Pred : SU->Preds) { 437 if (Pred.isWeak()) 438 continue; 439 auto *PredMI = Pred.getSUnit()->getInstr(); 440 int PredCycle = getOriCycle(PredMI); 441 ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency()); 442 } 443 // Zero cost instructions do not need to check resource. 444 if (!TII->isZeroCost(MI.getOpcode())) { 445 // ResourceManager can be used to detect resource conflicts between the 446 // current MI and the previously inserted MIs. 447 while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) { 448 ++CurCycle; 449 if (CurCycle == (int)WindowIILimit) 450 return CurCycle; 451 } 452 RM.reserveResources(*SU, CurCycle); 453 } 454 OriToCycle[getOriMI(&MI)] = CurCycle; 455 LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S." 456 << getOriStage(getOriMI(&MI), Offset) << "]: " << MI); 457 } 458 LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n"); 459 return CurCycle; 460 } 461 462 // By utilizing TripleDAG, we can easily establish dependencies between A and B. 463 // Based on the MaxCycle and the issue cycle of A and B, we can determine 464 // whether it is necessary to add a stall cycle. This is because, without 465 // inserting the stall cycle, the latency constraint between A and B cannot be 466 // satisfied. The details are as follows: 467 // 468 // New MBB: 469 // ======================================== 470 // < Phis > 471 // ======================================== (sliding direction) 472 // MBB copy 1 | 473 // V 474 // 475 // ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~ ----schedule window----- 476 // | | 477 // ===================V==================== | 478 // MBB copy 2 < MI B > | 479 // | 480 // < MI A > V 481 // ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~ ------------------------ 482 // : 483 // ===================V==================== 484 // MBB copy 3 < MI B'> 485 // 486 // 487 // 488 // 489 // ======================================== 490 // < Terminators > 491 // ======================================== 492 int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) { 493 int MaxStallCycle = 0; 494 int CurrentII = MaxCycle + 1; 495 auto Range = getScheduleRange(Offset, SchedInstrNum); 496 for (auto &MI : Range) { 497 auto *SU = TripleDAG->getSUnit(&MI); 498 int DefCycle = getOriCycle(&MI); 499 for (auto &Succ : SU->Succs) { 500 if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU) 501 continue; 502 // If the expected cycle does not exceed CurrentII, no check is needed. 503 if (DefCycle + (int)Succ.getLatency() <= CurrentII) 504 continue; 505 // If the cycle of the scheduled MI A is less than that of the scheduled 506 // MI B, the scheduling will fail because the lifetime of the 507 // corresponding register exceeds II. 508 auto *SuccMI = Succ.getSUnit()->getInstr(); 509 int UseCycle = getOriCycle(SuccMI); 510 if (DefCycle < UseCycle) 511 return WindowIILimit; 512 // Get the stall cycle introduced by the register between two trips. 513 int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle; 514 MaxStallCycle = std::max(MaxStallCycle, StallCycle); 515 } 516 } 517 LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n"); 518 return MaxStallCycle; 519 } 520 521 unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) { 522 LLVM_DEBUG(dbgs() << "Start analyzing II:\n"); 523 int MaxCycle = calculateMaxCycle(DAG, Offset); 524 if (MaxCycle == (int)WindowIILimit) 525 return MaxCycle; 526 int StallCycle = calculateStallCycle(Offset, MaxCycle); 527 if (StallCycle == (int)WindowIILimit) 528 return StallCycle; 529 // The value of II is equal to the maximum execution cycle plus 1. 530 return MaxCycle + StallCycle + 1; 531 } 532 533 void WindowScheduler::schedulePhi(int Offset, unsigned &II) { 534 LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n"); 535 for (auto &Phi : MBB->phis()) { 536 int LateCycle = INT_MAX; 537 auto *SU = TripleDAG->getSUnit(&Phi); 538 for (auto &Succ : SU->Succs) { 539 // Phi doesn't have any Anti successors. 540 if (Succ.getKind() != SDep::Data) 541 continue; 542 // Phi is scheduled before the successor of stage 0. The issue cycle of 543 // phi is the latest cycle in this interval. 544 auto *SuccMI = Succ.getSUnit()->getInstr(); 545 int Cycle = getOriCycle(SuccMI); 546 if (getOriStage(getOriMI(SuccMI), Offset) == 0) 547 LateCycle = std::min(LateCycle, Cycle); 548 } 549 // The anti-dependency of phi need to be handled separately in the same way. 550 if (Register AntiReg = getAntiRegister(&Phi)) { 551 auto *AntiMI = MRI->getVRegDef(AntiReg); 552 // AntiReg may be defined outside the kernel MBB. 553 if (AntiMI->getParent() == MBB) { 554 auto AntiCycle = getOriCycle(AntiMI); 555 if (getOriStage(getOriMI(AntiMI), Offset) == 0) 556 LateCycle = std::min(LateCycle, AntiCycle); 557 } 558 } 559 // If there is no limit to the late cycle, a default value is given. 560 if (LateCycle == INT_MAX) 561 LateCycle = (int)(II - 1); 562 LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi); 563 // The issue cycle of phi is set to the latest cycle in the interval. 564 auto *OriPhi = getOriMI(&Phi); 565 OriToCycle[OriPhi] = LateCycle; 566 } 567 } 568 569 DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset, 570 unsigned II) { 571 // At each issue cycle, phi is placed before MIs in stage 0. So the simplest 572 // way is to put phi at the beginning of the current cycle. 573 DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs; 574 auto Range = getScheduleRange(Offset, SchedInstrNum); 575 for (auto &Phi : MBB->phis()) 576 CycleToMIs[getOriCycle(&Phi)].push_back(getOriMI(&Phi)); 577 for (auto &MI : Range) 578 CycleToMIs[getOriCycle(&MI)].push_back(getOriMI(&MI)); 579 // Each MI is assigned a separate ordered Id, which is used as a sort marker 580 // in the following expand process. 581 DenseMap<MachineInstr *, int> IssueOrder; 582 int Id = 0; 583 for (int Cycle = 0; Cycle < (int)II; ++Cycle) { 584 if (!CycleToMIs.count(Cycle)) 585 continue; 586 for (auto *MI : CycleToMIs[Cycle]) 587 IssueOrder[MI] = Id++; 588 } 589 return IssueOrder; 590 } 591 592 void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) { 593 // At the first update, Offset is equal to SchedPhiNum. At this time, only 594 // BestII, BestOffset, and BaseII need to be updated. 595 if (Offset == SchedPhiNum) { 596 BestII = II; 597 BestOffset = SchedPhiNum; 598 BaseII = II; 599 return; 600 } 601 // The update will only continue if the II is smaller than BestII and the II 602 // is sufficiently small. 603 if ((II >= BestII) || (II + WindowDiffLimit > BaseII)) 604 return; 605 BestII = II; 606 BestOffset = Offset; 607 // Record the result of the current list scheduling, noting that each MI is 608 // stored unordered in SchedResult. 609 SchedResult.clear(); 610 auto IssueOrder = getIssueOrder(Offset, II); 611 for (auto &Pair : OriToCycle) { 612 assert(IssueOrder.count(Pair.first) && "Cannot find original MI!"); 613 SchedResult.push_back(std::make_tuple(Pair.first, Pair.second, 614 getOriStage(Pair.first, Offset), 615 IssueOrder[Pair.first])); 616 } 617 } 618 619 void WindowScheduler::expand() { 620 // The MIs in the SchedResult are sorted by the issue order ID. 621 llvm::stable_sort(SchedResult, 622 [](const std::tuple<MachineInstr *, int, int, int> &A, 623 const std::tuple<MachineInstr *, int, int, int> &B) { 624 return std::get<3>(A) < std::get<3>(B); 625 }); 626 // Use the scheduling infrastructure for expansion, noting that InstrChanges 627 // is not supported here. 628 DenseMap<MachineInstr *, int> Cycles, Stages; 629 std::vector<MachineInstr *> OrderedInsts; 630 for (auto &Info : SchedResult) { 631 auto *MI = std::get<0>(Info); 632 OrderedInsts.push_back(MI); 633 Cycles[MI] = std::get<1>(Info); 634 Stages[MI] = std::get<2>(Info); 635 LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI] 636 << "]: " << *MI); 637 } 638 ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles), 639 std::move(Stages)); 640 ModuloScheduleExpander MSE(*MF, MS, *Context->LIS, 641 ModuloScheduleExpander::InstrChangesTy()); 642 MSE.expand(); 643 MSE.cleanup(); 644 } 645 646 void WindowScheduler::updateLiveIntervals() { 647 SmallVector<Register, 128> UsedRegs; 648 for (MachineInstr &MI : *MBB) 649 for (const MachineOperand &MO : MI.operands()) { 650 if (!MO.isReg() || MO.getReg() == 0) 651 continue; 652 Register Reg = MO.getReg(); 653 if (!is_contained(UsedRegs, Reg)) 654 UsedRegs.push_back(Reg); 655 } 656 Context->LIS->repairIntervalsInRange(MBB, MBB->begin(), MBB->end(), UsedRegs); 657 } 658 659 iterator_range<MachineBasicBlock::iterator> 660 WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) { 661 auto RegionBegin = MBB->begin(); 662 std::advance(RegionBegin, Offset); 663 auto RegionEnd = RegionBegin; 664 std::advance(RegionEnd, Num); 665 return make_range(RegionBegin, RegionEnd); 666 } 667 668 int WindowScheduler::getOriCycle(MachineInstr *NewMI) { 669 assert(TriToOri.count(NewMI) && "Cannot find original MI!"); 670 auto *OriMI = TriToOri[NewMI]; 671 assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!"); 672 return OriToCycle[OriMI]; 673 } 674 675 MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) { 676 assert(TriToOri.count(NewMI) && "Cannot find original MI!"); 677 return TriToOri[NewMI]; 678 } 679 680 unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) { 681 assert(llvm::find(OriMIs, OriMI) != OriMIs.end() && 682 "Cannot find OriMI in OriMIs!"); 683 // If there is no instruction fold, all MI stages are 0. 684 if (Offset == SchedPhiNum) 685 return 0; 686 // For those MIs with an ID less than the Offset, their stages are set to 0, 687 // while the rest are set to 1. 688 unsigned Id = 0; 689 for (auto *MI : OriMIs) { 690 if (MI->isMetaInstruction()) 691 continue; 692 if (MI == OriMI) 693 break; 694 ++Id; 695 } 696 return Id >= (size_t)Offset ? 1 : 0; 697 } 698 699 Register WindowScheduler::getAntiRegister(MachineInstr *Phi) { 700 assert(Phi->isPHI() && "Expecting PHI!"); 701 Register AntiReg; 702 for (auto MO : Phi->uses()) { 703 if (MO.isReg()) 704 AntiReg = MO.getReg(); 705 else if (MO.isMBB() && MO.getMBB() == MBB) 706 return AntiReg; 707 } 708 return 0; 709 } 710