1 //===- MachinePipeliner.h - Machine Software Pipeliner Pass -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner. 10 // 11 // Software pipelining (SWP) is an instruction scheduling technique for loops 12 // that overlap loop iterations and exploits ILP via a compiler transformation. 13 // 14 // Swing Modulo Scheduling is an implementation of software pipelining 15 // that generates schedules that are near optimal in terms of initiation 16 // interval, register requirements, and stage count. See the papers: 17 // 18 // "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa, 19 // A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Proceedings of the 1996 20 // Conference on Parallel Architectures and Compilation Techiniques. 21 // 22 // "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J. 23 // Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE 24 // Transactions on Computers, Vol. 50, No. 3, 2001. 25 // 26 // "An Implementation of Swing Modulo Scheduling With Extensions for 27 // Superblocks", by T. Lattner, Master's Thesis, University of Illinois at 28 // Urbana-Champaign, 2005. 29 // 30 // 31 // The SMS algorithm consists of three main steps after computing the minimal 32 // initiation interval (MII). 33 // 1) Analyze the dependence graph and compute information about each 34 // instruction in the graph. 35 // 2) Order the nodes (instructions) by priority based upon the heuristics 36 // described in the algorithm. 37 // 3) Attempt to schedule the nodes in the specified order using the MII. 38 // 39 //===----------------------------------------------------------------------===// 40 #ifndef LLVM_CODEGEN_MACHINEPIPELINER_H 41 #define LLVM_CODEGEN_MACHINEPIPELINER_H 42 43 #include "llvm/ADT/STLExtras.h" 44 #include "llvm/ADT/SetVector.h" 45 #include "llvm/CodeGen/DFAPacketizer.h" 46 #include "llvm/CodeGen/MachineDominators.h" 47 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" 48 #include "llvm/CodeGen/MachineScheduler.h" 49 #include "llvm/CodeGen/RegisterClassInfo.h" 50 #include "llvm/CodeGen/ScheduleDAGInstrs.h" 51 #include "llvm/CodeGen/ScheduleDAGMutation.h" 52 #include "llvm/CodeGen/TargetInstrInfo.h" 53 #include "llvm/CodeGen/WindowScheduler.h" 54 #include "llvm/InitializePasses.h" 55 56 #include <deque> 57 58 namespace llvm { 59 60 class AAResults; 61 class NodeSet; 62 class SMSchedule; 63 64 extern cl::opt<bool> SwpEnableCopyToPhi; 65 extern cl::opt<int> SwpForceIssueWidth; 66 67 /// The main class in the implementation of the target independent 68 /// software pipeliner pass. 69 class MachinePipeliner : public MachineFunctionPass { 70 public: 71 MachineFunction *MF = nullptr; 72 MachineOptimizationRemarkEmitter *ORE = nullptr; 73 const MachineLoopInfo *MLI = nullptr; 74 const MachineDominatorTree *MDT = nullptr; 75 const InstrItineraryData *InstrItins = nullptr; 76 const TargetInstrInfo *TII = nullptr; 77 RegisterClassInfo RegClassInfo; 78 bool disabledByPragma = false; 79 unsigned II_setByPragma = 0; 80 81 #ifndef NDEBUG 82 static int NumTries; 83 #endif 84 85 /// Cache the target analysis information about the loop. 86 struct LoopInfo { 87 MachineBasicBlock *TBB = nullptr; 88 MachineBasicBlock *FBB = nullptr; 89 SmallVector<MachineOperand, 4> BrCond; 90 MachineInstr *LoopInductionVar = nullptr; 91 MachineInstr *LoopCompare = nullptr; 92 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopPipelinerInfo = 93 nullptr; 94 }; 95 LoopInfo LI; 96 97 static char ID; 98 MachinePipeliner()99 MachinePipeliner() : MachineFunctionPass(ID) { 100 initializeMachinePipelinerPass(*PassRegistry::getPassRegistry()); 101 } 102 103 bool runOnMachineFunction(MachineFunction &MF) override; 104 105 void getAnalysisUsage(AnalysisUsage &AU) const override; 106 107 private: 108 void preprocessPhiNodes(MachineBasicBlock &B); 109 bool canPipelineLoop(MachineLoop &L); 110 bool scheduleLoop(MachineLoop &L); 111 bool swingModuloScheduler(MachineLoop &L); 112 void setPragmaPipelineOptions(MachineLoop &L); 113 bool runWindowScheduler(MachineLoop &L); 114 bool useSwingModuloScheduler(); 115 bool useWindowScheduler(bool Changed); 116 }; 117 118 /// Represents a dependence between two instruction. 119 class SwingSchedulerDDGEdge { 120 SUnit *Dst = nullptr; 121 SDep Pred; 122 unsigned Distance = 0; 123 bool IsValidationOnly = false; 124 125 public: 126 /// Creates an edge corresponding to an edge represented by \p PredOrSucc and 127 /// \p Dep in the original DAG. This pair has no information about the 128 /// direction of the edge, so we need to pass an additional argument \p 129 /// IsSucc. SwingSchedulerDDGEdge(SUnit * PredOrSucc,const SDep & Dep,bool IsSucc,bool IsValidationOnly)130 SwingSchedulerDDGEdge(SUnit *PredOrSucc, const SDep &Dep, bool IsSucc, 131 bool IsValidationOnly) 132 : Dst(PredOrSucc), Pred(Dep), Distance(0u), 133 IsValidationOnly(IsValidationOnly) { 134 SUnit *Src = Dep.getSUnit(); 135 136 if (IsSucc) { 137 std::swap(Src, Dst); 138 Pred.setSUnit(Src); 139 } 140 141 // An anti-dependence to PHI means loop-carried dependence. 142 if (Pred.getKind() == SDep::Anti && Src->getInstr()->isPHI()) { 143 Distance = 1; 144 std::swap(Src, Dst); 145 auto Reg = Pred.getReg(); 146 Pred = SDep(Src, SDep::Kind::Data, Reg); 147 } 148 } 149 150 /// Returns the SUnit from which the edge comes (source node). getSrc()151 SUnit *getSrc() const { return Pred.getSUnit(); } 152 153 /// Returns the SUnit to which the edge points (destination node). getDst()154 SUnit *getDst() const { return Dst; } 155 156 /// Returns the latency value for the edge. getLatency()157 unsigned getLatency() const { return Pred.getLatency(); } 158 159 /// Sets the latency for the edge. setLatency(unsigned Latency)160 void setLatency(unsigned Latency) { Pred.setLatency(Latency); } 161 162 /// Returns the distance value for the edge. getDistance()163 unsigned getDistance() const { return Distance; } 164 165 /// Sets the distance value for the edge. setDistance(unsigned D)166 void setDistance(unsigned D) { Distance = D; } 167 168 /// Returns the register associated with the edge. getReg()169 Register getReg() const { return Pred.getReg(); } 170 171 /// Returns true if the edge represents anti dependence. isAntiDep()172 bool isAntiDep() const { return Pred.getKind() == SDep::Kind::Anti; } 173 174 /// Returns true if the edge represents output dependence. isOutputDep()175 bool isOutputDep() const { return Pred.getKind() == SDep::Kind::Output; } 176 177 /// Returns true if the edge represents a dependence that is not data, anti or 178 /// output dependence. isOrderDep()179 bool isOrderDep() const { return Pred.getKind() == SDep::Kind::Order; } 180 181 /// Returns true if the edge represents unknown scheduling barrier. isBarrier()182 bool isBarrier() const { return Pred.isBarrier(); } 183 184 /// Returns true if the edge represents an artificial dependence. isArtificial()185 bool isArtificial() const { return Pred.isArtificial(); } 186 187 /// Tests if this is a Data dependence that is associated with a register. isAssignedRegDep()188 bool isAssignedRegDep() const { return Pred.isAssignedRegDep(); } 189 190 /// Returns true for DDG nodes that we ignore when computing the cost 191 /// functions. We ignore the back-edge recurrence in order to avoid unbounded 192 /// recursion in the calculation of the ASAP, ALAP, etc functions. 193 bool ignoreDependence(bool IgnoreAnti) const; 194 195 /// Returns true if this edge is intended to be used only for validating the 196 /// schedule. isValidationOnly()197 bool isValidationOnly() const { return IsValidationOnly; } 198 }; 199 200 /// Represents loop-carried dependencies. Because SwingSchedulerDAG doesn't 201 /// assume cycle dependencies as the name suggests, such dependencies must be 202 /// handled separately. After DAG construction is finished, these dependencies 203 /// are added to SwingSchedulerDDG. 204 /// TODO: Also handle output-dependencies introduced by physical registers. 205 struct LoopCarriedEdges { 206 using OrderDep = SmallSetVector<SUnit *, 8>; 207 using OrderDepsType = DenseMap<SUnit *, OrderDep>; 208 209 OrderDepsType OrderDeps; 210 getOrderDepOrNullLoopCarriedEdges211 const OrderDep *getOrderDepOrNull(SUnit *Key) const { 212 auto Ite = OrderDeps.find(Key); 213 if (Ite == OrderDeps.end()) 214 return nullptr; 215 return &Ite->second; 216 } 217 218 /// Adds some edges to the original DAG that correspond to loop-carried 219 /// dependencies. Historically, loop-carried edges are represented by using 220 /// non-loop-carried edges in the original DAG. This function appends such 221 /// edges to preserve the previous behavior. 222 void modifySUnits(std::vector<SUnit> &SUnits, const TargetInstrInfo *TII); 223 224 void dump(SUnit *SU, const TargetRegisterInfo *TRI, 225 const MachineRegisterInfo *MRI) const; 226 }; 227 228 /// This class provides APIs to retrieve edges from/to an SUnit node, with a 229 /// particular focus on loop-carried dependencies. Since SUnit is not designed 230 /// to represent such edges, handling them directly using its APIs has required 231 /// non-trivial logic in the past. This class serves as a wrapper around SUnit, 232 /// offering a simpler interface for managing these dependencies. 233 class SwingSchedulerDDG { 234 using EdgesType = SmallVector<SwingSchedulerDDGEdge, 4>; 235 236 struct SwingSchedulerDDGEdges { 237 EdgesType Preds; 238 EdgesType Succs; 239 }; 240 241 void initEdges(SUnit *SU); 242 243 SUnit *EntrySU; 244 SUnit *ExitSU; 245 246 std::vector<SwingSchedulerDDGEdges> EdgesVec; 247 SwingSchedulerDDGEdges EntrySUEdges; 248 SwingSchedulerDDGEdges ExitSUEdges; 249 250 /// Edges that are used only when validating the schedule. These edges are 251 /// not considered to drive the optimization heuristics. 252 SmallVector<SwingSchedulerDDGEdge, 8> ValidationOnlyEdges; 253 254 /// Adds a NON-validation-only edge to the DDG. Assumes to be called only by 255 /// the ctor. 256 void addEdge(const SUnit *SU, const SwingSchedulerDDGEdge &Edge); 257 258 SwingSchedulerDDGEdges &getEdges(const SUnit *SU); 259 const SwingSchedulerDDGEdges &getEdges(const SUnit *SU) const; 260 261 public: 262 SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU, SUnit *ExitSU, 263 const LoopCarriedEdges &LCE); 264 265 const EdgesType &getInEdges(const SUnit *SU) const; 266 267 const EdgesType &getOutEdges(const SUnit *SU) const; 268 269 bool isValidSchedule(const SMSchedule &Schedule) const; 270 }; 271 272 /// This class builds the dependence graph for the instructions in a loop, 273 /// and attempts to schedule the instructions using the SMS algorithm. 274 class SwingSchedulerDAG : public ScheduleDAGInstrs { 275 MachinePipeliner &Pass; 276 277 std::unique_ptr<SwingSchedulerDDG> DDG; 278 279 /// The minimum initiation interval between iterations for this schedule. 280 unsigned MII = 0; 281 /// The maximum initiation interval between iterations for this schedule. 282 unsigned MAX_II = 0; 283 /// Set to true if a valid pipelined schedule is found for the loop. 284 bool Scheduled = false; 285 MachineLoop &Loop; 286 LiveIntervals &LIS; 287 const RegisterClassInfo &RegClassInfo; 288 unsigned II_setByPragma = 0; 289 TargetInstrInfo::PipelinerLoopInfo *LoopPipelinerInfo = nullptr; 290 291 /// A topological ordering of the SUnits, which is needed for changing 292 /// dependences and iterating over the SUnits. 293 ScheduleDAGTopologicalSort Topo; 294 295 struct NodeInfo { 296 int ASAP = 0; 297 int ALAP = 0; 298 int ZeroLatencyDepth = 0; 299 int ZeroLatencyHeight = 0; 300 301 NodeInfo() = default; 302 }; 303 /// Computed properties for each node in the graph. 304 std::vector<NodeInfo> ScheduleInfo; 305 306 enum OrderKind { BottomUp = 0, TopDown = 1 }; 307 /// Computed node ordering for scheduling. 308 SetVector<SUnit *> NodeOrder; 309 310 using NodeSetType = SmallVector<NodeSet, 8>; 311 using ValueMapTy = DenseMap<unsigned, unsigned>; 312 using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>; 313 using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>; 314 315 /// Instructions to change when emitting the final schedule. 316 DenseMap<SUnit *, std::pair<Register, int64_t>> InstrChanges; 317 318 /// We may create a new instruction, so remember it because it 319 /// must be deleted when the pass is finished. 320 DenseMap<MachineInstr*, MachineInstr *> NewMIs; 321 322 /// Ordered list of DAG postprocessing steps. 323 std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations; 324 325 /// Used to compute single-iteration dependencies (i.e., buildSchedGraph). 326 AliasAnalysis *AA; 327 328 /// Used to compute loop-carried dependencies (i.e., 329 /// addLoopCarriedDependences). 330 BatchAAResults BAA; 331 332 /// Helper class to implement Johnson's circuit finding algorithm. 333 class Circuits { 334 std::vector<SUnit> &SUnits; 335 SetVector<SUnit *> Stack; 336 BitVector Blocked; 337 SmallVector<SmallPtrSet<SUnit *, 4>, 10> B; 338 SmallVector<SmallVector<int, 4>, 16> AdjK; 339 // Node to Index from ScheduleDAGTopologicalSort 340 std::vector<int> *Node2Idx; 341 unsigned NumPaths = 0u; 342 static unsigned MaxPaths; 343 344 public: Circuits(std::vector<SUnit> & SUs,ScheduleDAGTopologicalSort & Topo)345 Circuits(std::vector<SUnit> &SUs, ScheduleDAGTopologicalSort &Topo) 346 : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) { 347 Node2Idx = new std::vector<int>(SUs.size()); 348 unsigned Idx = 0; 349 for (const auto &NodeNum : Topo) 350 Node2Idx->at(NodeNum) = Idx++; 351 } 352 Circuits &operator=(const Circuits &other) = delete; 353 Circuits(const Circuits &other) = delete; ~Circuits()354 ~Circuits() { delete Node2Idx; } 355 356 /// Reset the data structures used in the circuit algorithm. reset()357 void reset() { 358 Stack.clear(); 359 Blocked.reset(); 360 B.assign(SUnits.size(), SmallPtrSet<SUnit *, 4>()); 361 NumPaths = 0; 362 } 363 364 void createAdjacencyStructure(SwingSchedulerDAG *DAG); 365 bool circuit(int V, int S, NodeSetType &NodeSets, 366 const SwingSchedulerDAG *DAG, bool HasBackedge = false); 367 void unblock(int U); 368 }; 369 370 struct CopyToPhiMutation : public ScheduleDAGMutation { 371 void apply(ScheduleDAGInstrs *DAG) override; 372 }; 373 374 public: SwingSchedulerDAG(MachinePipeliner & P,MachineLoop & L,LiveIntervals & lis,const RegisterClassInfo & rci,unsigned II,TargetInstrInfo::PipelinerLoopInfo * PLI,AliasAnalysis * AA)375 SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis, 376 const RegisterClassInfo &rci, unsigned II, 377 TargetInstrInfo::PipelinerLoopInfo *PLI, AliasAnalysis *AA) 378 : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis), 379 RegClassInfo(rci), II_setByPragma(II), LoopPipelinerInfo(PLI), 380 Topo(SUnits, &ExitSU), AA(AA), BAA(*AA) { 381 P.MF->getSubtarget().getSMSMutations(Mutations); 382 if (SwpEnableCopyToPhi) 383 Mutations.push_back(std::make_unique<CopyToPhiMutation>()); 384 BAA.enableCrossIterationMode(); 385 } 386 387 void schedule() override; 388 void finishBlock() override; 389 390 /// Return true if the loop kernel has been scheduled. hasNewSchedule()391 bool hasNewSchedule() { return Scheduled; } 392 393 /// Return the earliest time an instruction may be scheduled. getASAP(SUnit * Node)394 int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; } 395 396 /// Return the latest time an instruction my be scheduled. getALAP(SUnit * Node)397 int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; } 398 399 /// The mobility function, which the number of slots in which 400 /// an instruction may be scheduled. getMOV(SUnit * Node)401 int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); } 402 403 /// The depth, in the dependence graph, for a node. getDepth(SUnit * Node)404 unsigned getDepth(SUnit *Node) { return Node->getDepth(); } 405 406 /// The maximum unweighted length of a path from an arbitrary node to the 407 /// given node in which each edge has latency 0 getZeroLatencyDepth(SUnit * Node)408 int getZeroLatencyDepth(SUnit *Node) { 409 return ScheduleInfo[Node->NodeNum].ZeroLatencyDepth; 410 } 411 412 /// The height, in the dependence graph, for a node. getHeight(SUnit * Node)413 unsigned getHeight(SUnit *Node) { return Node->getHeight(); } 414 415 /// The maximum unweighted length of a path from the given node to an 416 /// arbitrary node in which each edge has latency 0 getZeroLatencyHeight(SUnit * Node)417 int getZeroLatencyHeight(SUnit *Node) { 418 return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight; 419 } 420 421 bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const; 422 423 void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule); 424 425 void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs); 426 427 /// Return the new base register that was stored away for the changed 428 /// instruction. getInstrBaseReg(SUnit * SU)429 Register getInstrBaseReg(SUnit *SU) const { 430 DenseMap<SUnit *, std::pair<Register, int64_t>>::const_iterator It = 431 InstrChanges.find(SU); 432 if (It != InstrChanges.end()) 433 return It->second.first; 434 return Register(); 435 } 436 addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation)437 void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) { 438 Mutations.push_back(std::move(Mutation)); 439 } 440 classof(const ScheduleDAGInstrs * DAG)441 static bool classof(const ScheduleDAGInstrs *DAG) { return true; } 442 getDDG()443 const SwingSchedulerDDG *getDDG() const { return DDG.get(); } 444 445 bool mayOverlapInLaterIter(const MachineInstr *BaseMI, 446 const MachineInstr *OtherMI) const; 447 448 private: 449 LoopCarriedEdges addLoopCarriedDependences(); 450 void updatePhiDependences(); 451 void changeDependences(); 452 unsigned calculateResMII(); 453 unsigned calculateRecMII(NodeSetType &RecNodeSets); 454 void findCircuits(NodeSetType &NodeSets); 455 void fuseRecs(NodeSetType &NodeSets); 456 void removeDuplicateNodes(NodeSetType &NodeSets); 457 void computeNodeFunctions(NodeSetType &NodeSets); 458 void registerPressureFilter(NodeSetType &NodeSets); 459 void colocateNodeSets(NodeSetType &NodeSets); 460 void checkNodeSets(NodeSetType &NodeSets); 461 void groupRemainingNodes(NodeSetType &NodeSets); 462 void addConnectedNodes(SUnit *SU, NodeSet &NewSet, 463 SetVector<SUnit *> &NodesAdded); 464 void computeNodeOrder(NodeSetType &NodeSets); 465 void checkValidNodeOrder(const NodeSetType &Circuits) const; 466 bool schedulePipeline(SMSchedule &Schedule); 467 bool computeDelta(const MachineInstr &MI, int &Delta) const; 468 MachineInstr *findDefInLoop(Register Reg); 469 bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos, 470 unsigned &OffsetPos, Register &NewBase, 471 int64_t &NewOffset); 472 void postProcessDAG(); 473 /// Set the Minimum Initiation Interval for this schedule attempt. 474 void setMII(unsigned ResMII, unsigned RecMII); 475 /// Set the Maximum Initiation Interval for this schedule attempt. 476 void setMAX_II(); 477 }; 478 479 /// A NodeSet contains a set of SUnit DAG nodes with additional information 480 /// that assigns a priority to the set. 481 class NodeSet { 482 SetVector<SUnit *> Nodes; 483 bool HasRecurrence = false; 484 unsigned RecMII = 0; 485 int MaxMOV = 0; 486 unsigned MaxDepth = 0; 487 unsigned Colocate = 0; 488 SUnit *ExceedPressure = nullptr; 489 unsigned Latency = 0; 490 491 public: 492 using iterator = SetVector<SUnit *>::const_iterator; 493 494 NodeSet() = default; NodeSet(iterator S,iterator E,const SwingSchedulerDAG * DAG)495 NodeSet(iterator S, iterator E, const SwingSchedulerDAG *DAG) 496 : Nodes(S, E), HasRecurrence(true) { 497 // Calculate the latency of this node set. 498 // Example to demonstrate the calculation: 499 // Given: N0 -> N1 -> N2 -> N0 500 // Edges: 501 // (N0 -> N1, 3) 502 // (N0 -> N1, 5) 503 // (N1 -> N2, 2) 504 // (N2 -> N0, 1) 505 // The total latency which is a lower bound of the recurrence MII is the 506 // longest path from N0 back to N0 given only the edges of this node set. 507 // In this example, the latency is: 5 + 2 + 1 = 8. 508 // 509 // Hold a map from each SUnit in the circle to the maximum distance from the 510 // source node by only considering the nodes. 511 const SwingSchedulerDDG *DDG = DAG->getDDG(); 512 DenseMap<SUnit *, unsigned> SUnitToDistance; 513 for (auto *Node : Nodes) 514 SUnitToDistance[Node] = 0; 515 516 for (unsigned I = 1, E = Nodes.size(); I <= E; ++I) { 517 SUnit *U = Nodes[I - 1]; 518 SUnit *V = Nodes[I % Nodes.size()]; 519 for (const SwingSchedulerDDGEdge &Succ : DDG->getOutEdges(U)) { 520 SUnit *SuccSUnit = Succ.getDst(); 521 if (V != SuccSUnit) 522 continue; 523 unsigned &DU = SUnitToDistance[U]; 524 unsigned &DV = SUnitToDistance[V]; 525 if (DU + Succ.getLatency() > DV) 526 DV = DU + Succ.getLatency(); 527 } 528 } 529 // Handle a back-edge in loop carried dependencies 530 SUnit *FirstNode = Nodes[0]; 531 SUnit *LastNode = Nodes[Nodes.size() - 1]; 532 533 for (auto &PI : DDG->getInEdges(LastNode)) { 534 // If we have an order dep that is potentially loop carried then a 535 // back-edge exists between the last node and the first node that isn't 536 // modeled in the DAG. Handle it manually by adding 1 to the distance of 537 // the last node. 538 if (PI.getSrc() != FirstNode || !PI.isOrderDep() || 539 !DAG->isLoopCarriedDep(PI)) 540 continue; 541 unsigned &First = SUnitToDistance[FirstNode]; 542 unsigned Last = SUnitToDistance[LastNode]; 543 First = std::max(First, Last + 1); 544 } 545 546 // The latency is the distance from the source node to itself. 547 Latency = SUnitToDistance[Nodes.front()]; 548 } 549 insert(SUnit * SU)550 bool insert(SUnit *SU) { return Nodes.insert(SU); } 551 insert(iterator S,iterator E)552 void insert(iterator S, iterator E) { Nodes.insert(S, E); } 553 remove_if(UnaryPredicate P)554 template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) { 555 return Nodes.remove_if(P); 556 } 557 count(SUnit * SU)558 unsigned count(SUnit *SU) const { return Nodes.count(SU); } 559 hasRecurrence()560 bool hasRecurrence() { return HasRecurrence; }; 561 size()562 unsigned size() const { return Nodes.size(); } 563 empty()564 bool empty() const { return Nodes.empty(); } 565 getNode(unsigned i)566 SUnit *getNode(unsigned i) const { return Nodes[i]; }; 567 setRecMII(unsigned mii)568 void setRecMII(unsigned mii) { RecMII = mii; }; 569 setColocate(unsigned c)570 void setColocate(unsigned c) { Colocate = c; }; 571 setExceedPressure(SUnit * SU)572 void setExceedPressure(SUnit *SU) { ExceedPressure = SU; } 573 isExceedSU(SUnit * SU)574 bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; } 575 compareRecMII(NodeSet & RHS)576 int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; } 577 getRecMII()578 int getRecMII() { return RecMII; } 579 580 /// Summarize node functions for the entire node set. computeNodeSetInfo(SwingSchedulerDAG * SSD)581 void computeNodeSetInfo(SwingSchedulerDAG *SSD) { 582 for (SUnit *SU : *this) { 583 MaxMOV = std::max(MaxMOV, SSD->getMOV(SU)); 584 MaxDepth = std::max(MaxDepth, SSD->getDepth(SU)); 585 } 586 } 587 getLatency()588 unsigned getLatency() { return Latency; } 589 getMaxDepth()590 unsigned getMaxDepth() { return MaxDepth; } 591 clear()592 void clear() { 593 Nodes.clear(); 594 RecMII = 0; 595 HasRecurrence = false; 596 MaxMOV = 0; 597 MaxDepth = 0; 598 Colocate = 0; 599 ExceedPressure = nullptr; 600 } 601 602 operator SetVector<SUnit *> &() { return Nodes; } 603 604 /// Sort the node sets by importance. First, rank them by recurrence MII, 605 /// then by mobility (least mobile done first), and finally by depth. 606 /// Each node set may contain a colocate value which is used as the first 607 /// tie breaker, if it's set. 608 bool operator>(const NodeSet &RHS) const { 609 if (RecMII == RHS.RecMII) { 610 if (Colocate != 0 && RHS.Colocate != 0 && Colocate != RHS.Colocate) 611 return Colocate < RHS.Colocate; 612 if (MaxMOV == RHS.MaxMOV) 613 return MaxDepth > RHS.MaxDepth; 614 return MaxMOV < RHS.MaxMOV; 615 } 616 return RecMII > RHS.RecMII; 617 } 618 619 bool operator==(const NodeSet &RHS) const { 620 return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV && 621 MaxDepth == RHS.MaxDepth; 622 } 623 624 bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); } 625 begin()626 iterator begin() { return Nodes.begin(); } end()627 iterator end() { return Nodes.end(); } 628 void print(raw_ostream &os) const; 629 630 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 631 LLVM_DUMP_METHOD void dump() const; 632 #endif 633 }; 634 635 // 16 was selected based on the number of ProcResource kinds for all 636 // existing Subtargets, so that SmallVector don't need to resize too often. 637 static const int DefaultProcResSize = 16; 638 639 class ResourceManager { 640 private: 641 const MCSubtargetInfo *STI; 642 const MCSchedModel &SM; 643 const TargetSubtargetInfo *ST; 644 const TargetInstrInfo *TII; 645 ScheduleDAGInstrs *DAG; 646 const bool UseDFA; 647 /// DFA resources for each slot 648 llvm::SmallVector<std::unique_ptr<DFAPacketizer>> DFAResources; 649 /// Modulo Reservation Table. When a resource with ID R is consumed in cycle 650 /// C, it is counted in MRT[C mod II][R]. (Used when UseDFA == F) 651 llvm::SmallVector<llvm::SmallVector<uint64_t, DefaultProcResSize>> MRT; 652 /// The number of scheduled micro operations for each slot. Micro operations 653 /// are assumed to be scheduled one per cycle, starting with the cycle in 654 /// which the instruction is scheduled. 655 llvm::SmallVector<int> NumScheduledMops; 656 /// Each processor resource is associated with a so-called processor resource 657 /// mask. This vector allows to correlate processor resource IDs with 658 /// processor resource masks. There is exactly one element per each processor 659 /// resource declared by the scheduling model. 660 llvm::SmallVector<uint64_t, DefaultProcResSize> ProcResourceMasks; 661 int InitiationInterval = 0; 662 /// The number of micro operations that can be scheduled at a cycle. 663 int IssueWidth; 664 665 int calculateResMIIDFA() const; 666 /// Check if MRT is overbooked 667 bool isOverbooked() const; 668 /// Reserve resources on MRT 669 void reserveResources(const MCSchedClassDesc *SCDesc, int Cycle); 670 /// Unreserve resources on MRT 671 void unreserveResources(const MCSchedClassDesc *SCDesc, int Cycle); 672 673 /// Return M satisfying Dividend = Divisor * X + M, 0 < M < Divisor. 674 /// The slot on MRT to reserve a resource for the cycle C is positiveModulo(C, 675 /// II). positiveModulo(int Dividend,int Divisor)676 int positiveModulo(int Dividend, int Divisor) const { 677 assert(Divisor > 0); 678 int R = Dividend % Divisor; 679 if (R < 0) 680 R += Divisor; 681 return R; 682 } 683 684 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 685 LLVM_DUMP_METHOD void dumpMRT() const; 686 #endif 687 688 public: ResourceManager(const TargetSubtargetInfo * ST,ScheduleDAGInstrs * DAG)689 ResourceManager(const TargetSubtargetInfo *ST, ScheduleDAGInstrs *DAG) 690 : STI(ST), SM(ST->getSchedModel()), ST(ST), TII(ST->getInstrInfo()), 691 DAG(DAG), UseDFA(ST->useDFAforSMS()), 692 ProcResourceMasks(SM.getNumProcResourceKinds(), 0), 693 IssueWidth(SM.IssueWidth) { 694 initProcResourceVectors(SM, ProcResourceMasks); 695 if (IssueWidth <= 0) 696 // If IssueWidth is not specified, set a sufficiently large value 697 IssueWidth = 100; 698 if (SwpForceIssueWidth > 0) 699 IssueWidth = SwpForceIssueWidth; 700 } 701 702 void initProcResourceVectors(const MCSchedModel &SM, 703 SmallVectorImpl<uint64_t> &Masks); 704 705 /// Check if the resources occupied by a machine instruction are available 706 /// in the current state. 707 bool canReserveResources(SUnit &SU, int Cycle); 708 709 /// Reserve the resources occupied by a machine instruction and change the 710 /// current state to reflect that change. 711 void reserveResources(SUnit &SU, int Cycle); 712 713 int calculateResMII() const; 714 715 /// Initialize resources with the initiation interval II. 716 void init(int II); 717 }; 718 719 /// This class represents the scheduled code. The main data structure is a 720 /// map from scheduled cycle to instructions. During scheduling, the 721 /// data structure explicitly represents all stages/iterations. When 722 /// the algorithm finshes, the schedule is collapsed into a single stage, 723 /// which represents instructions from different loop iterations. 724 /// 725 /// The SMS algorithm allows negative values for cycles, so the first cycle 726 /// in the schedule is the smallest cycle value. 727 class SMSchedule { 728 private: 729 /// Map from execution cycle to instructions. 730 DenseMap<int, std::deque<SUnit *>> ScheduledInstrs; 731 732 /// Map from instruction to execution cycle. 733 std::map<SUnit *, int> InstrToCycle; 734 735 /// Keep track of the first cycle value in the schedule. It starts 736 /// as zero, but the algorithm allows negative values. 737 int FirstCycle = 0; 738 739 /// Keep track of the last cycle value in the schedule. 740 int LastCycle = 0; 741 742 /// The initiation interval (II) for the schedule. 743 int InitiationInterval = 0; 744 745 /// Target machine information. 746 const TargetSubtargetInfo &ST; 747 748 /// Virtual register information. 749 MachineRegisterInfo &MRI; 750 751 ResourceManager ProcItinResources; 752 753 public: SMSchedule(MachineFunction * mf,SwingSchedulerDAG * DAG)754 SMSchedule(MachineFunction *mf, SwingSchedulerDAG *DAG) 755 : ST(mf->getSubtarget()), MRI(mf->getRegInfo()), 756 ProcItinResources(&ST, DAG) {} 757 reset()758 void reset() { 759 ScheduledInstrs.clear(); 760 InstrToCycle.clear(); 761 FirstCycle = 0; 762 LastCycle = 0; 763 InitiationInterval = 0; 764 } 765 766 /// Set the initiation interval for this schedule. setInitiationInterval(int ii)767 void setInitiationInterval(int ii) { 768 InitiationInterval = ii; 769 ProcItinResources.init(ii); 770 } 771 772 /// Return the initiation interval for this schedule. getInitiationInterval()773 int getInitiationInterval() const { return InitiationInterval; } 774 775 /// Return the first cycle in the completed schedule. This 776 /// can be a negative value. getFirstCycle()777 int getFirstCycle() const { return FirstCycle; } 778 779 /// Return the last cycle in the finalized schedule. getFinalCycle()780 int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; } 781 782 /// Return the cycle of the earliest scheduled instruction in the dependence 783 /// chain. 784 int earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, 785 const SwingSchedulerDDG *DDG); 786 787 /// Return the cycle of the latest scheduled instruction in the dependence 788 /// chain. 789 int latestCycleInChain(const SwingSchedulerDDGEdge &Dep, 790 const SwingSchedulerDDG *DDG); 791 792 void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, 793 SwingSchedulerDAG *DAG); 794 bool insert(SUnit *SU, int StartCycle, int EndCycle, int II); 795 796 /// Iterators for the cycle to instruction map. 797 using sched_iterator = DenseMap<int, std::deque<SUnit *>>::iterator; 798 using const_sched_iterator = 799 DenseMap<int, std::deque<SUnit *>>::const_iterator; 800 801 /// Return true if the instruction is scheduled at the specified stage. isScheduledAtStage(SUnit * SU,unsigned StageNum)802 bool isScheduledAtStage(SUnit *SU, unsigned StageNum) { 803 return (stageScheduled(SU) == (int)StageNum); 804 } 805 806 /// Return the stage for a scheduled instruction. Return -1 if 807 /// the instruction has not been scheduled. stageScheduled(SUnit * SU)808 int stageScheduled(SUnit *SU) const { 809 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU); 810 if (it == InstrToCycle.end()) 811 return -1; 812 return (it->second - FirstCycle) / InitiationInterval; 813 } 814 815 /// Return the cycle for a scheduled instruction. This function normalizes 816 /// the first cycle to be 0. cycleScheduled(SUnit * SU)817 unsigned cycleScheduled(SUnit *SU) const { 818 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU); 819 assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled."); 820 return (it->second - FirstCycle) % InitiationInterval; 821 } 822 823 /// Return the maximum stage count needed for this schedule. getMaxStageCount()824 unsigned getMaxStageCount() { 825 return (LastCycle - FirstCycle) / InitiationInterval; 826 } 827 828 /// Return the instructions that are scheduled at the specified cycle. getInstructions(int cycle)829 std::deque<SUnit *> &getInstructions(int cycle) { 830 return ScheduledInstrs[cycle]; 831 } 832 833 SmallSet<SUnit *, 8> 834 computeUnpipelineableNodes(SwingSchedulerDAG *SSD, 835 TargetInstrInfo::PipelinerLoopInfo *PLI); 836 837 std::deque<SUnit *> 838 reorderInstructions(const SwingSchedulerDAG *SSD, 839 const std::deque<SUnit *> &Instrs) const; 840 841 bool 842 normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, 843 TargetInstrInfo::PipelinerLoopInfo *PLI); 844 bool isValidSchedule(SwingSchedulerDAG *SSD); 845 void finalizeSchedule(SwingSchedulerDAG *SSD); 846 void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, 847 std::deque<SUnit *> &Insts) const; 848 bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const; 849 bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, 850 MachineOperand &MO) const; 851 852 bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, 853 const SwingSchedulerDDG *DDG) const; 854 void print(raw_ostream &os) const; 855 void dump() const; 856 }; 857 858 } // end namespace llvm 859 860 #endif // LLVM_CODEGEN_MACHINEPIPELINER_H 861