xref: /freebsd/contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp (revision c364ccf9ce2d1d0fc24247aa771cf52e5dfb532a)
1  //===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2  //
3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  // See https://llvm.org/LICENSE.txt for license information.
5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  //
7  //===----------------------------------------------------------------------===//
8  //
9  // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10  //
11  // This SMS implementation is a target-independent back-end pass. When enabled,
12  // the pass runs just prior to the register allocation pass, while the machine
13  // IR is in SSA form. If software pipelining is successful, then the original
14  // loop is replaced by the optimized loop. The optimized loop contains one or
15  // more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16  // the instructions cannot be scheduled in a given MII, we increase the MII by
17  // one and try again.
18  //
19  // The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20  // represent loop carried dependences in the DAG as order edges to the Phi
21  // nodes. We also perform several passes over the DAG to eliminate unnecessary
22  // edges that inhibit the ability to pipeline. The implementation uses the
23  // DFAPacketizer class to compute the minimum initiation interval and the check
24  // where an instruction may be inserted in the pipelined schedule.
25  //
26  // In order for the SMS pass to work, several target specific hooks need to be
27  // implemented to get information about the loop structure and to rewrite
28  // instructions.
29  //
30  //===----------------------------------------------------------------------===//
31  
32  #include "llvm/ADT/ArrayRef.h"
33  #include "llvm/ADT/BitVector.h"
34  #include "llvm/ADT/DenseMap.h"
35  #include "llvm/ADT/MapVector.h"
36  #include "llvm/ADT/PriorityQueue.h"
37  #include "llvm/ADT/SetVector.h"
38  #include "llvm/ADT/SmallPtrSet.h"
39  #include "llvm/ADT/SmallSet.h"
40  #include "llvm/ADT/SmallVector.h"
41  #include "llvm/ADT/Statistic.h"
42  #include "llvm/ADT/iterator_range.h"
43  #include "llvm/Analysis/AliasAnalysis.h"
44  #include "llvm/Analysis/MemoryLocation.h"
45  #include "llvm/Analysis/ValueTracking.h"
46  #include "llvm/CodeGen/DFAPacketizer.h"
47  #include "llvm/CodeGen/LiveIntervals.h"
48  #include "llvm/CodeGen/MachineBasicBlock.h"
49  #include "llvm/CodeGen/MachineDominators.h"
50  #include "llvm/CodeGen/MachineFunction.h"
51  #include "llvm/CodeGen/MachineFunctionPass.h"
52  #include "llvm/CodeGen/MachineInstr.h"
53  #include "llvm/CodeGen/MachineInstrBuilder.h"
54  #include "llvm/CodeGen/MachineLoopInfo.h"
55  #include "llvm/CodeGen/MachineMemOperand.h"
56  #include "llvm/CodeGen/MachineOperand.h"
57  #include "llvm/CodeGen/MachinePipeliner.h"
58  #include "llvm/CodeGen/MachineRegisterInfo.h"
59  #include "llvm/CodeGen/ModuloSchedule.h"
60  #include "llvm/CodeGen/RegisterPressure.h"
61  #include "llvm/CodeGen/ScheduleDAG.h"
62  #include "llvm/CodeGen/ScheduleDAGMutation.h"
63  #include "llvm/CodeGen/TargetOpcodes.h"
64  #include "llvm/CodeGen/TargetRegisterInfo.h"
65  #include "llvm/CodeGen/TargetSubtargetInfo.h"
66  #include "llvm/Config/llvm-config.h"
67  #include "llvm/IR/Attributes.h"
68  #include "llvm/IR/DebugLoc.h"
69  #include "llvm/IR/Function.h"
70  #include "llvm/MC/LaneBitmask.h"
71  #include "llvm/MC/MCInstrDesc.h"
72  #include "llvm/MC/MCInstrItineraries.h"
73  #include "llvm/MC/MCRegisterInfo.h"
74  #include "llvm/Pass.h"
75  #include "llvm/Support/CommandLine.h"
76  #include "llvm/Support/Compiler.h"
77  #include "llvm/Support/Debug.h"
78  #include "llvm/Support/MathExtras.h"
79  #include "llvm/Support/raw_ostream.h"
80  #include <algorithm>
81  #include <cassert>
82  #include <climits>
83  #include <cstdint>
84  #include <deque>
85  #include <functional>
86  #include <iterator>
87  #include <map>
88  #include <memory>
89  #include <tuple>
90  #include <utility>
91  #include <vector>
92  
93  using namespace llvm;
94  
95  #define DEBUG_TYPE "pipeliner"
96  
97  STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
98  STATISTIC(NumPipelined, "Number of loops software pipelined");
99  STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
100  STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
101  STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
102  STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
103  STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
104  STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
105  STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
106  STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
107  STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
108  
109  /// A command line option to turn software pipelining on or off.
110  static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
111                                 cl::ZeroOrMore,
112                                 cl::desc("Enable Software Pipelining"));
113  
114  /// A command line option to enable SWP at -Os.
115  static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
116                                        cl::desc("Enable SWP at Os."), cl::Hidden,
117                                        cl::init(false));
118  
119  /// A command line argument to limit minimum initial interval for pipelining.
120  static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
121                                cl::desc("Size limit for the MII."),
122                                cl::Hidden, cl::init(27));
123  
124  /// A command line argument to limit the number of stages in the pipeline.
125  static cl::opt<int>
126      SwpMaxStages("pipeliner-max-stages",
127                   cl::desc("Maximum stages allowed in the generated scheduled."),
128                   cl::Hidden, cl::init(3));
129  
130  /// A command line option to disable the pruning of chain dependences due to
131  /// an unrelated Phi.
132  static cl::opt<bool>
133      SwpPruneDeps("pipeliner-prune-deps",
134                   cl::desc("Prune dependences between unrelated Phi nodes."),
135                   cl::Hidden, cl::init(true));
136  
137  /// A command line option to disable the pruning of loop carried order
138  /// dependences.
139  static cl::opt<bool>
140      SwpPruneLoopCarried("pipeliner-prune-loop-carried",
141                          cl::desc("Prune loop carried order dependences."),
142                          cl::Hidden, cl::init(true));
143  
144  #ifndef NDEBUG
145  static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
146  #endif
147  
148  static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
149                                       cl::ReallyHidden, cl::init(false),
150                                       cl::ZeroOrMore, cl::desc("Ignore RecMII"));
151  
152  static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
153                                      cl::init(false));
154  static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
155                                        cl::init(false));
156  
157  static cl::opt<bool> EmitTestAnnotations(
158      "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
159      cl::desc("Instead of emitting the pipelined code, annotate instructions "
160               "with the generated schedule for feeding into the "
161               "-modulo-schedule-test pass"));
162  
163  static cl::opt<bool> ExperimentalCodeGen(
164      "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
165      cl::desc(
166          "Use the experimental peeling code generator for software pipelining"));
167  
168  namespace llvm {
169  
170  // A command line option to enable the CopyToPhi DAG mutation.
171  cl::opt<bool>
172      SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
173                         cl::init(true), cl::ZeroOrMore,
174                         cl::desc("Enable CopyToPhi DAG Mutation"));
175  
176  } // end namespace llvm
177  
178  unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
179  char MachinePipeliner::ID = 0;
180  #ifndef NDEBUG
181  int MachinePipeliner::NumTries = 0;
182  #endif
183  char &llvm::MachinePipelinerID = MachinePipeliner::ID;
184  
185  INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
186                        "Modulo Software Pipelining", false, false)
187  INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
188  INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
189  INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
190  INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
191  INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
192                      "Modulo Software Pipelining", false, false)
193  
194  /// The "main" function for implementing Swing Modulo Scheduling.
195  bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
196    if (skipFunction(mf.getFunction()))
197      return false;
198  
199    if (!EnableSWP)
200      return false;
201  
202    if (mf.getFunction().getAttributes().hasAttribute(
203            AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
204        !EnableSWPOptSize.getPosition())
205      return false;
206  
207    if (!mf.getSubtarget().enableMachinePipeliner())
208      return false;
209  
210    // Cannot pipeline loops without instruction itineraries if we are using
211    // DFA for the pipeliner.
212    if (mf.getSubtarget().useDFAforSMS() &&
213        (!mf.getSubtarget().getInstrItineraryData() ||
214         mf.getSubtarget().getInstrItineraryData()->isEmpty()))
215      return false;
216  
217    MF = &mf;
218    MLI = &getAnalysis<MachineLoopInfo>();
219    MDT = &getAnalysis<MachineDominatorTree>();
220    TII = MF->getSubtarget().getInstrInfo();
221    RegClassInfo.runOnMachineFunction(*MF);
222  
223    for (auto &L : *MLI)
224      scheduleLoop(*L);
225  
226    return false;
227  }
228  
229  /// Attempt to perform the SMS algorithm on the specified loop. This function is
230  /// the main entry point for the algorithm.  The function identifies candidate
231  /// loops, calculates the minimum initiation interval, and attempts to schedule
232  /// the loop.
233  bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
234    bool Changed = false;
235    for (auto &InnerLoop : L)
236      Changed |= scheduleLoop(*InnerLoop);
237  
238  #ifndef NDEBUG
239    // Stop trying after reaching the limit (if any).
240    int Limit = SwpLoopLimit;
241    if (Limit >= 0) {
242      if (NumTries >= SwpLoopLimit)
243        return Changed;
244      NumTries++;
245    }
246  #endif
247  
248    setPragmaPipelineOptions(L);
249    if (!canPipelineLoop(L)) {
250      LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
251      return Changed;
252    }
253  
254    ++NumTrytoPipeline;
255  
256    Changed = swingModuloScheduler(L);
257  
258    return Changed;
259  }
260  
261  void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
262    MachineBasicBlock *LBLK = L.getTopBlock();
263  
264    if (LBLK == nullptr)
265      return;
266  
267    const BasicBlock *BBLK = LBLK->getBasicBlock();
268    if (BBLK == nullptr)
269      return;
270  
271    const Instruction *TI = BBLK->getTerminator();
272    if (TI == nullptr)
273      return;
274  
275    MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
276    if (LoopID == nullptr)
277      return;
278  
279    assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
280    assert(LoopID->getOperand(0) == LoopID && "invalid loop");
281  
282    for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
283      MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
284  
285      if (MD == nullptr)
286        continue;
287  
288      MDString *S = dyn_cast<MDString>(MD->getOperand(0));
289  
290      if (S == nullptr)
291        continue;
292  
293      if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
294        assert(MD->getNumOperands() == 2 &&
295               "Pipeline initiation interval hint metadata should have two operands.");
296        II_setByPragma =
297            mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
298        assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
299      } else if (S->getString() == "llvm.loop.pipeline.disable") {
300        disabledByPragma = true;
301      }
302    }
303  }
304  
305  /// Return true if the loop can be software pipelined.  The algorithm is
306  /// restricted to loops with a single basic block.  Make sure that the
307  /// branch in the loop can be analyzed.
308  bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
309    if (L.getNumBlocks() != 1)
310      return false;
311  
312    if (disabledByPragma)
313      return false;
314  
315    // Check if the branch can't be understood because we can't do pipelining
316    // if that's the case.
317    LI.TBB = nullptr;
318    LI.FBB = nullptr;
319    LI.BrCond.clear();
320    if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
321      LLVM_DEBUG(
322          dbgs() << "Unable to analyzeBranch, can NOT pipeline current Loop\n");
323      NumFailBranch++;
324      return false;
325    }
326  
327    LI.LoopInductionVar = nullptr;
328    LI.LoopCompare = nullptr;
329    if (!TII->analyzeLoopForPipelining(L.getTopBlock())) {
330      LLVM_DEBUG(
331          dbgs() << "Unable to analyzeLoop, can NOT pipeline current Loop\n");
332      NumFailLoop++;
333      return false;
334    }
335  
336    if (!L.getLoopPreheader()) {
337      LLVM_DEBUG(
338          dbgs() << "Preheader not found, can NOT pipeline current Loop\n");
339      NumFailPreheader++;
340      return false;
341    }
342  
343    // Remove any subregisters from inputs to phi nodes.
344    preprocessPhiNodes(*L.getHeader());
345    return true;
346  }
347  
348  void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
349    MachineRegisterInfo &MRI = MF->getRegInfo();
350    SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
351  
352    for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
353      MachineOperand &DefOp = PI.getOperand(0);
354      assert(DefOp.getSubReg() == 0);
355      auto *RC = MRI.getRegClass(DefOp.getReg());
356  
357      for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
358        MachineOperand &RegOp = PI.getOperand(i);
359        if (RegOp.getSubReg() == 0)
360          continue;
361  
362        // If the operand uses a subregister, replace it with a new register
363        // without subregisters, and generate a copy to the new register.
364        Register NewReg = MRI.createVirtualRegister(RC);
365        MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
366        MachineBasicBlock::iterator At = PredB.getFirstTerminator();
367        const DebugLoc &DL = PredB.findDebugLoc(At);
368        auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
369                      .addReg(RegOp.getReg(), getRegState(RegOp),
370                              RegOp.getSubReg());
371        Slots.insertMachineInstrInMaps(*Copy);
372        RegOp.setReg(NewReg);
373        RegOp.setSubReg(0);
374      }
375    }
376  }
377  
378  /// The SMS algorithm consists of the following main steps:
379  /// 1. Computation and analysis of the dependence graph.
380  /// 2. Ordering of the nodes (instructions).
381  /// 3. Attempt to Schedule the loop.
382  bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
383    assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
384  
385    SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
386                          II_setByPragma);
387  
388    MachineBasicBlock *MBB = L.getHeader();
389    // The kernel should not include any terminator instructions.  These
390    // will be added back later.
391    SMS.startBlock(MBB);
392  
393    // Compute the number of 'real' instructions in the basic block by
394    // ignoring terminators.
395    unsigned size = MBB->size();
396    for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
397                                     E = MBB->instr_end();
398         I != E; ++I, --size)
399      ;
400  
401    SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
402    SMS.schedule();
403    SMS.exitRegion();
404  
405    SMS.finishBlock();
406    return SMS.hasNewSchedule();
407  }
408  
409  void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
410    if (II_setByPragma > 0)
411      MII = II_setByPragma;
412    else
413      MII = std::max(ResMII, RecMII);
414  }
415  
416  void SwingSchedulerDAG::setMAX_II() {
417    if (II_setByPragma > 0)
418      MAX_II = II_setByPragma;
419    else
420      MAX_II = MII + 10;
421  }
422  
423  /// We override the schedule function in ScheduleDAGInstrs to implement the
424  /// scheduling part of the Swing Modulo Scheduling algorithm.
425  void SwingSchedulerDAG::schedule() {
426    AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
427    buildSchedGraph(AA);
428    addLoopCarriedDependences(AA);
429    updatePhiDependences();
430    Topo.InitDAGTopologicalSorting();
431    changeDependences();
432    postprocessDAG();
433    LLVM_DEBUG(dump());
434  
435    NodeSetType NodeSets;
436    findCircuits(NodeSets);
437    NodeSetType Circuits = NodeSets;
438  
439    // Calculate the MII.
440    unsigned ResMII = calculateResMII();
441    unsigned RecMII = calculateRecMII(NodeSets);
442  
443    fuseRecs(NodeSets);
444  
445    // This flag is used for testing and can cause correctness problems.
446    if (SwpIgnoreRecMII)
447      RecMII = 0;
448  
449    setMII(ResMII, RecMII);
450    setMAX_II();
451  
452    LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
453                      << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
454  
455    // Can't schedule a loop without a valid MII.
456    if (MII == 0) {
457      LLVM_DEBUG(
458          dbgs()
459          << "0 is not a valid Minimal Initiation Interval, can NOT schedule\n");
460      NumFailZeroMII++;
461      return;
462    }
463  
464    // Don't pipeline large loops.
465    if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
466      LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
467                        << ", we don't pipleline large loops\n");
468      NumFailLargeMaxMII++;
469      return;
470    }
471  
472    computeNodeFunctions(NodeSets);
473  
474    registerPressureFilter(NodeSets);
475  
476    colocateNodeSets(NodeSets);
477  
478    checkNodeSets(NodeSets);
479  
480    LLVM_DEBUG({
481      for (auto &I : NodeSets) {
482        dbgs() << "  Rec NodeSet ";
483        I.dump();
484      }
485    });
486  
487    llvm::stable_sort(NodeSets, std::greater<NodeSet>());
488  
489    groupRemainingNodes(NodeSets);
490  
491    removeDuplicateNodes(NodeSets);
492  
493    LLVM_DEBUG({
494      for (auto &I : NodeSets) {
495        dbgs() << "  NodeSet ";
496        I.dump();
497      }
498    });
499  
500    computeNodeOrder(NodeSets);
501  
502    // check for node order issues
503    checkValidNodeOrder(Circuits);
504  
505    SMSchedule Schedule(Pass.MF);
506    Scheduled = schedulePipeline(Schedule);
507  
508    if (!Scheduled){
509      LLVM_DEBUG(dbgs() << "No schedule found, return\n");
510      NumFailNoSchedule++;
511      return;
512    }
513  
514    unsigned numStages = Schedule.getMaxStageCount();
515    // No need to generate pipeline if there are no overlapped iterations.
516    if (numStages == 0) {
517      LLVM_DEBUG(
518          dbgs() << "No overlapped iterations, no need to generate pipeline\n");
519      NumFailZeroStage++;
520      return;
521    }
522    // Check that the maximum stage count is less than user-defined limit.
523    if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
524      LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
525                        << " : too many stages, abort\n");
526      NumFailLargeMaxStage++;
527      return;
528    }
529  
530    // Generate the schedule as a ModuloSchedule.
531    DenseMap<MachineInstr *, int> Cycles, Stages;
532    std::vector<MachineInstr *> OrderedInsts;
533    for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
534         ++Cycle) {
535      for (SUnit *SU : Schedule.getInstructions(Cycle)) {
536        OrderedInsts.push_back(SU->getInstr());
537        Cycles[SU->getInstr()] = Cycle;
538        Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
539      }
540    }
541    DenseMap<MachineInstr *, std::pair<unsigned, int64_t>> NewInstrChanges;
542    for (auto &KV : NewMIs) {
543      Cycles[KV.first] = Cycles[KV.second];
544      Stages[KV.first] = Stages[KV.second];
545      NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
546    }
547  
548    ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
549                      std::move(Stages));
550    if (EmitTestAnnotations) {
551      assert(NewInstrChanges.empty() &&
552             "Cannot serialize a schedule with InstrChanges!");
553      ModuloScheduleTestAnnotater MSTI(MF, MS);
554      MSTI.annotate();
555      return;
556    }
557    // The experimental code generator can't work if there are InstChanges.
558    if (ExperimentalCodeGen && NewInstrChanges.empty()) {
559      PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
560      MSE.expand();
561    } else {
562      ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
563      MSE.expand();
564      MSE.cleanup();
565    }
566    ++NumPipelined;
567  }
568  
569  /// Clean up after the software pipeliner runs.
570  void SwingSchedulerDAG::finishBlock() {
571    for (auto &KV : NewMIs)
572      MF.DeleteMachineInstr(KV.second);
573    NewMIs.clear();
574  
575    // Call the superclass.
576    ScheduleDAGInstrs::finishBlock();
577  }
578  
579  /// Return the register values for  the operands of a Phi instruction.
580  /// This function assume the instruction is a Phi.
581  static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
582                         unsigned &InitVal, unsigned &LoopVal) {
583    assert(Phi.isPHI() && "Expecting a Phi.");
584  
585    InitVal = 0;
586    LoopVal = 0;
587    for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
588      if (Phi.getOperand(i + 1).getMBB() != Loop)
589        InitVal = Phi.getOperand(i).getReg();
590      else
591        LoopVal = Phi.getOperand(i).getReg();
592  
593    assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
594  }
595  
596  /// Return the Phi register value that comes the loop block.
597  static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
598    for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
599      if (Phi.getOperand(i + 1).getMBB() == LoopBB)
600        return Phi.getOperand(i).getReg();
601    return 0;
602  }
603  
604  /// Return true if SUb can be reached from SUa following the chain edges.
605  static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
606    SmallPtrSet<SUnit *, 8> Visited;
607    SmallVector<SUnit *, 8> Worklist;
608    Worklist.push_back(SUa);
609    while (!Worklist.empty()) {
610      const SUnit *SU = Worklist.pop_back_val();
611      for (auto &SI : SU->Succs) {
612        SUnit *SuccSU = SI.getSUnit();
613        if (SI.getKind() == SDep::Order) {
614          if (Visited.count(SuccSU))
615            continue;
616          if (SuccSU == SUb)
617            return true;
618          Worklist.push_back(SuccSU);
619          Visited.insert(SuccSU);
620        }
621      }
622    }
623    return false;
624  }
625  
626  /// Return true if the instruction causes a chain between memory
627  /// references before and after it.
628  static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
629    return MI.isCall() || MI.mayRaiseFPException() ||
630           MI.hasUnmodeledSideEffects() ||
631           (MI.hasOrderedMemoryRef() &&
632            (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
633  }
634  
635  /// Return the underlying objects for the memory references of an instruction.
636  /// This function calls the code in ValueTracking, but first checks that the
637  /// instruction has a memory operand.
638  static void getUnderlyingObjects(const MachineInstr *MI,
639                                   SmallVectorImpl<const Value *> &Objs,
640                                   const DataLayout &DL) {
641    if (!MI->hasOneMemOperand())
642      return;
643    MachineMemOperand *MM = *MI->memoperands_begin();
644    if (!MM->getValue())
645      return;
646    GetUnderlyingObjects(MM->getValue(), Objs, DL);
647    for (const Value *V : Objs) {
648      if (!isIdentifiedObject(V)) {
649        Objs.clear();
650        return;
651      }
652      Objs.push_back(V);
653    }
654  }
655  
656  /// Add a chain edge between a load and store if the store can be an
657  /// alias of the load on a subsequent iteration, i.e., a loop carried
658  /// dependence. This code is very similar to the code in ScheduleDAGInstrs
659  /// but that code doesn't create loop carried dependences.
660  void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
661    MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
662    Value *UnknownValue =
663      UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
664    for (auto &SU : SUnits) {
665      MachineInstr &MI = *SU.getInstr();
666      if (isDependenceBarrier(MI, AA))
667        PendingLoads.clear();
668      else if (MI.mayLoad()) {
669        SmallVector<const Value *, 4> Objs;
670        getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
671        if (Objs.empty())
672          Objs.push_back(UnknownValue);
673        for (auto V : Objs) {
674          SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
675          SUs.push_back(&SU);
676        }
677      } else if (MI.mayStore()) {
678        SmallVector<const Value *, 4> Objs;
679        getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
680        if (Objs.empty())
681          Objs.push_back(UnknownValue);
682        for (auto V : Objs) {
683          MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
684              PendingLoads.find(V);
685          if (I == PendingLoads.end())
686            continue;
687          for (auto Load : I->second) {
688            if (isSuccOrder(Load, &SU))
689              continue;
690            MachineInstr &LdMI = *Load->getInstr();
691            // First, perform the cheaper check that compares the base register.
692            // If they are the same and the load offset is less than the store
693            // offset, then mark the dependence as loop carried potentially.
694            const MachineOperand *BaseOp1, *BaseOp2;
695            int64_t Offset1, Offset2;
696            if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1, TRI) &&
697                TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, TRI)) {
698              if (BaseOp1->isIdenticalTo(*BaseOp2) &&
699                  (int)Offset1 < (int)Offset2) {
700                assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
701                       "What happened to the chain edge?");
702                SDep Dep(Load, SDep::Barrier);
703                Dep.setLatency(1);
704                SU.addPred(Dep);
705                continue;
706              }
707            }
708            // Second, the more expensive check that uses alias analysis on the
709            // base registers. If they alias, and the load offset is less than
710            // the store offset, the mark the dependence as loop carried.
711            if (!AA) {
712              SDep Dep(Load, SDep::Barrier);
713              Dep.setLatency(1);
714              SU.addPred(Dep);
715              continue;
716            }
717            MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
718            MachineMemOperand *MMO2 = *MI.memoperands_begin();
719            if (!MMO1->getValue() || !MMO2->getValue()) {
720              SDep Dep(Load, SDep::Barrier);
721              Dep.setLatency(1);
722              SU.addPred(Dep);
723              continue;
724            }
725            if (MMO1->getValue() == MMO2->getValue() &&
726                MMO1->getOffset() <= MMO2->getOffset()) {
727              SDep Dep(Load, SDep::Barrier);
728              Dep.setLatency(1);
729              SU.addPred(Dep);
730              continue;
731            }
732            AliasResult AAResult = AA->alias(
733                MemoryLocation(MMO1->getValue(), LocationSize::unknown(),
734                               MMO1->getAAInfo()),
735                MemoryLocation(MMO2->getValue(), LocationSize::unknown(),
736                               MMO2->getAAInfo()));
737  
738            if (AAResult != NoAlias) {
739              SDep Dep(Load, SDep::Barrier);
740              Dep.setLatency(1);
741              SU.addPred(Dep);
742            }
743          }
744        }
745      }
746    }
747  }
748  
749  /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
750  /// processes dependences for PHIs. This function adds true dependences
751  /// from a PHI to a use, and a loop carried dependence from the use to the
752  /// PHI. The loop carried dependence is represented as an anti dependence
753  /// edge. This function also removes chain dependences between unrelated
754  /// PHIs.
755  void SwingSchedulerDAG::updatePhiDependences() {
756    SmallVector<SDep, 4> RemoveDeps;
757    const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
758  
759    // Iterate over each DAG node.
760    for (SUnit &I : SUnits) {
761      RemoveDeps.clear();
762      // Set to true if the instruction has an operand defined by a Phi.
763      unsigned HasPhiUse = 0;
764      unsigned HasPhiDef = 0;
765      MachineInstr *MI = I.getInstr();
766      // Iterate over each operand, and we process the definitions.
767      for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
768                                      MOE = MI->operands_end();
769           MOI != MOE; ++MOI) {
770        if (!MOI->isReg())
771          continue;
772        Register Reg = MOI->getReg();
773        if (MOI->isDef()) {
774          // If the register is used by a Phi, then create an anti dependence.
775          for (MachineRegisterInfo::use_instr_iterator
776                   UI = MRI.use_instr_begin(Reg),
777                   UE = MRI.use_instr_end();
778               UI != UE; ++UI) {
779            MachineInstr *UseMI = &*UI;
780            SUnit *SU = getSUnit(UseMI);
781            if (SU != nullptr && UseMI->isPHI()) {
782              if (!MI->isPHI()) {
783                SDep Dep(SU, SDep::Anti, Reg);
784                Dep.setLatency(1);
785                I.addPred(Dep);
786              } else {
787                HasPhiDef = Reg;
788                // Add a chain edge to a dependent Phi that isn't an existing
789                // predecessor.
790                if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
791                  I.addPred(SDep(SU, SDep::Barrier));
792              }
793            }
794          }
795        } else if (MOI->isUse()) {
796          // If the register is defined by a Phi, then create a true dependence.
797          MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
798          if (DefMI == nullptr)
799            continue;
800          SUnit *SU = getSUnit(DefMI);
801          if (SU != nullptr && DefMI->isPHI()) {
802            if (!MI->isPHI()) {
803              SDep Dep(SU, SDep::Data, Reg);
804              Dep.setLatency(0);
805              ST.adjustSchedDependency(SU, &I, Dep);
806              I.addPred(Dep);
807            } else {
808              HasPhiUse = Reg;
809              // Add a chain edge to a dependent Phi that isn't an existing
810              // predecessor.
811              if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
812                I.addPred(SDep(SU, SDep::Barrier));
813            }
814          }
815        }
816      }
817      // Remove order dependences from an unrelated Phi.
818      if (!SwpPruneDeps)
819        continue;
820      for (auto &PI : I.Preds) {
821        MachineInstr *PMI = PI.getSUnit()->getInstr();
822        if (PMI->isPHI() && PI.getKind() == SDep::Order) {
823          if (I.getInstr()->isPHI()) {
824            if (PMI->getOperand(0).getReg() == HasPhiUse)
825              continue;
826            if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
827              continue;
828          }
829          RemoveDeps.push_back(PI);
830        }
831      }
832      for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
833        I.removePred(RemoveDeps[i]);
834    }
835  }
836  
837  /// Iterate over each DAG node and see if we can change any dependences
838  /// in order to reduce the recurrence MII.
839  void SwingSchedulerDAG::changeDependences() {
840    // See if an instruction can use a value from the previous iteration.
841    // If so, we update the base and offset of the instruction and change
842    // the dependences.
843    for (SUnit &I : SUnits) {
844      unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
845      int64_t NewOffset = 0;
846      if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
847                                 NewOffset))
848        continue;
849  
850      // Get the MI and SUnit for the instruction that defines the original base.
851      Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
852      MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
853      if (!DefMI)
854        continue;
855      SUnit *DefSU = getSUnit(DefMI);
856      if (!DefSU)
857        continue;
858      // Get the MI and SUnit for the instruction that defins the new base.
859      MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
860      if (!LastMI)
861        continue;
862      SUnit *LastSU = getSUnit(LastMI);
863      if (!LastSU)
864        continue;
865  
866      if (Topo.IsReachable(&I, LastSU))
867        continue;
868  
869      // Remove the dependence. The value now depends on a prior iteration.
870      SmallVector<SDep, 4> Deps;
871      for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;
872           ++P)
873        if (P->getSUnit() == DefSU)
874          Deps.push_back(*P);
875      for (int i = 0, e = Deps.size(); i != e; i++) {
876        Topo.RemovePred(&I, Deps[i].getSUnit());
877        I.removePred(Deps[i]);
878      }
879      // Remove the chain dependence between the instructions.
880      Deps.clear();
881      for (auto &P : LastSU->Preds)
882        if (P.getSUnit() == &I && P.getKind() == SDep::Order)
883          Deps.push_back(P);
884      for (int i = 0, e = Deps.size(); i != e; i++) {
885        Topo.RemovePred(LastSU, Deps[i].getSUnit());
886        LastSU->removePred(Deps[i]);
887      }
888  
889      // Add a dependence between the new instruction and the instruction
890      // that defines the new base.
891      SDep Dep(&I, SDep::Anti, NewBase);
892      Topo.AddPred(LastSU, &I);
893      LastSU->addPred(Dep);
894  
895      // Remember the base and offset information so that we can update the
896      // instruction during code generation.
897      InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
898    }
899  }
900  
901  namespace {
902  
903  // FuncUnitSorter - Comparison operator used to sort instructions by
904  // the number of functional unit choices.
905  struct FuncUnitSorter {
906    const InstrItineraryData *InstrItins;
907    const MCSubtargetInfo *STI;
908    DenseMap<unsigned, unsigned> Resources;
909  
910    FuncUnitSorter(const TargetSubtargetInfo &TSI)
911        : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
912  
913    // Compute the number of functional unit alternatives needed
914    // at each stage, and take the minimum value. We prioritize the
915    // instructions by the least number of choices first.
916    unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
917      unsigned SchedClass = Inst->getDesc().getSchedClass();
918      unsigned min = UINT_MAX;
919      if (InstrItins && !InstrItins->isEmpty()) {
920        for (const InstrStage &IS :
921             make_range(InstrItins->beginStage(SchedClass),
922                        InstrItins->endStage(SchedClass))) {
923          unsigned funcUnits = IS.getUnits();
924          unsigned numAlternatives = countPopulation(funcUnits);
925          if (numAlternatives < min) {
926            min = numAlternatives;
927            F = funcUnits;
928          }
929        }
930        return min;
931      }
932      if (STI && STI->getSchedModel().hasInstrSchedModel()) {
933        const MCSchedClassDesc *SCDesc =
934            STI->getSchedModel().getSchedClassDesc(SchedClass);
935        if (!SCDesc->isValid())
936          // No valid Schedule Class Desc for schedClass, should be
937          // Pseudo/PostRAPseudo
938          return min;
939  
940        for (const MCWriteProcResEntry &PRE :
941             make_range(STI->getWriteProcResBegin(SCDesc),
942                        STI->getWriteProcResEnd(SCDesc))) {
943          if (!PRE.Cycles)
944            continue;
945          const MCProcResourceDesc *ProcResource =
946              STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
947          unsigned NumUnits = ProcResource->NumUnits;
948          if (NumUnits < min) {
949            min = NumUnits;
950            F = PRE.ProcResourceIdx;
951          }
952        }
953        return min;
954      }
955      llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
956    }
957  
958    // Compute the critical resources needed by the instruction. This
959    // function records the functional units needed by instructions that
960    // must use only one functional unit. We use this as a tie breaker
961    // for computing the resource MII. The instrutions that require
962    // the same, highly used, functional unit have high priority.
963    void calcCriticalResources(MachineInstr &MI) {
964      unsigned SchedClass = MI.getDesc().getSchedClass();
965      if (InstrItins && !InstrItins->isEmpty()) {
966        for (const InstrStage &IS :
967             make_range(InstrItins->beginStage(SchedClass),
968                        InstrItins->endStage(SchedClass))) {
969          unsigned FuncUnits = IS.getUnits();
970          if (countPopulation(FuncUnits) == 1)
971            Resources[FuncUnits]++;
972        }
973        return;
974      }
975      if (STI && STI->getSchedModel().hasInstrSchedModel()) {
976        const MCSchedClassDesc *SCDesc =
977            STI->getSchedModel().getSchedClassDesc(SchedClass);
978        if (!SCDesc->isValid())
979          // No valid Schedule Class Desc for schedClass, should be
980          // Pseudo/PostRAPseudo
981          return;
982  
983        for (const MCWriteProcResEntry &PRE :
984             make_range(STI->getWriteProcResBegin(SCDesc),
985                        STI->getWriteProcResEnd(SCDesc))) {
986          if (!PRE.Cycles)
987            continue;
988          Resources[PRE.ProcResourceIdx]++;
989        }
990        return;
991      }
992      llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
993    }
994  
995    /// Return true if IS1 has less priority than IS2.
996    bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
997      unsigned F1 = 0, F2 = 0;
998      unsigned MFUs1 = minFuncUnits(IS1, F1);
999      unsigned MFUs2 = minFuncUnits(IS2, F2);
1000      if (MFUs1 == MFUs2)
1001        return Resources.lookup(F1) < Resources.lookup(F2);
1002      return MFUs1 > MFUs2;
1003    }
1004  };
1005  
1006  } // end anonymous namespace
1007  
1008  /// Calculate the resource constrained minimum initiation interval for the
1009  /// specified loop. We use the DFA to model the resources needed for
1010  /// each instruction, and we ignore dependences. A different DFA is created
1011  /// for each cycle that is required. When adding a new instruction, we attempt
1012  /// to add it to each existing DFA, until a legal space is found. If the
1013  /// instruction cannot be reserved in an existing DFA, we create a new one.
1014  unsigned SwingSchedulerDAG::calculateResMII() {
1015  
1016    LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1017    SmallVector<ResourceManager*, 8> Resources;
1018    MachineBasicBlock *MBB = Loop.getHeader();
1019    Resources.push_back(new ResourceManager(&MF.getSubtarget()));
1020  
1021    // Sort the instructions by the number of available choices for scheduling,
1022    // least to most. Use the number of critical resources as the tie breaker.
1023    FuncUnitSorter FUS = FuncUnitSorter(MF.getSubtarget());
1024    for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1025                                     E = MBB->getFirstTerminator();
1026         I != E; ++I)
1027      FUS.calcCriticalResources(*I);
1028    PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
1029        FuncUnitOrder(FUS);
1030  
1031    for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1032                                     E = MBB->getFirstTerminator();
1033         I != E; ++I)
1034      FuncUnitOrder.push(&*I);
1035  
1036    while (!FuncUnitOrder.empty()) {
1037      MachineInstr *MI = FuncUnitOrder.top();
1038      FuncUnitOrder.pop();
1039      if (TII->isZeroCost(MI->getOpcode()))
1040        continue;
1041      // Attempt to reserve the instruction in an existing DFA. At least one
1042      // DFA is needed for each cycle.
1043      unsigned NumCycles = getSUnit(MI)->Latency;
1044      unsigned ReservedCycles = 0;
1045      SmallVectorImpl<ResourceManager *>::iterator RI = Resources.begin();
1046      SmallVectorImpl<ResourceManager *>::iterator RE = Resources.end();
1047      LLVM_DEBUG({
1048        dbgs() << "Trying to reserve resource for " << NumCycles
1049               << " cycles for \n";
1050        MI->dump();
1051      });
1052      for (unsigned C = 0; C < NumCycles; ++C)
1053        while (RI != RE) {
1054          if ((*RI)->canReserveResources(*MI)) {
1055            (*RI)->reserveResources(*MI);
1056            ++ReservedCycles;
1057            break;
1058          }
1059          RI++;
1060        }
1061      LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
1062                        << ", NumCycles:" << NumCycles << "\n");
1063      // Add new DFAs, if needed, to reserve resources.
1064      for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
1065        LLVM_DEBUG(if (SwpDebugResource) dbgs()
1066                   << "NewResource created to reserve resources"
1067                   << "\n");
1068        ResourceManager *NewResource = new ResourceManager(&MF.getSubtarget());
1069        assert(NewResource->canReserveResources(*MI) && "Reserve error.");
1070        NewResource->reserveResources(*MI);
1071        Resources.push_back(NewResource);
1072      }
1073    }
1074    int Resmii = Resources.size();
1075    LLVM_DEBUG(dbgs() << "Retrun Res MII:" << Resmii << "\n");
1076    // Delete the memory for each of the DFAs that were created earlier.
1077    for (ResourceManager *RI : Resources) {
1078      ResourceManager *D = RI;
1079      delete D;
1080    }
1081    Resources.clear();
1082    return Resmii;
1083  }
1084  
1085  /// Calculate the recurrence-constrainted minimum initiation interval.
1086  /// Iterate over each circuit.  Compute the delay(c) and distance(c)
1087  /// for each circuit. The II needs to satisfy the inequality
1088  /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1089  /// II that satisfies the inequality, and the RecMII is the maximum
1090  /// of those values.
1091  unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1092    unsigned RecMII = 0;
1093  
1094    for (NodeSet &Nodes : NodeSets) {
1095      if (Nodes.empty())
1096        continue;
1097  
1098      unsigned Delay = Nodes.getLatency();
1099      unsigned Distance = 1;
1100  
1101      // ii = ceil(delay / distance)
1102      unsigned CurMII = (Delay + Distance - 1) / Distance;
1103      Nodes.setRecMII(CurMII);
1104      if (CurMII > RecMII)
1105        RecMII = CurMII;
1106    }
1107  
1108    return RecMII;
1109  }
1110  
1111  /// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1112  /// but we do this to find the circuits, and then change them back.
1113  static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1114    SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
1115    for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1116      SUnit *SU = &SUnits[i];
1117      for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();
1118           IP != EP; ++IP) {
1119        if (IP->getKind() != SDep::Anti)
1120          continue;
1121        DepsAdded.push_back(std::make_pair(SU, *IP));
1122      }
1123    }
1124    for (SmallVector<std::pair<SUnit *, SDep>, 8>::iterator I = DepsAdded.begin(),
1125                                                            E = DepsAdded.end();
1126         I != E; ++I) {
1127      // Remove this anti dependency and add one in the reverse direction.
1128      SUnit *SU = I->first;
1129      SDep &D = I->second;
1130      SUnit *TargetSU = D.getSUnit();
1131      unsigned Reg = D.getReg();
1132      unsigned Lat = D.getLatency();
1133      SU->removePred(D);
1134      SDep Dep(SU, SDep::Anti, Reg);
1135      Dep.setLatency(Lat);
1136      TargetSU->addPred(Dep);
1137    }
1138  }
1139  
1140  /// Create the adjacency structure of the nodes in the graph.
1141  void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1142      SwingSchedulerDAG *DAG) {
1143    BitVector Added(SUnits.size());
1144    DenseMap<int, int> OutputDeps;
1145    for (int i = 0, e = SUnits.size(); i != e; ++i) {
1146      Added.reset();
1147      // Add any successor to the adjacency matrix and exclude duplicates.
1148      for (auto &SI : SUnits[i].Succs) {
1149        // Only create a back-edge on the first and last nodes of a dependence
1150        // chain. This records any chains and adds them later.
1151        if (SI.getKind() == SDep::Output) {
1152          int N = SI.getSUnit()->NodeNum;
1153          int BackEdge = i;
1154          auto Dep = OutputDeps.find(BackEdge);
1155          if (Dep != OutputDeps.end()) {
1156            BackEdge = Dep->second;
1157            OutputDeps.erase(Dep);
1158          }
1159          OutputDeps[N] = BackEdge;
1160        }
1161        // Do not process a boundary node, an artificial node.
1162        // A back-edge is processed only if it goes to a Phi.
1163        if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
1164            (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1165          continue;
1166        int N = SI.getSUnit()->NodeNum;
1167        if (!Added.test(N)) {
1168          AdjK[i].push_back(N);
1169          Added.set(N);
1170        }
1171      }
1172      // A chain edge between a store and a load is treated as a back-edge in the
1173      // adjacency matrix.
1174      for (auto &PI : SUnits[i].Preds) {
1175        if (!SUnits[i].getInstr()->mayStore() ||
1176            !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1177          continue;
1178        if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1179          int N = PI.getSUnit()->NodeNum;
1180          if (!Added.test(N)) {
1181            AdjK[i].push_back(N);
1182            Added.set(N);
1183          }
1184        }
1185      }
1186    }
1187    // Add back-edges in the adjacency matrix for the output dependences.
1188    for (auto &OD : OutputDeps)
1189      if (!Added.test(OD.second)) {
1190        AdjK[OD.first].push_back(OD.second);
1191        Added.set(OD.second);
1192      }
1193  }
1194  
1195  /// Identify an elementary circuit in the dependence graph starting at the
1196  /// specified node.
1197  bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1198                                            bool HasBackedge) {
1199    SUnit *SV = &SUnits[V];
1200    bool F = false;
1201    Stack.insert(SV);
1202    Blocked.set(V);
1203  
1204    for (auto W : AdjK[V]) {
1205      if (NumPaths > MaxPaths)
1206        break;
1207      if (W < S)
1208        continue;
1209      if (W == S) {
1210        if (!HasBackedge)
1211          NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1212        F = true;
1213        ++NumPaths;
1214        break;
1215      } else if (!Blocked.test(W)) {
1216        if (circuit(W, S, NodeSets,
1217                    Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1218          F = true;
1219      }
1220    }
1221  
1222    if (F)
1223      unblock(V);
1224    else {
1225      for (auto W : AdjK[V]) {
1226        if (W < S)
1227          continue;
1228        if (B[W].count(SV) == 0)
1229          B[W].insert(SV);
1230      }
1231    }
1232    Stack.pop_back();
1233    return F;
1234  }
1235  
1236  /// Unblock a node in the circuit finding algorithm.
1237  void SwingSchedulerDAG::Circuits::unblock(int U) {
1238    Blocked.reset(U);
1239    SmallPtrSet<SUnit *, 4> &BU = B[U];
1240    while (!BU.empty()) {
1241      SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
1242      assert(SI != BU.end() && "Invalid B set.");
1243      SUnit *W = *SI;
1244      BU.erase(W);
1245      if (Blocked.test(W->NodeNum))
1246        unblock(W->NodeNum);
1247    }
1248  }
1249  
1250  /// Identify all the elementary circuits in the dependence graph using
1251  /// Johnson's circuit algorithm.
1252  void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1253    // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1254    // but we do this to find the circuits, and then change them back.
1255    swapAntiDependences(SUnits);
1256  
1257    Circuits Cir(SUnits, Topo);
1258    // Create the adjacency structure.
1259    Cir.createAdjacencyStructure(this);
1260    for (int i = 0, e = SUnits.size(); i != e; ++i) {
1261      Cir.reset();
1262      Cir.circuit(i, i, NodeSets);
1263    }
1264  
1265    // Change the dependences back so that we've created a DAG again.
1266    swapAntiDependences(SUnits);
1267  }
1268  
1269  // Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1270  // is loop-carried to the USE in next iteration. This will help pipeliner avoid
1271  // additional copies that are needed across iterations. An artificial dependence
1272  // edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1273  
1274  // PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1275  // SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1276  // PHI-------True-Dep------> USEOfPhi
1277  
1278  // The mutation creates
1279  // USEOfPHI -------Artificial-Dep---> SRCOfCopy
1280  
1281  // This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1282  // (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1283  // late  to avoid additional copies across iterations. The possible scheduling
1284  // order would be
1285  // USEOfPHI --- SRCOfCopy---  COPY/REG_SEQUENCE.
1286  
1287  void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1288    for (SUnit &SU : DAG->SUnits) {
1289      // Find the COPY/REG_SEQUENCE instruction.
1290      if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1291        continue;
1292  
1293      // Record the loop carried PHIs.
1294      SmallVector<SUnit *, 4> PHISUs;
1295      // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1296      SmallVector<SUnit *, 4> SrcSUs;
1297  
1298      for (auto &Dep : SU.Preds) {
1299        SUnit *TmpSU = Dep.getSUnit();
1300        MachineInstr *TmpMI = TmpSU->getInstr();
1301        SDep::Kind DepKind = Dep.getKind();
1302        // Save the loop carried PHI.
1303        if (DepKind == SDep::Anti && TmpMI->isPHI())
1304          PHISUs.push_back(TmpSU);
1305        // Save the source of COPY/REG_SEQUENCE.
1306        // If the source has no pre-decessors, we will end up creating cycles.
1307        else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1308          SrcSUs.push_back(TmpSU);
1309      }
1310  
1311      if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1312        continue;
1313  
1314      // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1315      // SUnit to the container.
1316      SmallVector<SUnit *, 8> UseSUs;
1317      // Do not use iterator based loop here as we are updating the container.
1318      for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
1319        for (auto &Dep : PHISUs[Index]->Succs) {
1320          if (Dep.getKind() != SDep::Data)
1321            continue;
1322  
1323          SUnit *TmpSU = Dep.getSUnit();
1324          MachineInstr *TmpMI = TmpSU->getInstr();
1325          if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1326            PHISUs.push_back(TmpSU);
1327            continue;
1328          }
1329          UseSUs.push_back(TmpSU);
1330        }
1331      }
1332  
1333      if (UseSUs.size() == 0)
1334        continue;
1335  
1336      SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1337      // Add the artificial dependencies if it does not form a cycle.
1338      for (auto I : UseSUs) {
1339        for (auto Src : SrcSUs) {
1340          if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1341            Src->addPred(SDep(I, SDep::Artificial));
1342            SDAG->Topo.AddPred(Src, I);
1343          }
1344        }
1345      }
1346    }
1347  }
1348  
1349  /// Return true for DAG nodes that we ignore when computing the cost functions.
1350  /// We ignore the back-edge recurrence in order to avoid unbounded recursion
1351  /// in the calculation of the ASAP, ALAP, etc functions.
1352  static bool ignoreDependence(const SDep &D, bool isPred) {
1353    if (D.isArtificial())
1354      return true;
1355    return D.getKind() == SDep::Anti && isPred;
1356  }
1357  
1358  /// Compute several functions need to order the nodes for scheduling.
1359  ///  ASAP - Earliest time to schedule a node.
1360  ///  ALAP - Latest time to schedule a node.
1361  ///  MOV - Mobility function, difference between ALAP and ASAP.
1362  ///  D - Depth of each node.
1363  ///  H - Height of each node.
1364  void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1365    ScheduleInfo.resize(SUnits.size());
1366  
1367    LLVM_DEBUG({
1368      for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1369                                                      E = Topo.end();
1370           I != E; ++I) {
1371        const SUnit &SU = SUnits[*I];
1372        dumpNode(SU);
1373      }
1374    });
1375  
1376    int maxASAP = 0;
1377    // Compute ASAP and ZeroLatencyDepth.
1378    for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1379                                                    E = Topo.end();
1380         I != E; ++I) {
1381      int asap = 0;
1382      int zeroLatencyDepth = 0;
1383      SUnit *SU = &SUnits[*I];
1384      for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
1385                                      EP = SU->Preds.end();
1386           IP != EP; ++IP) {
1387        SUnit *pred = IP->getSUnit();
1388        if (IP->getLatency() == 0)
1389          zeroLatencyDepth =
1390              std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
1391        if (ignoreDependence(*IP, true))
1392          continue;
1393        asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
1394                                    getDistance(pred, SU, *IP) * MII));
1395      }
1396      maxASAP = std::max(maxASAP, asap);
1397      ScheduleInfo[*I].ASAP = asap;
1398      ScheduleInfo[*I].ZeroLatencyDepth = zeroLatencyDepth;
1399    }
1400  
1401    // Compute ALAP, ZeroLatencyHeight, and MOV.
1402    for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
1403                                                            E = Topo.rend();
1404         I != E; ++I) {
1405      int alap = maxASAP;
1406      int zeroLatencyHeight = 0;
1407      SUnit *SU = &SUnits[*I];
1408      for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
1409                                      ES = SU->Succs.end();
1410           IS != ES; ++IS) {
1411        SUnit *succ = IS->getSUnit();
1412        if (IS->getLatency() == 0)
1413          zeroLatencyHeight =
1414              std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
1415        if (ignoreDependence(*IS, true))
1416          continue;
1417        alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
1418                                    getDistance(SU, succ, *IS) * MII));
1419      }
1420  
1421      ScheduleInfo[*I].ALAP = alap;
1422      ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight;
1423    }
1424  
1425    // After computing the node functions, compute the summary for each node set.
1426    for (NodeSet &I : NodeSets)
1427      I.computeNodeSetInfo(this);
1428  
1429    LLVM_DEBUG({
1430      for (unsigned i = 0; i < SUnits.size(); i++) {
1431        dbgs() << "\tNode " << i << ":\n";
1432        dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
1433        dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
1434        dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
1435        dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
1436        dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
1437        dbgs() << "\t   ZLD  = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1438        dbgs() << "\t   ZLH  = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1439      }
1440    });
1441  }
1442  
1443  /// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1444  /// as the predecessors of the elements of NodeOrder that are not also in
1445  /// NodeOrder.
1446  static bool pred_L(SetVector<SUnit *> &NodeOrder,
1447                     SmallSetVector<SUnit *, 8> &Preds,
1448                     const NodeSet *S = nullptr) {
1449    Preds.clear();
1450    for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1451         I != E; ++I) {
1452      for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();
1453           PI != PE; ++PI) {
1454        if (S && S->count(PI->getSUnit()) == 0)
1455          continue;
1456        if (ignoreDependence(*PI, true))
1457          continue;
1458        if (NodeOrder.count(PI->getSUnit()) == 0)
1459          Preds.insert(PI->getSUnit());
1460      }
1461      // Back-edges are predecessors with an anti-dependence.
1462      for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
1463                                      ES = (*I)->Succs.end();
1464           IS != ES; ++IS) {
1465        if (IS->getKind() != SDep::Anti)
1466          continue;
1467        if (S && S->count(IS->getSUnit()) == 0)
1468          continue;
1469        if (NodeOrder.count(IS->getSUnit()) == 0)
1470          Preds.insert(IS->getSUnit());
1471      }
1472    }
1473    return !Preds.empty();
1474  }
1475  
1476  /// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1477  /// as the successors of the elements of NodeOrder that are not also in
1478  /// NodeOrder.
1479  static bool succ_L(SetVector<SUnit *> &NodeOrder,
1480                     SmallSetVector<SUnit *, 8> &Succs,
1481                     const NodeSet *S = nullptr) {
1482    Succs.clear();
1483    for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1484         I != E; ++I) {
1485      for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();
1486           SI != SE; ++SI) {
1487        if (S && S->count(SI->getSUnit()) == 0)
1488          continue;
1489        if (ignoreDependence(*SI, false))
1490          continue;
1491        if (NodeOrder.count(SI->getSUnit()) == 0)
1492          Succs.insert(SI->getSUnit());
1493      }
1494      for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
1495                                      PE = (*I)->Preds.end();
1496           PI != PE; ++PI) {
1497        if (PI->getKind() != SDep::Anti)
1498          continue;
1499        if (S && S->count(PI->getSUnit()) == 0)
1500          continue;
1501        if (NodeOrder.count(PI->getSUnit()) == 0)
1502          Succs.insert(PI->getSUnit());
1503      }
1504    }
1505    return !Succs.empty();
1506  }
1507  
1508  /// Return true if there is a path from the specified node to any of the nodes
1509  /// in DestNodes. Keep track and return the nodes in any path.
1510  static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1511                          SetVector<SUnit *> &DestNodes,
1512                          SetVector<SUnit *> &Exclude,
1513                          SmallPtrSet<SUnit *, 8> &Visited) {
1514    if (Cur->isBoundaryNode())
1515      return false;
1516    if (Exclude.count(Cur) != 0)
1517      return false;
1518    if (DestNodes.count(Cur) != 0)
1519      return true;
1520    if (!Visited.insert(Cur).second)
1521      return Path.count(Cur) != 0;
1522    bool FoundPath = false;
1523    for (auto &SI : Cur->Succs)
1524      FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1525    for (auto &PI : Cur->Preds)
1526      if (PI.getKind() == SDep::Anti)
1527        FoundPath |=
1528            computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1529    if (FoundPath)
1530      Path.insert(Cur);
1531    return FoundPath;
1532  }
1533  
1534  /// Return true if Set1 is a subset of Set2.
1535  template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
1536    for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); I != E; ++I)
1537      if (Set2.count(*I) == 0)
1538        return false;
1539    return true;
1540  }
1541  
1542  /// Compute the live-out registers for the instructions in a node-set.
1543  /// The live-out registers are those that are defined in the node-set,
1544  /// but not used. Except for use operands of Phis.
1545  static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
1546                              NodeSet &NS) {
1547    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1548    MachineRegisterInfo &MRI = MF.getRegInfo();
1549    SmallVector<RegisterMaskPair, 8> LiveOutRegs;
1550    SmallSet<unsigned, 4> Uses;
1551    for (SUnit *SU : NS) {
1552      const MachineInstr *MI = SU->getInstr();
1553      if (MI->isPHI())
1554        continue;
1555      for (const MachineOperand &MO : MI->operands())
1556        if (MO.isReg() && MO.isUse()) {
1557          Register Reg = MO.getReg();
1558          if (Register::isVirtualRegister(Reg))
1559            Uses.insert(Reg);
1560          else if (MRI.isAllocatable(Reg))
1561            for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1562              Uses.insert(*Units);
1563        }
1564    }
1565    for (SUnit *SU : NS)
1566      for (const MachineOperand &MO : SU->getInstr()->operands())
1567        if (MO.isReg() && MO.isDef() && !MO.isDead()) {
1568          Register Reg = MO.getReg();
1569          if (Register::isVirtualRegister(Reg)) {
1570            if (!Uses.count(Reg))
1571              LiveOutRegs.push_back(RegisterMaskPair(Reg,
1572                                                     LaneBitmask::getNone()));
1573          } else if (MRI.isAllocatable(Reg)) {
1574            for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1575              if (!Uses.count(*Units))
1576                LiveOutRegs.push_back(RegisterMaskPair(*Units,
1577                                                       LaneBitmask::getNone()));
1578          }
1579        }
1580    RPTracker.addLiveRegs(LiveOutRegs);
1581  }
1582  
1583  /// A heuristic to filter nodes in recurrent node-sets if the register
1584  /// pressure of a set is too high.
1585  void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
1586    for (auto &NS : NodeSets) {
1587      // Skip small node-sets since they won't cause register pressure problems.
1588      if (NS.size() <= 2)
1589        continue;
1590      IntervalPressure RecRegPressure;
1591      RegPressureTracker RecRPTracker(RecRegPressure);
1592      RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
1593      computeLiveOuts(MF, RecRPTracker, NS);
1594      RecRPTracker.closeBottom();
1595  
1596      std::vector<SUnit *> SUnits(NS.begin(), NS.end());
1597      llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
1598        return A->NodeNum > B->NodeNum;
1599      });
1600  
1601      for (auto &SU : SUnits) {
1602        // Since we're computing the register pressure for a subset of the
1603        // instructions in a block, we need to set the tracker for each
1604        // instruction in the node-set. The tracker is set to the instruction
1605        // just after the one we're interested in.
1606        MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
1607        RecRPTracker.setPos(std::next(CurInstI));
1608  
1609        RegPressureDelta RPDelta;
1610        ArrayRef<PressureChange> CriticalPSets;
1611        RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
1612                                               CriticalPSets,
1613                                               RecRegPressure.MaxSetPressure);
1614        if (RPDelta.Excess.isValid()) {
1615          LLVM_DEBUG(
1616              dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
1617                     << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
1618                     << ":" << RPDelta.Excess.getUnitInc());
1619          NS.setExceedPressure(SU);
1620          break;
1621        }
1622        RecRPTracker.recede();
1623      }
1624    }
1625  }
1626  
1627  /// A heuristic to colocate node sets that have the same set of
1628  /// successors.
1629  void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
1630    unsigned Colocate = 0;
1631    for (int i = 0, e = NodeSets.size(); i < e; ++i) {
1632      NodeSet &N1 = NodeSets[i];
1633      SmallSetVector<SUnit *, 8> S1;
1634      if (N1.empty() || !succ_L(N1, S1))
1635        continue;
1636      for (int j = i + 1; j < e; ++j) {
1637        NodeSet &N2 = NodeSets[j];
1638        if (N1.compareRecMII(N2) != 0)
1639          continue;
1640        SmallSetVector<SUnit *, 8> S2;
1641        if (N2.empty() || !succ_L(N2, S2))
1642          continue;
1643        if (isSubset(S1, S2) && S1.size() == S2.size()) {
1644          N1.setColocate(++Colocate);
1645          N2.setColocate(Colocate);
1646          break;
1647        }
1648      }
1649    }
1650  }
1651  
1652  /// Check if the existing node-sets are profitable. If not, then ignore the
1653  /// recurrent node-sets, and attempt to schedule all nodes together. This is
1654  /// a heuristic. If the MII is large and all the recurrent node-sets are small,
1655  /// then it's best to try to schedule all instructions together instead of
1656  /// starting with the recurrent node-sets.
1657  void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
1658    // Look for loops with a large MII.
1659    if (MII < 17)
1660      return;
1661    // Check if the node-set contains only a simple add recurrence.
1662    for (auto &NS : NodeSets) {
1663      if (NS.getRecMII() > 2)
1664        return;
1665      if (NS.getMaxDepth() > MII)
1666        return;
1667    }
1668    NodeSets.clear();
1669    LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
1670    return;
1671  }
1672  
1673  /// Add the nodes that do not belong to a recurrence set into groups
1674  /// based upon connected componenets.
1675  void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
1676    SetVector<SUnit *> NodesAdded;
1677    SmallPtrSet<SUnit *, 8> Visited;
1678    // Add the nodes that are on a path between the previous node sets and
1679    // the current node set.
1680    for (NodeSet &I : NodeSets) {
1681      SmallSetVector<SUnit *, 8> N;
1682      // Add the nodes from the current node set to the previous node set.
1683      if (succ_L(I, N)) {
1684        SetVector<SUnit *> Path;
1685        for (SUnit *NI : N) {
1686          Visited.clear();
1687          computePath(NI, Path, NodesAdded, I, Visited);
1688        }
1689        if (!Path.empty())
1690          I.insert(Path.begin(), Path.end());
1691      }
1692      // Add the nodes from the previous node set to the current node set.
1693      N.clear();
1694      if (succ_L(NodesAdded, N)) {
1695        SetVector<SUnit *> Path;
1696        for (SUnit *NI : N) {
1697          Visited.clear();
1698          computePath(NI, Path, I, NodesAdded, Visited);
1699        }
1700        if (!Path.empty())
1701          I.insert(Path.begin(), Path.end());
1702      }
1703      NodesAdded.insert(I.begin(), I.end());
1704    }
1705  
1706    // Create a new node set with the connected nodes of any successor of a node
1707    // in a recurrent set.
1708    NodeSet NewSet;
1709    SmallSetVector<SUnit *, 8> N;
1710    if (succ_L(NodesAdded, N))
1711      for (SUnit *I : N)
1712        addConnectedNodes(I, NewSet, NodesAdded);
1713    if (!NewSet.empty())
1714      NodeSets.push_back(NewSet);
1715  
1716    // Create a new node set with the connected nodes of any predecessor of a node
1717    // in a recurrent set.
1718    NewSet.clear();
1719    if (pred_L(NodesAdded, N))
1720      for (SUnit *I : N)
1721        addConnectedNodes(I, NewSet, NodesAdded);
1722    if (!NewSet.empty())
1723      NodeSets.push_back(NewSet);
1724  
1725    // Create new nodes sets with the connected nodes any remaining node that
1726    // has no predecessor.
1727    for (unsigned i = 0; i < SUnits.size(); ++i) {
1728      SUnit *SU = &SUnits[i];
1729      if (NodesAdded.count(SU) == 0) {
1730        NewSet.clear();
1731        addConnectedNodes(SU, NewSet, NodesAdded);
1732        if (!NewSet.empty())
1733          NodeSets.push_back(NewSet);
1734      }
1735    }
1736  }
1737  
1738  /// Add the node to the set, and add all of its connected nodes to the set.
1739  void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
1740                                            SetVector<SUnit *> &NodesAdded) {
1741    NewSet.insert(SU);
1742    NodesAdded.insert(SU);
1743    for (auto &SI : SU->Succs) {
1744      SUnit *Successor = SI.getSUnit();
1745      if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
1746        addConnectedNodes(Successor, NewSet, NodesAdded);
1747    }
1748    for (auto &PI : SU->Preds) {
1749      SUnit *Predecessor = PI.getSUnit();
1750      if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
1751        addConnectedNodes(Predecessor, NewSet, NodesAdded);
1752    }
1753  }
1754  
1755  /// Return true if Set1 contains elements in Set2. The elements in common
1756  /// are returned in a different container.
1757  static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
1758                          SmallSetVector<SUnit *, 8> &Result) {
1759    Result.clear();
1760    for (unsigned i = 0, e = Set1.size(); i != e; ++i) {
1761      SUnit *SU = Set1[i];
1762      if (Set2.count(SU) != 0)
1763        Result.insert(SU);
1764    }
1765    return !Result.empty();
1766  }
1767  
1768  /// Merge the recurrence node sets that have the same initial node.
1769  void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
1770    for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1771         ++I) {
1772      NodeSet &NI = *I;
1773      for (NodeSetType::iterator J = I + 1; J != E;) {
1774        NodeSet &NJ = *J;
1775        if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
1776          if (NJ.compareRecMII(NI) > 0)
1777            NI.setRecMII(NJ.getRecMII());
1778          for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
1779               ++NII)
1780            I->insert(*NII);
1781          NodeSets.erase(J);
1782          E = NodeSets.end();
1783        } else {
1784          ++J;
1785        }
1786      }
1787    }
1788  }
1789  
1790  /// Remove nodes that have been scheduled in previous NodeSets.
1791  void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
1792    for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1793         ++I)
1794      for (NodeSetType::iterator J = I + 1; J != E;) {
1795        J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
1796  
1797        if (J->empty()) {
1798          NodeSets.erase(J);
1799          E = NodeSets.end();
1800        } else {
1801          ++J;
1802        }
1803      }
1804  }
1805  
1806  /// Compute an ordered list of the dependence graph nodes, which
1807  /// indicates the order that the nodes will be scheduled.  This is a
1808  /// two-level algorithm. First, a partial order is created, which
1809  /// consists of a list of sets ordered from highest to lowest priority.
1810  void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
1811    SmallSetVector<SUnit *, 8> R;
1812    NodeOrder.clear();
1813  
1814    for (auto &Nodes : NodeSets) {
1815      LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
1816      OrderKind Order;
1817      SmallSetVector<SUnit *, 8> N;
1818      if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
1819        R.insert(N.begin(), N.end());
1820        Order = BottomUp;
1821        LLVM_DEBUG(dbgs() << "  Bottom up (preds) ");
1822      } else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
1823        R.insert(N.begin(), N.end());
1824        Order = TopDown;
1825        LLVM_DEBUG(dbgs() << "  Top down (succs) ");
1826      } else if (isIntersect(N, Nodes, R)) {
1827        // If some of the successors are in the existing node-set, then use the
1828        // top-down ordering.
1829        Order = TopDown;
1830        LLVM_DEBUG(dbgs() << "  Top down (intersect) ");
1831      } else if (NodeSets.size() == 1) {
1832        for (auto &N : Nodes)
1833          if (N->Succs.size() == 0)
1834            R.insert(N);
1835        Order = BottomUp;
1836        LLVM_DEBUG(dbgs() << "  Bottom up (all) ");
1837      } else {
1838        // Find the node with the highest ASAP.
1839        SUnit *maxASAP = nullptr;
1840        for (SUnit *SU : Nodes) {
1841          if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
1842              (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
1843            maxASAP = SU;
1844        }
1845        R.insert(maxASAP);
1846        Order = BottomUp;
1847        LLVM_DEBUG(dbgs() << "  Bottom up (default) ");
1848      }
1849  
1850      while (!R.empty()) {
1851        if (Order == TopDown) {
1852          // Choose the node with the maximum height.  If more than one, choose
1853          // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
1854          // choose the node with the lowest MOV.
1855          while (!R.empty()) {
1856            SUnit *maxHeight = nullptr;
1857            for (SUnit *I : R) {
1858              if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
1859                maxHeight = I;
1860              else if (getHeight(I) == getHeight(maxHeight) &&
1861                       getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
1862                maxHeight = I;
1863              else if (getHeight(I) == getHeight(maxHeight) &&
1864                       getZeroLatencyHeight(I) ==
1865                           getZeroLatencyHeight(maxHeight) &&
1866                       getMOV(I) < getMOV(maxHeight))
1867                maxHeight = I;
1868            }
1869            NodeOrder.insert(maxHeight);
1870            LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
1871            R.remove(maxHeight);
1872            for (const auto &I : maxHeight->Succs) {
1873              if (Nodes.count(I.getSUnit()) == 0)
1874                continue;
1875              if (NodeOrder.count(I.getSUnit()) != 0)
1876                continue;
1877              if (ignoreDependence(I, false))
1878                continue;
1879              R.insert(I.getSUnit());
1880            }
1881            // Back-edges are predecessors with an anti-dependence.
1882            for (const auto &I : maxHeight->Preds) {
1883              if (I.getKind() != SDep::Anti)
1884                continue;
1885              if (Nodes.count(I.getSUnit()) == 0)
1886                continue;
1887              if (NodeOrder.count(I.getSUnit()) != 0)
1888                continue;
1889              R.insert(I.getSUnit());
1890            }
1891          }
1892          Order = BottomUp;
1893          LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
1894          SmallSetVector<SUnit *, 8> N;
1895          if (pred_L(NodeOrder, N, &Nodes))
1896            R.insert(N.begin(), N.end());
1897        } else {
1898          // Choose the node with the maximum depth.  If more than one, choose
1899          // the node with the maximum ZeroLatencyDepth. If still more than one,
1900          // choose the node with the lowest MOV.
1901          while (!R.empty()) {
1902            SUnit *maxDepth = nullptr;
1903            for (SUnit *I : R) {
1904              if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
1905                maxDepth = I;
1906              else if (getDepth(I) == getDepth(maxDepth) &&
1907                       getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
1908                maxDepth = I;
1909              else if (getDepth(I) == getDepth(maxDepth) &&
1910                       getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
1911                       getMOV(I) < getMOV(maxDepth))
1912                maxDepth = I;
1913            }
1914            NodeOrder.insert(maxDepth);
1915            LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
1916            R.remove(maxDepth);
1917            if (Nodes.isExceedSU(maxDepth)) {
1918              Order = TopDown;
1919              R.clear();
1920              R.insert(Nodes.getNode(0));
1921              break;
1922            }
1923            for (const auto &I : maxDepth->Preds) {
1924              if (Nodes.count(I.getSUnit()) == 0)
1925                continue;
1926              if (NodeOrder.count(I.getSUnit()) != 0)
1927                continue;
1928              R.insert(I.getSUnit());
1929            }
1930            // Back-edges are predecessors with an anti-dependence.
1931            for (const auto &I : maxDepth->Succs) {
1932              if (I.getKind() != SDep::Anti)
1933                continue;
1934              if (Nodes.count(I.getSUnit()) == 0)
1935                continue;
1936              if (NodeOrder.count(I.getSUnit()) != 0)
1937                continue;
1938              R.insert(I.getSUnit());
1939            }
1940          }
1941          Order = TopDown;
1942          LLVM_DEBUG(dbgs() << "\n   Switching order to top down ");
1943          SmallSetVector<SUnit *, 8> N;
1944          if (succ_L(NodeOrder, N, &Nodes))
1945            R.insert(N.begin(), N.end());
1946        }
1947      }
1948      LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
1949    }
1950  
1951    LLVM_DEBUG({
1952      dbgs() << "Node order: ";
1953      for (SUnit *I : NodeOrder)
1954        dbgs() << " " << I->NodeNum << " ";
1955      dbgs() << "\n";
1956    });
1957  }
1958  
1959  /// Process the nodes in the computed order and create the pipelined schedule
1960  /// of the instructions, if possible. Return true if a schedule is found.
1961  bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
1962  
1963    if (NodeOrder.empty()){
1964      LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
1965      return false;
1966    }
1967  
1968    bool scheduleFound = false;
1969    unsigned II = 0;
1970    // Keep increasing II until a valid schedule is found.
1971    for (II = MII; II <= MAX_II && !scheduleFound; ++II) {
1972      Schedule.reset();
1973      Schedule.setInitiationInterval(II);
1974      LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
1975  
1976      SetVector<SUnit *>::iterator NI = NodeOrder.begin();
1977      SetVector<SUnit *>::iterator NE = NodeOrder.end();
1978      do {
1979        SUnit *SU = *NI;
1980  
1981        // Compute the schedule time for the instruction, which is based
1982        // upon the scheduled time for any predecessors/successors.
1983        int EarlyStart = INT_MIN;
1984        int LateStart = INT_MAX;
1985        // These values are set when the size of the schedule window is limited
1986        // due to chain dependences.
1987        int SchedEnd = INT_MAX;
1988        int SchedStart = INT_MIN;
1989        Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
1990                              II, this);
1991        LLVM_DEBUG({
1992          dbgs() << "\n";
1993          dbgs() << "Inst (" << SU->NodeNum << ") ";
1994          SU->getInstr()->dump();
1995          dbgs() << "\n";
1996        });
1997        LLVM_DEBUG({
1998          dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
1999                           LateStart, SchedEnd, SchedStart);
2000        });
2001  
2002        if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2003            SchedStart > LateStart)
2004          scheduleFound = false;
2005        else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2006          SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2007          scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2008        } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2009          SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2010          scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2011        } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2012          SchedEnd =
2013              std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2014          // When scheduling a Phi it is better to start at the late cycle and go
2015          // backwards. The default order may insert the Phi too far away from
2016          // its first dependence.
2017          if (SU->getInstr()->isPHI())
2018            scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2019          else
2020            scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2021        } else {
2022          int FirstCycle = Schedule.getFirstCycle();
2023          scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2024                                          FirstCycle + getASAP(SU) + II - 1, II);
2025        }
2026        // Even if we find a schedule, make sure the schedule doesn't exceed the
2027        // allowable number of stages. We keep trying if this happens.
2028        if (scheduleFound)
2029          if (SwpMaxStages > -1 &&
2030              Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2031            scheduleFound = false;
2032  
2033        LLVM_DEBUG({
2034          if (!scheduleFound)
2035            dbgs() << "\tCan't schedule\n";
2036        });
2037      } while (++NI != NE && scheduleFound);
2038  
2039      // If a schedule is found, check if it is a valid schedule too.
2040      if (scheduleFound)
2041        scheduleFound = Schedule.isValidSchedule(this);
2042    }
2043  
2044    LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << " (II=" << II
2045                      << ")\n");
2046  
2047    if (scheduleFound)
2048      Schedule.finalizeSchedule(this);
2049    else
2050      Schedule.reset();
2051  
2052    return scheduleFound && Schedule.getMaxStageCount() > 0;
2053  }
2054  
2055  /// Return true if we can compute the amount the instruction changes
2056  /// during each iteration. Set Delta to the amount of the change.
2057  bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2058    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2059    const MachineOperand *BaseOp;
2060    int64_t Offset;
2061    if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))
2062      return false;
2063  
2064    if (!BaseOp->isReg())
2065      return false;
2066  
2067    Register BaseReg = BaseOp->getReg();
2068  
2069    MachineRegisterInfo &MRI = MF.getRegInfo();
2070    // Check if there is a Phi. If so, get the definition in the loop.
2071    MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2072    if (BaseDef && BaseDef->isPHI()) {
2073      BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2074      BaseDef = MRI.getVRegDef(BaseReg);
2075    }
2076    if (!BaseDef)
2077      return false;
2078  
2079    int D = 0;
2080    if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2081      return false;
2082  
2083    Delta = D;
2084    return true;
2085  }
2086  
2087  /// Check if we can change the instruction to use an offset value from the
2088  /// previous iteration. If so, return true and set the base and offset values
2089  /// so that we can rewrite the load, if necessary.
2090  ///   v1 = Phi(v0, v3)
2091  ///   v2 = load v1, 0
2092  ///   v3 = post_store v1, 4, x
2093  /// This function enables the load to be rewritten as v2 = load v3, 4.
2094  bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2095                                                unsigned &BasePos,
2096                                                unsigned &OffsetPos,
2097                                                unsigned &NewBase,
2098                                                int64_t &Offset) {
2099    // Get the load instruction.
2100    if (TII->isPostIncrement(*MI))
2101      return false;
2102    unsigned BasePosLd, OffsetPosLd;
2103    if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2104      return false;
2105    Register BaseReg = MI->getOperand(BasePosLd).getReg();
2106  
2107    // Look for the Phi instruction.
2108    MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2109    MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2110    if (!Phi || !Phi->isPHI())
2111      return false;
2112    // Get the register defined in the loop block.
2113    unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2114    if (!PrevReg)
2115      return false;
2116  
2117    // Check for the post-increment load/store instruction.
2118    MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2119    if (!PrevDef || PrevDef == MI)
2120      return false;
2121  
2122    if (!TII->isPostIncrement(*PrevDef))
2123      return false;
2124  
2125    unsigned BasePos1 = 0, OffsetPos1 = 0;
2126    if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2127      return false;
2128  
2129    // Make sure that the instructions do not access the same memory location in
2130    // the next iteration.
2131    int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2132    int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2133    MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2134    NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2135    bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2136    MF.DeleteMachineInstr(NewMI);
2137    if (!Disjoint)
2138      return false;
2139  
2140    // Set the return value once we determine that we return true.
2141    BasePos = BasePosLd;
2142    OffsetPos = OffsetPosLd;
2143    NewBase = PrevReg;
2144    Offset = StoreOffset;
2145    return true;
2146  }
2147  
2148  /// Apply changes to the instruction if needed. The changes are need
2149  /// to improve the scheduling and depend up on the final schedule.
2150  void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
2151                                           SMSchedule &Schedule) {
2152    SUnit *SU = getSUnit(MI);
2153    DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
2154        InstrChanges.find(SU);
2155    if (It != InstrChanges.end()) {
2156      std::pair<unsigned, int64_t> RegAndOffset = It->second;
2157      unsigned BasePos, OffsetPos;
2158      if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2159        return;
2160      Register BaseReg = MI->getOperand(BasePos).getReg();
2161      MachineInstr *LoopDef = findDefInLoop(BaseReg);
2162      int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2163      int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2164      int BaseStageNum = Schedule.stageScheduled(SU);
2165      int BaseCycleNum = Schedule.cycleScheduled(SU);
2166      if (BaseStageNum < DefStageNum) {
2167        MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2168        int OffsetDiff = DefStageNum - BaseStageNum;
2169        if (DefCycleNum < BaseCycleNum) {
2170          NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2171          if (OffsetDiff > 0)
2172            --OffsetDiff;
2173        }
2174        int64_t NewOffset =
2175            MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2176        NewMI->getOperand(OffsetPos).setImm(NewOffset);
2177        SU->setInstr(NewMI);
2178        MISUnitMap[NewMI] = SU;
2179        NewMIs[MI] = NewMI;
2180      }
2181    }
2182  }
2183  
2184  /// Return the instruction in the loop that defines the register.
2185  /// If the definition is a Phi, then follow the Phi operand to
2186  /// the instruction in the loop.
2187  MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
2188    SmallPtrSet<MachineInstr *, 8> Visited;
2189    MachineInstr *Def = MRI.getVRegDef(Reg);
2190    while (Def->isPHI()) {
2191      if (!Visited.insert(Def).second)
2192        break;
2193      for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2194        if (Def->getOperand(i + 1).getMBB() == BB) {
2195          Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2196          break;
2197        }
2198    }
2199    return Def;
2200  }
2201  
2202  /// Return true for an order or output dependence that is loop carried
2203  /// potentially. A dependence is loop carried if the destination defines a valu
2204  /// that may be used or defined by the source in a subsequent iteration.
2205  bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
2206                                           bool isSucc) {
2207    if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
2208        Dep.isArtificial())
2209      return false;
2210  
2211    if (!SwpPruneLoopCarried)
2212      return true;
2213  
2214    if (Dep.getKind() == SDep::Output)
2215      return true;
2216  
2217    MachineInstr *SI = Source->getInstr();
2218    MachineInstr *DI = Dep.getSUnit()->getInstr();
2219    if (!isSucc)
2220      std::swap(SI, DI);
2221    assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2222  
2223    // Assume ordered loads and stores may have a loop carried dependence.
2224    if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2225        SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2226        SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2227      return true;
2228  
2229    // Only chain dependences between a load and store can be loop carried.
2230    if (!DI->mayStore() || !SI->mayLoad())
2231      return false;
2232  
2233    unsigned DeltaS, DeltaD;
2234    if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2235      return true;
2236  
2237    const MachineOperand *BaseOpS, *BaseOpD;
2238    int64_t OffsetS, OffsetD;
2239    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2240    if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, TRI) ||
2241        !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, TRI))
2242      return true;
2243  
2244    if (!BaseOpS->isIdenticalTo(*BaseOpD))
2245      return true;
2246  
2247    // Check that the base register is incremented by a constant value for each
2248    // iteration.
2249    MachineInstr *Def = MRI.getVRegDef(BaseOpS->getReg());
2250    if (!Def || !Def->isPHI())
2251      return true;
2252    unsigned InitVal = 0;
2253    unsigned LoopVal = 0;
2254    getPhiRegs(*Def, BB, InitVal, LoopVal);
2255    MachineInstr *LoopDef = MRI.getVRegDef(LoopVal);
2256    int D = 0;
2257    if (!LoopDef || !TII->getIncrementValue(*LoopDef, D))
2258      return true;
2259  
2260    uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
2261    uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
2262  
2263    // This is the main test, which checks the offset values and the loop
2264    // increment value to determine if the accesses may be loop carried.
2265    if (AccessSizeS == MemoryLocation::UnknownSize ||
2266        AccessSizeD == MemoryLocation::UnknownSize)
2267      return true;
2268  
2269    if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
2270      return true;
2271  
2272    return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
2273  }
2274  
2275  void SwingSchedulerDAG::postprocessDAG() {
2276    for (auto &M : Mutations)
2277      M->apply(this);
2278  }
2279  
2280  /// Try to schedule the node at the specified StartCycle and continue
2281  /// until the node is schedule or the EndCycle is reached.  This function
2282  /// returns true if the node is scheduled.  This routine may search either
2283  /// forward or backward for a place to insert the instruction based upon
2284  /// the relative values of StartCycle and EndCycle.
2285  bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2286    bool forward = true;
2287    LLVM_DEBUG({
2288      dbgs() << "Trying to insert node between " << StartCycle << " and "
2289             << EndCycle << " II: " << II << "\n";
2290    });
2291    if (StartCycle > EndCycle)
2292      forward = false;
2293  
2294    // The terminating condition depends on the direction.
2295    int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2296    for (int curCycle = StartCycle; curCycle != termCycle;
2297         forward ? ++curCycle : --curCycle) {
2298  
2299      // Add the already scheduled instructions at the specified cycle to the
2300      // DFA.
2301      ProcItinResources.clearResources();
2302      for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
2303           checkCycle <= LastCycle; checkCycle += II) {
2304        std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
2305  
2306        for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
2307                                           E = cycleInstrs.end();
2308             I != E; ++I) {
2309          if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
2310            continue;
2311          assert(ProcItinResources.canReserveResources(*(*I)->getInstr()) &&
2312                 "These instructions have already been scheduled.");
2313          ProcItinResources.reserveResources(*(*I)->getInstr());
2314        }
2315      }
2316      if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2317          ProcItinResources.canReserveResources(*SU->getInstr())) {
2318        LLVM_DEBUG({
2319          dbgs() << "\tinsert at cycle " << curCycle << " ";
2320          SU->getInstr()->dump();
2321        });
2322  
2323        ScheduledInstrs[curCycle].push_back(SU);
2324        InstrToCycle.insert(std::make_pair(SU, curCycle));
2325        if (curCycle > LastCycle)
2326          LastCycle = curCycle;
2327        if (curCycle < FirstCycle)
2328          FirstCycle = curCycle;
2329        return true;
2330      }
2331      LLVM_DEBUG({
2332        dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2333        SU->getInstr()->dump();
2334      });
2335    }
2336    return false;
2337  }
2338  
2339  // Return the cycle of the earliest scheduled instruction in the chain.
2340  int SMSchedule::earliestCycleInChain(const SDep &Dep) {
2341    SmallPtrSet<SUnit *, 8> Visited;
2342    SmallVector<SDep, 8> Worklist;
2343    Worklist.push_back(Dep);
2344    int EarlyCycle = INT_MAX;
2345    while (!Worklist.empty()) {
2346      const SDep &Cur = Worklist.pop_back_val();
2347      SUnit *PrevSU = Cur.getSUnit();
2348      if (Visited.count(PrevSU))
2349        continue;
2350      std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2351      if (it == InstrToCycle.end())
2352        continue;
2353      EarlyCycle = std::min(EarlyCycle, it->second);
2354      for (const auto &PI : PrevSU->Preds)
2355        if (PI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
2356          Worklist.push_back(PI);
2357      Visited.insert(PrevSU);
2358    }
2359    return EarlyCycle;
2360  }
2361  
2362  // Return the cycle of the latest scheduled instruction in the chain.
2363  int SMSchedule::latestCycleInChain(const SDep &Dep) {
2364    SmallPtrSet<SUnit *, 8> Visited;
2365    SmallVector<SDep, 8> Worklist;
2366    Worklist.push_back(Dep);
2367    int LateCycle = INT_MIN;
2368    while (!Worklist.empty()) {
2369      const SDep &Cur = Worklist.pop_back_val();
2370      SUnit *SuccSU = Cur.getSUnit();
2371      if (Visited.count(SuccSU))
2372        continue;
2373      std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2374      if (it == InstrToCycle.end())
2375        continue;
2376      LateCycle = std::max(LateCycle, it->second);
2377      for (const auto &SI : SuccSU->Succs)
2378        if (SI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
2379          Worklist.push_back(SI);
2380      Visited.insert(SuccSU);
2381    }
2382    return LateCycle;
2383  }
2384  
2385  /// If an instruction has a use that spans multiple iterations, then
2386  /// return true. These instructions are characterized by having a back-ege
2387  /// to a Phi, which contains a reference to another Phi.
2388  static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
2389    for (auto &P : SU->Preds)
2390      if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
2391        for (auto &S : P.getSUnit()->Succs)
2392          if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2393            return P.getSUnit();
2394    return nullptr;
2395  }
2396  
2397  /// Compute the scheduling start slot for the instruction.  The start slot
2398  /// depends on any predecessor or successor nodes scheduled already.
2399  void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2400                                int *MinEnd, int *MaxStart, int II,
2401                                SwingSchedulerDAG *DAG) {
2402    // Iterate over each instruction that has been scheduled already.  The start
2403    // slot computation depends on whether the previously scheduled instruction
2404    // is a predecessor or successor of the specified instruction.
2405    for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2406  
2407      // Iterate over each instruction in the current cycle.
2408      for (SUnit *I : getInstructions(cycle)) {
2409        // Because we're processing a DAG for the dependences, we recognize
2410        // the back-edge in recurrences by anti dependences.
2411        for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2412          const SDep &Dep = SU->Preds[i];
2413          if (Dep.getSUnit() == I) {
2414            if (!DAG->isBackedge(SU, Dep)) {
2415              int EarlyStart = cycle + Dep.getLatency() -
2416                               DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2417              *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2418              if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2419                int End = earliestCycleInChain(Dep) + (II - 1);
2420                *MinEnd = std::min(*MinEnd, End);
2421              }
2422            } else {
2423              int LateStart = cycle - Dep.getLatency() +
2424                              DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2425              *MinLateStart = std::min(*MinLateStart, LateStart);
2426            }
2427          }
2428          // For instruction that requires multiple iterations, make sure that
2429          // the dependent instruction is not scheduled past the definition.
2430          SUnit *BE = multipleIterations(I, DAG);
2431          if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2432              !SU->isPred(I))
2433            *MinLateStart = std::min(*MinLateStart, cycle);
2434        }
2435        for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2436          if (SU->Succs[i].getSUnit() == I) {
2437            const SDep &Dep = SU->Succs[i];
2438            if (!DAG->isBackedge(SU, Dep)) {
2439              int LateStart = cycle - Dep.getLatency() +
2440                              DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2441              *MinLateStart = std::min(*MinLateStart, LateStart);
2442              if (DAG->isLoopCarriedDep(SU, Dep)) {
2443                int Start = latestCycleInChain(Dep) + 1 - II;
2444                *MaxStart = std::max(*MaxStart, Start);
2445              }
2446            } else {
2447              int EarlyStart = cycle + Dep.getLatency() -
2448                               DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2449              *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2450            }
2451          }
2452        }
2453      }
2454    }
2455  }
2456  
2457  /// Order the instructions within a cycle so that the definitions occur
2458  /// before the uses. Returns true if the instruction is added to the start
2459  /// of the list, or false if added to the end.
2460  void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
2461                                   std::deque<SUnit *> &Insts) {
2462    MachineInstr *MI = SU->getInstr();
2463    bool OrderBeforeUse = false;
2464    bool OrderAfterDef = false;
2465    bool OrderBeforeDef = false;
2466    unsigned MoveDef = 0;
2467    unsigned MoveUse = 0;
2468    int StageInst1 = stageScheduled(SU);
2469  
2470    unsigned Pos = 0;
2471    for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
2472         ++I, ++Pos) {
2473      for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
2474        MachineOperand &MO = MI->getOperand(i);
2475        if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
2476          continue;
2477  
2478        Register Reg = MO.getReg();
2479        unsigned BasePos, OffsetPos;
2480        if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2481          if (MI->getOperand(BasePos).getReg() == Reg)
2482            if (unsigned NewReg = SSD->getInstrBaseReg(SU))
2483              Reg = NewReg;
2484        bool Reads, Writes;
2485        std::tie(Reads, Writes) =
2486            (*I)->getInstr()->readsWritesVirtualRegister(Reg);
2487        if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
2488          OrderBeforeUse = true;
2489          if (MoveUse == 0)
2490            MoveUse = Pos;
2491        } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
2492          // Add the instruction after the scheduled instruction.
2493          OrderAfterDef = true;
2494          MoveDef = Pos;
2495        } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
2496          if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
2497            OrderBeforeUse = true;
2498            if (MoveUse == 0)
2499              MoveUse = Pos;
2500          } else {
2501            OrderAfterDef = true;
2502            MoveDef = Pos;
2503          }
2504        } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
2505          OrderBeforeUse = true;
2506          if (MoveUse == 0)
2507            MoveUse = Pos;
2508          if (MoveUse != 0) {
2509            OrderAfterDef = true;
2510            MoveDef = Pos - 1;
2511          }
2512        } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
2513          // Add the instruction before the scheduled instruction.
2514          OrderBeforeUse = true;
2515          if (MoveUse == 0)
2516            MoveUse = Pos;
2517        } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
2518                   isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
2519          if (MoveUse == 0) {
2520            OrderBeforeDef = true;
2521            MoveUse = Pos;
2522          }
2523        }
2524      }
2525      // Check for order dependences between instructions. Make sure the source
2526      // is ordered before the destination.
2527      for (auto &S : SU->Succs) {
2528        if (S.getSUnit() != *I)
2529          continue;
2530        if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
2531          OrderBeforeUse = true;
2532          if (Pos < MoveUse)
2533            MoveUse = Pos;
2534        }
2535        // We did not handle HW dependences in previous for loop,
2536        // and we normally set Latency = 0 for Anti deps,
2537        // so may have nodes in same cycle with Anti denpendent on HW regs.
2538        else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
2539          OrderBeforeUse = true;
2540          if ((MoveUse == 0) || (Pos < MoveUse))
2541            MoveUse = Pos;
2542        }
2543      }
2544      for (auto &P : SU->Preds) {
2545        if (P.getSUnit() != *I)
2546          continue;
2547        if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
2548          OrderAfterDef = true;
2549          MoveDef = Pos;
2550        }
2551      }
2552    }
2553  
2554    // A circular dependence.
2555    if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
2556      OrderBeforeUse = false;
2557  
2558    // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
2559    // to a loop-carried dependence.
2560    if (OrderBeforeDef)
2561      OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
2562  
2563    // The uncommon case when the instruction order needs to be updated because
2564    // there is both a use and def.
2565    if (OrderBeforeUse && OrderAfterDef) {
2566      SUnit *UseSU = Insts.at(MoveUse);
2567      SUnit *DefSU = Insts.at(MoveDef);
2568      if (MoveUse > MoveDef) {
2569        Insts.erase(Insts.begin() + MoveUse);
2570        Insts.erase(Insts.begin() + MoveDef);
2571      } else {
2572        Insts.erase(Insts.begin() + MoveDef);
2573        Insts.erase(Insts.begin() + MoveUse);
2574      }
2575      orderDependence(SSD, UseSU, Insts);
2576      orderDependence(SSD, SU, Insts);
2577      orderDependence(SSD, DefSU, Insts);
2578      return;
2579    }
2580    // Put the new instruction first if there is a use in the list. Otherwise,
2581    // put it at the end of the list.
2582    if (OrderBeforeUse)
2583      Insts.push_front(SU);
2584    else
2585      Insts.push_back(SU);
2586  }
2587  
2588  /// Return true if the scheduled Phi has a loop carried operand.
2589  bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
2590    if (!Phi.isPHI())
2591      return false;
2592    assert(Phi.isPHI() && "Expecting a Phi.");
2593    SUnit *DefSU = SSD->getSUnit(&Phi);
2594    unsigned DefCycle = cycleScheduled(DefSU);
2595    int DefStage = stageScheduled(DefSU);
2596  
2597    unsigned InitVal = 0;
2598    unsigned LoopVal = 0;
2599    getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
2600    SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
2601    if (!UseSU)
2602      return true;
2603    if (UseSU->getInstr()->isPHI())
2604      return true;
2605    unsigned LoopCycle = cycleScheduled(UseSU);
2606    int LoopStage = stageScheduled(UseSU);
2607    return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
2608  }
2609  
2610  /// Return true if the instruction is a definition that is loop carried
2611  /// and defines the use on the next iteration.
2612  ///        v1 = phi(v2, v3)
2613  ///  (Def) v3 = op v1
2614  ///  (MO)   = v1
2615  /// If MO appears before Def, then then v1 and v3 may get assigned to the same
2616  /// register.
2617  bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
2618                                         MachineInstr *Def, MachineOperand &MO) {
2619    if (!MO.isReg())
2620      return false;
2621    if (Def->isPHI())
2622      return false;
2623    MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
2624    if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
2625      return false;
2626    if (!isLoopCarried(SSD, *Phi))
2627      return false;
2628    unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
2629    for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
2630      MachineOperand &DMO = Def->getOperand(i);
2631      if (!DMO.isReg() || !DMO.isDef())
2632        continue;
2633      if (DMO.getReg() == LoopReg)
2634        return true;
2635    }
2636    return false;
2637  }
2638  
2639  // Check if the generated schedule is valid. This function checks if
2640  // an instruction that uses a physical register is scheduled in a
2641  // different stage than the definition. The pipeliner does not handle
2642  // physical register values that may cross a basic block boundary.
2643  bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
2644    for (int i = 0, e = SSD->SUnits.size(); i < e; ++i) {
2645      SUnit &SU = SSD->SUnits[i];
2646      if (!SU.hasPhysRegDefs)
2647        continue;
2648      int StageDef = stageScheduled(&SU);
2649      assert(StageDef != -1 && "Instruction should have been scheduled.");
2650      for (auto &SI : SU.Succs)
2651        if (SI.isAssignedRegDep())
2652          if (Register::isPhysicalRegister(SI.getReg()))
2653            if (stageScheduled(SI.getSUnit()) != StageDef)
2654              return false;
2655    }
2656    return true;
2657  }
2658  
2659  /// A property of the node order in swing-modulo-scheduling is
2660  /// that for nodes outside circuits the following holds:
2661  /// none of them is scheduled after both a successor and a
2662  /// predecessor.
2663  /// The method below checks whether the property is met.
2664  /// If not, debug information is printed and statistics information updated.
2665  /// Note that we do not use an assert statement.
2666  /// The reason is that although an invalid node oder may prevent
2667  /// the pipeliner from finding a pipelined schedule for arbitrary II,
2668  /// it does not lead to the generation of incorrect code.
2669  void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
2670  
2671    // a sorted vector that maps each SUnit to its index in the NodeOrder
2672    typedef std::pair<SUnit *, unsigned> UnitIndex;
2673    std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
2674  
2675    for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
2676      Indices.push_back(std::make_pair(NodeOrder[i], i));
2677  
2678    auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
2679      return std::get<0>(i1) < std::get<0>(i2);
2680    };
2681  
2682    // sort, so that we can perform a binary search
2683    llvm::sort(Indices, CompareKey);
2684  
2685    bool Valid = true;
2686    (void)Valid;
2687    // for each SUnit in the NodeOrder, check whether
2688    // it appears after both a successor and a predecessor
2689    // of the SUnit. If this is the case, and the SUnit
2690    // is not part of circuit, then the NodeOrder is not
2691    // valid.
2692    for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
2693      SUnit *SU = NodeOrder[i];
2694      unsigned Index = i;
2695  
2696      bool PredBefore = false;
2697      bool SuccBefore = false;
2698  
2699      SUnit *Succ;
2700      SUnit *Pred;
2701      (void)Succ;
2702      (void)Pred;
2703  
2704      for (SDep &PredEdge : SU->Preds) {
2705        SUnit *PredSU = PredEdge.getSUnit();
2706        unsigned PredIndex = std::get<1>(
2707            *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
2708        if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
2709          PredBefore = true;
2710          Pred = PredSU;
2711          break;
2712        }
2713      }
2714  
2715      for (SDep &SuccEdge : SU->Succs) {
2716        SUnit *SuccSU = SuccEdge.getSUnit();
2717        // Do not process a boundary node, it was not included in NodeOrder,
2718        // hence not in Indices either, call to std::lower_bound() below will
2719        // return Indices.end().
2720        if (SuccSU->isBoundaryNode())
2721          continue;
2722        unsigned SuccIndex = std::get<1>(
2723            *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
2724        if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
2725          SuccBefore = true;
2726          Succ = SuccSU;
2727          break;
2728        }
2729      }
2730  
2731      if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
2732        // instructions in circuits are allowed to be scheduled
2733        // after both a successor and predecessor.
2734        bool InCircuit = llvm::any_of(
2735            Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
2736        if (InCircuit)
2737          LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
2738        else {
2739          Valid = false;
2740          NumNodeOrderIssues++;
2741          LLVM_DEBUG(dbgs() << "Predecessor ";);
2742        }
2743        LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
2744                          << " are scheduled before node " << SU->NodeNum
2745                          << "\n";);
2746      }
2747    }
2748  
2749    LLVM_DEBUG({
2750      if (!Valid)
2751        dbgs() << "Invalid node order found!\n";
2752    });
2753  }
2754  
2755  /// Attempt to fix the degenerate cases when the instruction serialization
2756  /// causes the register lifetimes to overlap. For example,
2757  ///   p' = store_pi(p, b)
2758  ///      = load p, offset
2759  /// In this case p and p' overlap, which means that two registers are needed.
2760  /// Instead, this function changes the load to use p' and updates the offset.
2761  void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
2762    unsigned OverlapReg = 0;
2763    unsigned NewBaseReg = 0;
2764    for (SUnit *SU : Instrs) {
2765      MachineInstr *MI = SU->getInstr();
2766      for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
2767        const MachineOperand &MO = MI->getOperand(i);
2768        // Look for an instruction that uses p. The instruction occurs in the
2769        // same cycle but occurs later in the serialized order.
2770        if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
2771          // Check that the instruction appears in the InstrChanges structure,
2772          // which contains instructions that can have the offset updated.
2773          DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
2774            InstrChanges.find(SU);
2775          if (It != InstrChanges.end()) {
2776            unsigned BasePos, OffsetPos;
2777            // Update the base register and adjust the offset.
2778            if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
2779              MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2780              NewMI->getOperand(BasePos).setReg(NewBaseReg);
2781              int64_t NewOffset =
2782                  MI->getOperand(OffsetPos).getImm() - It->second.second;
2783              NewMI->getOperand(OffsetPos).setImm(NewOffset);
2784              SU->setInstr(NewMI);
2785              MISUnitMap[NewMI] = SU;
2786              NewMIs[MI] = NewMI;
2787            }
2788          }
2789          OverlapReg = 0;
2790          NewBaseReg = 0;
2791          break;
2792        }
2793        // Look for an instruction of the form p' = op(p), which uses and defines
2794        // two virtual registers that get allocated to the same physical register.
2795        unsigned TiedUseIdx = 0;
2796        if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
2797          // OverlapReg is p in the example above.
2798          OverlapReg = MI->getOperand(TiedUseIdx).getReg();
2799          // NewBaseReg is p' in the example above.
2800          NewBaseReg = MI->getOperand(i).getReg();
2801          break;
2802        }
2803      }
2804    }
2805  }
2806  
2807  /// After the schedule has been formed, call this function to combine
2808  /// the instructions from the different stages/cycles.  That is, this
2809  /// function creates a schedule that represents a single iteration.
2810  void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
2811    // Move all instructions to the first stage from later stages.
2812    for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
2813      for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
2814           ++stage) {
2815        std::deque<SUnit *> &cycleInstrs =
2816            ScheduledInstrs[cycle + (stage * InitiationInterval)];
2817        for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(),
2818                                                   E = cycleInstrs.rend();
2819             I != E; ++I)
2820          ScheduledInstrs[cycle].push_front(*I);
2821      }
2822    }
2823  
2824    // Erase all the elements in the later stages. Only one iteration should
2825    // remain in the scheduled list, and it contains all the instructions.
2826    for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
2827      ScheduledInstrs.erase(cycle);
2828  
2829    // Change the registers in instruction as specified in the InstrChanges
2830    // map. We need to use the new registers to create the correct order.
2831    for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
2832      SUnit *SU = &SSD->SUnits[i];
2833      SSD->applyInstrChange(SU->getInstr(), *this);
2834    }
2835  
2836    // Reorder the instructions in each cycle to fix and improve the
2837    // generated code.
2838    for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
2839      std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
2840      std::deque<SUnit *> newOrderPhi;
2841      for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
2842        SUnit *SU = cycleInstrs[i];
2843        if (SU->getInstr()->isPHI())
2844          newOrderPhi.push_back(SU);
2845      }
2846      std::deque<SUnit *> newOrderI;
2847      for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
2848        SUnit *SU = cycleInstrs[i];
2849        if (!SU->getInstr()->isPHI())
2850          orderDependence(SSD, SU, newOrderI);
2851      }
2852      // Replace the old order with the new order.
2853      cycleInstrs.swap(newOrderPhi);
2854      cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
2855      SSD->fixupRegisterOverlaps(cycleInstrs);
2856    }
2857  
2858    LLVM_DEBUG(dump(););
2859  }
2860  
2861  void NodeSet::print(raw_ostream &os) const {
2862    os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
2863       << " depth " << MaxDepth << " col " << Colocate << "\n";
2864    for (const auto &I : Nodes)
2865      os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
2866    os << "\n";
2867  }
2868  
2869  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2870  /// Print the schedule information to the given output.
2871  void SMSchedule::print(raw_ostream &os) const {
2872    // Iterate over each cycle.
2873    for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
2874      // Iterate over each instruction in the cycle.
2875      const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
2876      for (SUnit *CI : cycleInstrs->second) {
2877        os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
2878        os << "(" << CI->NodeNum << ") ";
2879        CI->getInstr()->print(os);
2880        os << "\n";
2881      }
2882    }
2883  }
2884  
2885  /// Utility function used for debugging to print the schedule.
2886  LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
2887  LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }
2888  
2889  #endif
2890  
2891  void ResourceManager::initProcResourceVectors(
2892      const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
2893    unsigned ProcResourceID = 0;
2894  
2895    // We currently limit the resource kinds to 64 and below so that we can use
2896    // uint64_t for Masks
2897    assert(SM.getNumProcResourceKinds() < 64 &&
2898           "Too many kinds of resources, unsupported");
2899    // Create a unique bitmask for every processor resource unit.
2900    // Skip resource at index 0, since it always references 'InvalidUnit'.
2901    Masks.resize(SM.getNumProcResourceKinds());
2902    for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
2903      const MCProcResourceDesc &Desc = *SM.getProcResource(I);
2904      if (Desc.SubUnitsIdxBegin)
2905        continue;
2906      Masks[I] = 1ULL << ProcResourceID;
2907      ProcResourceID++;
2908    }
2909    // Create a unique bitmask for every processor resource group.
2910    for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
2911      const MCProcResourceDesc &Desc = *SM.getProcResource(I);
2912      if (!Desc.SubUnitsIdxBegin)
2913        continue;
2914      Masks[I] = 1ULL << ProcResourceID;
2915      for (unsigned U = 0; U < Desc.NumUnits; ++U)
2916        Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
2917      ProcResourceID++;
2918    }
2919    LLVM_DEBUG({
2920      if (SwpShowResMask) {
2921        dbgs() << "ProcResourceDesc:\n";
2922        for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
2923          const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
2924          dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
2925                           ProcResource->Name, I, Masks[I],
2926                           ProcResource->NumUnits);
2927        }
2928        dbgs() << " -----------------\n";
2929      }
2930    });
2931  }
2932  
2933  bool ResourceManager::canReserveResources(const MCInstrDesc *MID) const {
2934  
2935    LLVM_DEBUG({
2936      if (SwpDebugResource)
2937        dbgs() << "canReserveResources:\n";
2938    });
2939    if (UseDFA)
2940      return DFAResources->canReserveResources(MID);
2941  
2942    unsigned InsnClass = MID->getSchedClass();
2943    const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass);
2944    if (!SCDesc->isValid()) {
2945      LLVM_DEBUG({
2946        dbgs() << "No valid Schedule Class Desc for schedClass!\n";
2947        dbgs() << "isPseduo:" << MID->isPseudo() << "\n";
2948      });
2949      return true;
2950    }
2951  
2952    const MCWriteProcResEntry *I = STI->getWriteProcResBegin(SCDesc);
2953    const MCWriteProcResEntry *E = STI->getWriteProcResEnd(SCDesc);
2954    for (; I != E; ++I) {
2955      if (!I->Cycles)
2956        continue;
2957      const MCProcResourceDesc *ProcResource =
2958          SM.getProcResource(I->ProcResourceIdx);
2959      unsigned NumUnits = ProcResource->NumUnits;
2960      LLVM_DEBUG({
2961        if (SwpDebugResource)
2962          dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n",
2963                           ProcResource->Name, I->ProcResourceIdx,
2964                           ProcResourceCount[I->ProcResourceIdx], NumUnits,
2965                           I->Cycles);
2966      });
2967      if (ProcResourceCount[I->ProcResourceIdx] >= NumUnits)
2968        return false;
2969    }
2970    LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return true\n\n";);
2971    return true;
2972  }
2973  
2974  void ResourceManager::reserveResources(const MCInstrDesc *MID) {
2975    LLVM_DEBUG({
2976      if (SwpDebugResource)
2977        dbgs() << "reserveResources:\n";
2978    });
2979    if (UseDFA)
2980      return DFAResources->reserveResources(MID);
2981  
2982    unsigned InsnClass = MID->getSchedClass();
2983    const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass);
2984    if (!SCDesc->isValid()) {
2985      LLVM_DEBUG({
2986        dbgs() << "No valid Schedule Class Desc for schedClass!\n";
2987        dbgs() << "isPseduo:" << MID->isPseudo() << "\n";
2988      });
2989      return;
2990    }
2991    for (const MCWriteProcResEntry &PRE :
2992         make_range(STI->getWriteProcResBegin(SCDesc),
2993                    STI->getWriteProcResEnd(SCDesc))) {
2994      if (!PRE.Cycles)
2995        continue;
2996      ++ProcResourceCount[PRE.ProcResourceIdx];
2997      LLVM_DEBUG({
2998        if (SwpDebugResource) {
2999          const MCProcResourceDesc *ProcResource =
3000              SM.getProcResource(PRE.ProcResourceIdx);
3001          dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n",
3002                           ProcResource->Name, PRE.ProcResourceIdx,
3003                           ProcResourceCount[PRE.ProcResourceIdx],
3004                           ProcResource->NumUnits, PRE.Cycles);
3005        }
3006      });
3007    }
3008    LLVM_DEBUG({
3009      if (SwpDebugResource)
3010        dbgs() << "reserveResources: done!\n\n";
3011    });
3012  }
3013  
3014  bool ResourceManager::canReserveResources(const MachineInstr &MI) const {
3015    return canReserveResources(&MI.getDesc());
3016  }
3017  
3018  void ResourceManager::reserveResources(const MachineInstr &MI) {
3019    return reserveResources(&MI.getDesc());
3020  }
3021  
3022  void ResourceManager::clearResources() {
3023    if (UseDFA)
3024      return DFAResources->clearResources();
3025    std::fill(ProcResourceCount.begin(), ProcResourceCount.end(), 0);
3026  }
3027