xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.h (revision 5e801ac66d24704442eba426ed13c3effb8a34e7)
1 //===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI Machine Scheduler interface
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
15 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
16 
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include <cassert>
#include <cstdint>
#include <map>
#include <memory>
#include <set>
#include <utility>
#include <vector>
23 
24 namespace llvm {
25 
26 class SIInstrInfo;
27 class SIRegisterInfo;
28 class SIScheduleDAGMI;
29 class SIScheduleBlockCreator;
30 
/// Reason attached to a scheduling candidate (SISchedulerCandidate::Reason).
/// The numeric values double as bit positions in
/// SISchedulerCandidate::RepeatReasonSet, so they must stay small and
/// contiguous. NoCand is the value a fresh candidate starts with.
enum SIScheduleCandReason {
  NoCand = 0,
  RegUsage = 1,
  Latency = 2,
  Successor = 3,
  Depth = 4,
  NodeOrder = 5
};
39 
40 struct SISchedulerCandidate {
41   // The reason for this candidate.
42   SIScheduleCandReason Reason = NoCand;
43 
44   // Set of reasons that apply to multiple candidates.
45   uint32_t RepeatReasonSet = 0;
46 
47   SISchedulerCandidate() = default;
48 
49   bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
50   void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
51 };
52 
/// Kind recorded for each successor link of a SIScheduleBlock
/// (see SIScheduleBlock::addSucc and SIScheduleBlock::getSuccs).
enum SIScheduleBlockLinkKind {
  NoData = 0,
  Data = 1
};
57 
58 class SIScheduleBlock {
59   SIScheduleDAGMI *DAG;
60   SIScheduleBlockCreator *BC;
61 
62   std::vector<SUnit*> SUnits;
63   std::map<unsigned, unsigned> NodeNum2Index;
64   std::vector<SUnit*> TopReadySUs;
65   std::vector<SUnit*> ScheduledSUnits;
66 
67   /// The top of the unscheduled zone.
68   IntervalPressure TopPressure;
69   RegPressureTracker TopRPTracker;
70 
71   // Pressure: number of said class of registers needed to
72   // store the live virtual and real registers.
73   // We do care only of SGPR32 and VGPR32 and do track only virtual registers.
74   // Pressure of additional registers required inside the block.
75   std::vector<unsigned> InternalAdditionalPressure;
76   // Pressure of input and output registers
77   std::vector<unsigned> LiveInPressure;
78   std::vector<unsigned> LiveOutPressure;
79   // Registers required by the block, and outputs.
80   // We do track only virtual registers.
81   // Note that some registers are not 32 bits,
82   // and thus the pressure is not equal
83   // to the number of live registers.
84   std::set<unsigned> LiveInRegs;
85   std::set<unsigned> LiveOutRegs;
86 
87   bool Scheduled = false;
88   bool HighLatencyBlock = false;
89 
90   std::vector<unsigned> HasLowLatencyNonWaitedParent;
91 
92   // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
93   unsigned ID;
94 
95   std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
96   // All blocks successors, and the kind of link
97   std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
98   unsigned NumHighLatencySuccessors = 0;
99 
100 public:
101   SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
102                   unsigned ID):
103     DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {}
104 
105   ~SIScheduleBlock() = default;
106 
107   unsigned getID() const { return ID; }
108 
109   /// Functions for Block construction.
110   void addUnit(SUnit *SU);
111 
112   // When all SUs have been added.
113   void finalizeUnits();
114 
115   // Add block pred, which has instruction predecessor of SU.
116   void addPred(SIScheduleBlock *Pred);
117   void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);
118 
119   const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
120   ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
121     getSuccs() const { return Succs; }
122 
123   unsigned Height;  // Maximum topdown path length to block without outputs
124   unsigned Depth;   // Maximum bottomup path length to block without inputs
125 
126   unsigned getNumHighLatencySuccessors() const {
127     return NumHighLatencySuccessors;
128   }
129 
130   bool isHighLatencyBlock() { return HighLatencyBlock; }
131 
132   // This is approximative.
133   // Ideally should take into accounts some instructions (rcp, etc)
134   // are 4 times slower.
135   int getCost() { return SUnits.size(); }
136 
137   // The block Predecessors and Successors must be all registered
138   // before fastSchedule().
139   // Fast schedule with no particular requirement.
140   void fastSchedule();
141 
142   std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
143 
144   // Complete schedule that will try to minimize reg pressure and
145   // low latencies, and will fill liveins and liveouts.
146   // Needs all MIs to be grouped between BeginBlock and EndBlock.
147   // The MIs can be moved after the scheduling,
148   // it is just used to allow correct track of live registers.
149   void schedule(MachineBasicBlock::iterator BeginBlock,
150                 MachineBasicBlock::iterator EndBlock);
151 
152   bool isScheduled() { return Scheduled; }
153 
154   // Needs the block to be scheduled inside
155   // TODO: find a way to compute it.
156   std::vector<unsigned> &getInternalAdditionalRegUsage() {
157     return InternalAdditionalPressure;
158   }
159 
160   std::set<unsigned> &getInRegs() { return LiveInRegs; }
161   std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
162 
163   void printDebug(bool Full);
164 
165 private:
166   struct SISchedCandidate : SISchedulerCandidate {
167     // The best SUnit candidate.
168     SUnit *SU = nullptr;
169 
170     unsigned SGPRUsage;
171     unsigned VGPRUsage;
172     bool IsLowLatency;
173     unsigned LowLatencyOffset;
174     bool HasLowLatencyNonWaitedParent;
175 
176     SISchedCandidate() = default;
177 
178     bool isValid() const { return SU; }
179 
180     // Copy the status of another candidate without changing policy.
181     void setBest(SISchedCandidate &Best) {
182       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
183       SU = Best.SU;
184       Reason = Best.Reason;
185       SGPRUsage = Best.SGPRUsage;
186       VGPRUsage = Best.VGPRUsage;
187       IsLowLatency = Best.IsLowLatency;
188       LowLatencyOffset = Best.LowLatencyOffset;
189       HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
190     }
191   };
192 
193   void undoSchedule();
194 
195   void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
196   void releaseSucc(SUnit *SU, SDep *SuccEdge);
197   // InOrOutBlock: restrict to links pointing inside the block (true),
198   // or restrict to links pointing outside the block (false).
199   void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
200 
201   void nodeScheduled(SUnit *SU);
202   void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
203   void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
204   SUnit* pickNode();
205   void traceCandidate(const SISchedCandidate &Cand);
206   void initRegPressure(MachineBasicBlock::iterator BeginBlock,
207                        MachineBasicBlock::iterator EndBlock);
208 };
209 
210 struct SIScheduleBlocks {
211   std::vector<SIScheduleBlock*> Blocks;
212   std::vector<int> TopDownIndex2Block;
213   std::vector<int> TopDownBlock2Index;
214 };
215 
/// Selects which block-creation strategy SIScheduleBlockCreator::getBlocks()
/// uses; also the key under which the resulting blocks are stored.
enum SISchedulerBlockCreatorVariant {
  LatenciesAlone = 0,
  LatenciesGrouped = 1,
  LatenciesAlonePlusConsecutive = 2
};
221 
222 class SIScheduleBlockCreator {
223   SIScheduleDAGMI *DAG;
224   // unique_ptr handles freeing memory for us.
225   std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
226   std::map<SISchedulerBlockCreatorVariant,
227            SIScheduleBlocks> Blocks;
228   std::vector<SIScheduleBlock*> CurrentBlocks;
229   std::vector<int> Node2CurrentBlock;
230 
231   // Topological sort
232   // Maps topological index to the node number.
233   std::vector<int> TopDownIndex2Block;
234   std::vector<int> TopDownBlock2Index;
235   std::vector<int> BottomUpIndex2Block;
236 
237   // 0 -> Color not given.
238   // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
239   // Above -> Other groups.
240   int NextReservedID;
241   int NextNonReservedID;
242   std::vector<int> CurrentColoring;
243   std::vector<int> CurrentTopDownReservedDependencyColoring;
244   std::vector<int> CurrentBottomUpReservedDependencyColoring;
245 
246 public:
247   SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
248 
249   SIScheduleBlocks
250   getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
251 
252   bool isSUInBlock(SUnit *SU, unsigned ID);
253 
254 private:
255   // Give a Reserved color to every high latency.
256   void colorHighLatenciesAlone();
257 
258   // Create groups of high latencies with a Reserved color.
259   void colorHighLatenciesGroups();
260 
261   // Compute coloring for topdown and bottom traversals with
262   // different colors depending on dependencies on Reserved colors.
263   void colorComputeReservedDependencies();
264 
265   // Give color to all non-colored SUs according to Reserved groups dependencies.
266   void colorAccordingToReservedDependencies();
267 
268   // Divides Blocks having no bottom up or top down dependencies on Reserved groups.
269   // The new colors are computed according to the dependencies on the other blocks
270   // formed with colorAccordingToReservedDependencies.
271   void colorEndsAccordingToDependencies();
272 
273   // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
274   void colorForceConsecutiveOrderInGroup();
275 
276   // Merge Constant loads that have all their users into another group to the group.
277   // (TODO: else if all their users depend on the same group, put them there)
278   void colorMergeConstantLoadsNextGroup();
279 
280   // Merge SUs that have all their users into another group to the group
281   void colorMergeIfPossibleNextGroup();
282 
283   // Merge SUs that have all their users into another group to the group,
284   // but only for Reserved groups.
285   void colorMergeIfPossibleNextGroupOnlyForReserved();
286 
287   // Merge SUs that have all their users into another group to the group,
288   // but only if the group is no more than a few SUs.
289   void colorMergeIfPossibleSmallGroupsToNextGroup();
290 
291   // Divides Blocks with important size.
292   // Idea of implementation: attribute new colors depending on topdown and
293   // bottom up links to other blocks.
294   void cutHugeBlocks();
295 
296   // Put in one group all instructions with no users in this scheduling region
297   // (we'd want these groups be at the end).
298   void regroupNoUserInstructions();
299 
300   // Give Reserved color to export instructions
301   void colorExports();
302 
303   void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
304 
305   void topologicalSort();
306 
307   void scheduleInsideBlocks();
308 
309   void fillStats();
310 };
311 
/// Selects the strategy used by SIScheduleBlockScheduler
/// (passed to its constructor and to SIScheduler::scheduleVariant).
enum SISchedulerBlockSchedulerVariant {
  BlockLatencyRegUsage = 0,
  BlockRegUsageLatency = 1,
  BlockRegUsage = 2
};
317 
318 class SIScheduleBlockScheduler {
319   SIScheduleDAGMI *DAG;
320   SISchedulerBlockSchedulerVariant Variant;
321   std::vector<SIScheduleBlock*> Blocks;
322 
323   std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
324   std::set<unsigned> LiveRegs;
325   // Num of schedulable unscheduled blocks reading the register.
326   std::map<unsigned, unsigned> LiveRegsConsumers;
327 
328   std::vector<unsigned> LastPosHighLatencyParentScheduled;
329   int LastPosWaitedHighLatency;
330 
331   std::vector<SIScheduleBlock*> BlocksScheduled;
332   unsigned NumBlockScheduled;
333   std::vector<SIScheduleBlock*> ReadyBlocks;
334 
335   unsigned VregCurrentUsage;
336   unsigned SregCurrentUsage;
337 
338   // Currently is only approximation.
339   unsigned maxVregUsage;
340   unsigned maxSregUsage;
341 
342   std::vector<unsigned> BlockNumPredsLeft;
343   std::vector<unsigned> BlockNumSuccsLeft;
344 
345 public:
346   SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
347                            SISchedulerBlockSchedulerVariant Variant,
348                            SIScheduleBlocks BlocksStruct);
349   ~SIScheduleBlockScheduler() = default;
350 
351   std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
352 
353   unsigned getVGPRUsage() { return maxVregUsage; }
354   unsigned getSGPRUsage() { return maxSregUsage; }
355 
356 private:
357   struct SIBlockSchedCandidate : SISchedulerCandidate {
358     // The best Block candidate.
359     SIScheduleBlock *Block = nullptr;
360 
361     bool IsHighLatency;
362     int VGPRUsageDiff;
363     unsigned NumSuccessors;
364     unsigned NumHighLatencySuccessors;
365     unsigned LastPosHighLatParentScheduled;
366     unsigned Height;
367 
368     SIBlockSchedCandidate() = default;
369 
370     bool isValid() const { return Block; }
371 
372     // Copy the status of another candidate without changing policy.
373     void setBest(SIBlockSchedCandidate &Best) {
374       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
375       Block = Best.Block;
376       Reason = Best.Reason;
377       IsHighLatency = Best.IsHighLatency;
378       VGPRUsageDiff = Best.VGPRUsageDiff;
379       NumSuccessors = Best.NumSuccessors;
380       NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
381       LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
382       Height = Best.Height;
383     }
384   };
385 
386   bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
387                            SIBlockSchedCandidate &TryCand);
388   bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
389                             SIBlockSchedCandidate &TryCand);
390   SIScheduleBlock *pickBlock();
391 
392   void addLiveRegs(std::set<unsigned> &Regs);
393   void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
394   void releaseBlockSuccs(SIScheduleBlock *Parent);
395   void blockScheduled(SIScheduleBlock *Block);
396 
397   // Check register pressure change
398   // by scheduling a block with these LiveIn and LiveOut.
399   std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
400                                        std::set<unsigned> &OutRegs);
401 
402   void schedule();
403 };
404 
/// Result of SIScheduler::scheduleVariant(): a list of SU numbers and the
/// maximum SGPR/VGPR usage that goes with it.
struct SIScheduleBlockResult {
  std::vector<unsigned> SUs;
  // Zero by default so a default-constructed result never carries
  // indeterminate usage values. The struct remains an aggregate.
  unsigned MaxSGPRUsage = 0;
  unsigned MaxVGPRUsage = 0;
};
410 
411 class SIScheduler {
412   SIScheduleDAGMI *DAG;
413   SIScheduleBlockCreator BlockCreator;
414 
415 public:
416   SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
417 
418   ~SIScheduler() = default;
419 
420   struct SIScheduleBlockResult
421   scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
422                   SISchedulerBlockSchedulerVariant ScheduleVariant);
423 };
424 
425 class SIScheduleDAGMI final : public ScheduleDAGMILive {
426   const SIInstrInfo *SITII;
427   const SIRegisterInfo *SITRI;
428 
429   std::vector<SUnit> SUnitsLinksBackup;
430 
431   // For moveLowLatencies. After all Scheduling variants are tested.
432   std::vector<unsigned> ScheduledSUnits;
433   std::vector<unsigned> ScheduledSUnitsInv;
434 
435 public:
436   SIScheduleDAGMI(MachineSchedContext *C);
437 
438   ~SIScheduleDAGMI() override;
439 
440   // Entry point for the schedule.
441   void schedule() override;
442 
443   // To init Block's RPTracker.
444   void initRPTracker(RegPressureTracker &RPTracker) {
445     RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
446   }
447 
448   MachineBasicBlock *getBB() { return BB; }
449   MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
450   MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
451   LiveIntervals *getLIS() { return LIS; }
452   MachineRegisterInfo *getMRI() { return &MRI; }
453   const TargetRegisterInfo *getTRI() { return TRI; }
454   ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
455   SUnit &getEntrySU() { return EntrySU; }
456   SUnit& getExitSU() { return ExitSU; }
457 
458   void restoreSULinksLeft();
459 
460   template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
461                                                      _Iterator End,
462                                                      unsigned &VgprUsage,
463                                                      unsigned &SgprUsage);
464 
465   std::set<unsigned> getInRegs() {
466     std::set<unsigned> InRegs;
467     for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
468       InRegs.insert(RegMaskPair.RegUnit);
469     }
470     return InRegs;
471   }
472 
473   std::set<unsigned> getOutRegs() {
474     std::set<unsigned> OutRegs;
475     for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
476       OutRegs.insert(RegMaskPair.RegUnit);
477     }
478     return OutRegs;
479   };
480 
481 private:
482   void topologicalSort();
483   // After scheduling is done, improve low latency placements.
484   void moveLowLatencies();
485 
486 public:
487   // Some stats for scheduling inside blocks.
488   std::vector<unsigned> IsLowLatencySU;
489   std::vector<unsigned> LowLatencyOffset;
490   std::vector<unsigned> IsHighLatencySU;
491   // Topological sort
492   // Maps topological index to the node number.
493   std::vector<int> TopDownIndex2SU;
494   std::vector<int> BottomUpIndex2SU;
495 };
496 
497 } // end namespace llvm
498 
499 #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
500