xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.h (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 //===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI Machine Scheduler interface
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
15 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
16 
17 #include "llvm/CodeGen/MachineScheduler.h"
18 #include "llvm/CodeGen/RegisterPressure.h"
19 #include "llvm/CodeGen/ScheduleDAG.h"
20 #include <cstdint>
21 #include <set>
22 #include <vector>
23 
24 namespace llvm {
25 
26 class SIInstrInfo;
27 class SIRegisterInfo;
28 
29 enum SIScheduleCandReason {
30   NoCand,
31   RegUsage,
32   Latency,
33   Successor,
34   Depth,
35   NodeOrder
36 };
37 
38 struct SISchedulerCandidate {
39   // The reason for this candidate.
40   SIScheduleCandReason Reason = NoCand;
41 
42   // Set of reasons that apply to multiple candidates.
43   uint32_t RepeatReasonSet = 0;
44 
45   SISchedulerCandidate() = default;
46 
47   bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
48   void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
49 };
50 
51 class SIScheduleDAGMI;
52 class SIScheduleBlockCreator;
53 
54 enum SIScheduleBlockLinkKind {
55   NoData,
56   Data
57 };
58 
59 class SIScheduleBlock {
60   SIScheduleDAGMI *DAG;
61   SIScheduleBlockCreator *BC;
62 
63   std::vector<SUnit*> SUnits;
64   std::map<unsigned, unsigned> NodeNum2Index;
65   std::vector<SUnit*> TopReadySUs;
66   std::vector<SUnit*> ScheduledSUnits;
67 
68   /// The top of the unscheduled zone.
69   IntervalPressure TopPressure;
70   RegPressureTracker TopRPTracker;
71 
72   // Pressure: number of said class of registers needed to
73   // store the live virtual and real registers.
74   // We do care only of SGPR32 and VGPR32 and do track only virtual registers.
75   // Pressure of additional registers required inside the block.
76   std::vector<unsigned> InternalAdditionnalPressure;
77   // Pressure of input and output registers
78   std::vector<unsigned> LiveInPressure;
79   std::vector<unsigned> LiveOutPressure;
80   // Registers required by the block, and outputs.
81   // We do track only virtual registers.
82   // Note that some registers are not 32 bits,
83   // and thus the pressure is not equal
84   // to the number of live registers.
85   std::set<unsigned> LiveInRegs;
86   std::set<unsigned> LiveOutRegs;
87 
88   bool Scheduled = false;
89   bool HighLatencyBlock = false;
90 
91   std::vector<unsigned> HasLowLatencyNonWaitedParent;
92 
93   // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
94   unsigned ID;
95 
96   std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
97   // All blocks successors, and the kind of link
98   std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
99   unsigned NumHighLatencySuccessors = 0;
100 
101 public:
102   SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
103                   unsigned ID):
104     DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {}
105 
106   ~SIScheduleBlock() = default;
107 
108   unsigned getID() const { return ID; }
109 
110   /// Functions for Block construction.
111   void addUnit(SUnit *SU);
112 
113   // When all SUs have been added.
114   void finalizeUnits();
115 
116   // Add block pred, which has instruction predecessor of SU.
117   void addPred(SIScheduleBlock *Pred);
118   void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);
119 
120   const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
121   ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
122     getSuccs() const { return Succs; }
123 
124   unsigned Height;  // Maximum topdown path length to block without outputs
125   unsigned Depth;   // Maximum bottomup path length to block without inputs
126 
127   unsigned getNumHighLatencySuccessors() const {
128     return NumHighLatencySuccessors;
129   }
130 
131   bool isHighLatencyBlock() { return HighLatencyBlock; }
132 
133   // This is approximative.
134   // Ideally should take into accounts some instructions (rcp, etc)
135   // are 4 times slower.
136   int getCost() { return SUnits.size(); }
137 
138   // The block Predecessors and Successors must be all registered
139   // before fastSchedule().
140   // Fast schedule with no particular requirement.
141   void fastSchedule();
142 
143   std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
144 
145   // Complete schedule that will try to minimize reg pressure and
146   // low latencies, and will fill liveins and liveouts.
147   // Needs all MIs to be grouped between BeginBlock and EndBlock.
148   // The MIs can be moved after the scheduling,
149   // it is just used to allow correct track of live registers.
150   void schedule(MachineBasicBlock::iterator BeginBlock,
151                 MachineBasicBlock::iterator EndBlock);
152 
153   bool isScheduled() { return Scheduled; }
154 
155   // Needs the block to be scheduled inside
156   // TODO: find a way to compute it.
157   std::vector<unsigned> &getInternalAdditionnalRegUsage() {
158     return InternalAdditionnalPressure;
159   }
160 
161   std::set<unsigned> &getInRegs() { return LiveInRegs; }
162   std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
163 
164   void printDebug(bool Full);
165 
166 private:
167   struct SISchedCandidate : SISchedulerCandidate {
168     // The best SUnit candidate.
169     SUnit *SU = nullptr;
170 
171     unsigned SGPRUsage;
172     unsigned VGPRUsage;
173     bool IsLowLatency;
174     unsigned LowLatencyOffset;
175     bool HasLowLatencyNonWaitedParent;
176 
177     SISchedCandidate() = default;
178 
179     bool isValid() const { return SU; }
180 
181     // Copy the status of another candidate without changing policy.
182     void setBest(SISchedCandidate &Best) {
183       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
184       SU = Best.SU;
185       Reason = Best.Reason;
186       SGPRUsage = Best.SGPRUsage;
187       VGPRUsage = Best.VGPRUsage;
188       IsLowLatency = Best.IsLowLatency;
189       LowLatencyOffset = Best.LowLatencyOffset;
190       HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
191     }
192   };
193 
194   void undoSchedule();
195 
196   void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
197   void releaseSucc(SUnit *SU, SDep *SuccEdge);
198   // InOrOutBlock: restrict to links pointing inside the block (true),
199   // or restrict to links pointing outside the block (false).
200   void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
201 
202   void nodeScheduled(SUnit *SU);
203   void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
204   void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
205   SUnit* pickNode();
206   void traceCandidate(const SISchedCandidate &Cand);
207   void initRegPressure(MachineBasicBlock::iterator BeginBlock,
208                        MachineBasicBlock::iterator EndBlock);
209 };
210 
211 struct SIScheduleBlocks {
212   std::vector<SIScheduleBlock*> Blocks;
213   std::vector<int> TopDownIndex2Block;
214   std::vector<int> TopDownBlock2Index;
215 };
216 
217 enum SISchedulerBlockCreatorVariant {
218   LatenciesAlone,
219   LatenciesGrouped,
220   LatenciesAlonePlusConsecutive
221 };
222 
223 class SIScheduleBlockCreator {
224   SIScheduleDAGMI *DAG;
225   // unique_ptr handles freeing memory for us.
226   std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
227   std::map<SISchedulerBlockCreatorVariant,
228            SIScheduleBlocks> Blocks;
229   std::vector<SIScheduleBlock*> CurrentBlocks;
230   std::vector<int> Node2CurrentBlock;
231 
232   // Topological sort
233   // Maps topological index to the node number.
234   std::vector<int> TopDownIndex2Block;
235   std::vector<int> TopDownBlock2Index;
236   std::vector<int> BottomUpIndex2Block;
237 
238   // 0 -> Color not given.
239   // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
240   // Above -> Other groups.
241   int NextReservedID;
242   int NextNonReservedID;
243   std::vector<int> CurrentColoring;
244   std::vector<int> CurrentTopDownReservedDependencyColoring;
245   std::vector<int> CurrentBottomUpReservedDependencyColoring;
246 
247 public:
248   SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
249 
250   SIScheduleBlocks
251   getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
252 
253   bool isSUInBlock(SUnit *SU, unsigned ID);
254 
255 private:
256   // Give a Reserved color to every high latency.
257   void colorHighLatenciesAlone();
258 
259   // Create groups of high latencies with a Reserved color.
260   void colorHighLatenciesGroups();
261 
262   // Compute coloring for topdown and bottom traversals with
263   // different colors depending on dependencies on Reserved colors.
264   void colorComputeReservedDependencies();
265 
266   // Give color to all non-colored SUs according to Reserved groups dependencies.
267   void colorAccordingToReservedDependencies();
268 
269   // Divides Blocks having no bottom up or top down dependencies on Reserved groups.
270   // The new colors are computed according to the dependencies on the other blocks
271   // formed with colorAccordingToReservedDependencies.
272   void colorEndsAccordingToDependencies();
273 
274   // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
275   void colorForceConsecutiveOrderInGroup();
276 
277   // Merge Constant loads that have all their users into another group to the group.
278   // (TODO: else if all their users depend on the same group, put them there)
279   void colorMergeConstantLoadsNextGroup();
280 
281   // Merge SUs that have all their users into another group to the group
282   void colorMergeIfPossibleNextGroup();
283 
284   // Merge SUs that have all their users into another group to the group,
285   // but only for Reserved groups.
286   void colorMergeIfPossibleNextGroupOnlyForReserved();
287 
288   // Merge SUs that have all their users into another group to the group,
289   // but only if the group is no more than a few SUs.
290   void colorMergeIfPossibleSmallGroupsToNextGroup();
291 
292   // Divides Blocks with important size.
293   // Idea of implementation: attribute new colors depending on topdown and
294   // bottom up links to other blocks.
295   void cutHugeBlocks();
296 
297   // Put in one group all instructions with no users in this scheduling region
298   // (we'd want these groups be at the end).
299   void regroupNoUserInstructions();
300 
301   // Give Reserved color to export instructions
302   void colorExports();
303 
304   void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
305 
306   void topologicalSort();
307 
308   void scheduleInsideBlocks();
309 
310   void fillStats();
311 };
312 
313 enum SISchedulerBlockSchedulerVariant {
314   BlockLatencyRegUsage,
315   BlockRegUsageLatency,
316   BlockRegUsage
317 };
318 
319 class SIScheduleBlockScheduler {
320   SIScheduleDAGMI *DAG;
321   SISchedulerBlockSchedulerVariant Variant;
322   std::vector<SIScheduleBlock*> Blocks;
323 
324   std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
325   std::set<unsigned> LiveRegs;
326   // Num of schedulable unscheduled blocks reading the register.
327   std::map<unsigned, unsigned> LiveRegsConsumers;
328 
329   std::vector<unsigned> LastPosHighLatencyParentScheduled;
330   int LastPosWaitedHighLatency;
331 
332   std::vector<SIScheduleBlock*> BlocksScheduled;
333   unsigned NumBlockScheduled;
334   std::vector<SIScheduleBlock*> ReadyBlocks;
335 
336   unsigned VregCurrentUsage;
337   unsigned SregCurrentUsage;
338 
339   // Currently is only approximation.
340   unsigned maxVregUsage;
341   unsigned maxSregUsage;
342 
343   std::vector<unsigned> BlockNumPredsLeft;
344   std::vector<unsigned> BlockNumSuccsLeft;
345 
346 public:
347   SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
348                            SISchedulerBlockSchedulerVariant Variant,
349                            SIScheduleBlocks BlocksStruct);
350   ~SIScheduleBlockScheduler() = default;
351 
352   std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
353 
354   unsigned getVGPRUsage() { return maxVregUsage; }
355   unsigned getSGPRUsage() { return maxSregUsage; }
356 
357 private:
358   struct SIBlockSchedCandidate : SISchedulerCandidate {
359     // The best Block candidate.
360     SIScheduleBlock *Block = nullptr;
361 
362     bool IsHighLatency;
363     int VGPRUsageDiff;
364     unsigned NumSuccessors;
365     unsigned NumHighLatencySuccessors;
366     unsigned LastPosHighLatParentScheduled;
367     unsigned Height;
368 
369     SIBlockSchedCandidate() = default;
370 
371     bool isValid() const { return Block; }
372 
373     // Copy the status of another candidate without changing policy.
374     void setBest(SIBlockSchedCandidate &Best) {
375       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
376       Block = Best.Block;
377       Reason = Best.Reason;
378       IsHighLatency = Best.IsHighLatency;
379       VGPRUsageDiff = Best.VGPRUsageDiff;
380       NumSuccessors = Best.NumSuccessors;
381       NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
382       LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
383       Height = Best.Height;
384     }
385   };
386 
387   bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
388                            SIBlockSchedCandidate &TryCand);
389   bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
390                             SIBlockSchedCandidate &TryCand);
391   SIScheduleBlock *pickBlock();
392 
393   void addLiveRegs(std::set<unsigned> &Regs);
394   void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
395   void releaseBlockSuccs(SIScheduleBlock *Parent);
396   void blockScheduled(SIScheduleBlock *Block);
397 
398   // Check register pressure change
399   // by scheduling a block with these LiveIn and LiveOut.
400   std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
401                                        std::set<unsigned> &OutRegs);
402 
403   void schedule();
404 };
405 
406 struct SIScheduleBlockResult {
407   std::vector<unsigned> SUs;
408   unsigned MaxSGPRUsage;
409   unsigned MaxVGPRUsage;
410 };
411 
412 class SIScheduler {
413   SIScheduleDAGMI *DAG;
414   SIScheduleBlockCreator BlockCreator;
415 
416 public:
417   SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
418 
419   ~SIScheduler() = default;
420 
421   struct SIScheduleBlockResult
422   scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
423                   SISchedulerBlockSchedulerVariant ScheduleVariant);
424 };
425 
426 class SIScheduleDAGMI final : public ScheduleDAGMILive {
427   const SIInstrInfo *SITII;
428   const SIRegisterInfo *SITRI;
429 
430   std::vector<SUnit> SUnitsLinksBackup;
431 
432   // For moveLowLatencies. After all Scheduling variants are tested.
433   std::vector<unsigned> ScheduledSUnits;
434   std::vector<unsigned> ScheduledSUnitsInv;
435 
436 public:
437   SIScheduleDAGMI(MachineSchedContext *C);
438 
439   ~SIScheduleDAGMI() override;
440 
441   // Entry point for the schedule.
442   void schedule() override;
443 
444   // To init Block's RPTracker.
445   void initRPTracker(RegPressureTracker &RPTracker) {
446     RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
447   }
448 
449   MachineBasicBlock *getBB() { return BB; }
450   MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
451   MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
452   LiveIntervals *getLIS() { return LIS; }
453   MachineRegisterInfo *getMRI() { return &MRI; }
454   const TargetRegisterInfo *getTRI() { return TRI; }
455   ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
456   SUnit &getEntrySU() { return EntrySU; }
457   SUnit& getExitSU() { return ExitSU; }
458 
459   void restoreSULinksLeft();
460 
461   template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
462                                                      _Iterator End,
463                                                      unsigned &VgprUsage,
464                                                      unsigned &SgprUsage);
465 
466   std::set<unsigned> getInRegs() {
467     std::set<unsigned> InRegs;
468     for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
469       InRegs.insert(RegMaskPair.RegUnit);
470     }
471     return InRegs;
472   }
473 
474   std::set<unsigned> getOutRegs() {
475     std::set<unsigned> OutRegs;
476     for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
477       OutRegs.insert(RegMaskPair.RegUnit);
478     }
479     return OutRegs;
480   };
481 
482 private:
483   void topologicalSort();
484   // After scheduling is done, improve low latency placements.
485   void moveLowLatencies();
486 
487 public:
488   // Some stats for scheduling inside blocks.
489   std::vector<unsigned> IsLowLatencySU;
490   std::vector<unsigned> LowLatencyOffset;
491   std::vector<unsigned> IsHighLatencySU;
492   // Topological sort
493   // Maps topological index to the node number.
494   std::vector<int> TopDownIndex2SU;
495   std::vector<int> BottomUpIndex2SU;
496 };
497 
498 } // end namespace llvm
499 
500 #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
501