xref: /freebsd/contrib/llvm-project/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h (revision f5b7695d2d5abd735064870ad43f4b9c723940c1)
1 //===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 ///
10 /// This file implements the bottleneck analysis view.
11 ///
12 /// This view internally observes backend pressure increase events in order to
13 /// identify problematic data dependencies and processor resource interferences.
14 ///
15 /// Example of bottleneck analysis report for a dot-product on X86 btver2:
16 ///
17 /// Cycles with backend pressure increase [ 40.76% ]
18 /// Throughput Bottlenecks:
19 ///   Resource Pressure       [ 39.34% ]
20 ///   - JFPA  [ 39.34% ]
21 ///   - JFPU0  [ 39.34% ]
22 ///   Data Dependencies:      [ 1.42% ]
23 ///   - Register Dependencies [ 1.42% ]
24 ///   - Memory Dependencies   [ 0.00% ]
25 ///
26 /// According to the example, backend pressure increased during the 40.76% of
27 /// the simulated cycles.  In particular, the major cause of backend pressure
28 /// increases was the contention on floating point adder JFPA accessible from
29 /// pipeline resource JFPU0.
30 ///
31 /// At the end of each cycle, if pressure on the simulated out-of-order buffers
32 /// has increased, a backend pressure event is reported.
33 /// In particular, this occurs when there is a delta between the number of uOps
34 /// dispatched and the number of uOps issued to the underlying pipelines.
35 ///
36 /// The bottleneck analysis view is also responsible for identifying and printing
37 /// the most "critical" sequence of dependent instructions according to the
38 /// simulated run.
39 ///
40 /// Below is the critical sequence computed for the dot-product example on
41 /// btver2:
42 ///
43 ///              Instruction                     Dependency Information
44 /// +----< 2.    vhaddps %xmm3, %xmm3, %xmm4
45 /// |
46 /// |    < loop carried >
47 /// |
48 /// |      0.    vmulps	 %xmm0, %xmm0, %xmm2
49 /// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
50 /// +----> 2.    vhaddps %xmm3, %xmm3, %xmm4     ## REGISTER dependency:  %xmm3
51 /// |
52 /// |    < loop carried >
53 /// |
54 /// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
55 ///
56 ///
57 /// The algorithm that computes the critical sequence is very similar to a
58 /// critical path analysis.
59 ///
60 /// A dependency graph is used internally to track dependencies between nodes.
61 /// Nodes of the graph represent instructions from the input assembly sequence,
62 /// and edges of the graph represent data dependencies or processor resource
63 /// interferences.
64 ///
65 /// Edges are dynamically 'discovered' by observing instruction state transitions
66 /// and backend pressure increase events. Edges are internally ranked based on
67 /// their "criticality". A dependency is considered to be critical if it takes a
68 /// long time to execute, and if it contributes to backend pressure increases.
69 /// Criticality is internally measured in terms of cycles; it is computed for
70 /// every edge in the graph as a function of the edge latency and the number of
71 /// backend pressure increase cycles contributed by that edge.
72 ///
73 /// At the end of simulation, costs are propagated to nodes through the edges of
74 /// the graph, and the most expensive path connecting the root-set (a
75 /// set of nodes with no predecessors) to a leaf node is reported as critical
76 /// sequence.
77 //
78 //===----------------------------------------------------------------------===//
79 
80 #ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
81 #define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
82 
83 #include "Views/View.h"
84 #include "llvm/ADT/DenseMap.h"
85 #include "llvm/ADT/SmallVector.h"
86 #include "llvm/MC/MCInstPrinter.h"
87 #include "llvm/MC/MCSchedule.h"
88 #include "llvm/MC/MCSubtargetInfo.h"
89 #include "llvm/Support/raw_ostream.h"
90 
91 namespace llvm {
92 namespace mca {
93 
94 class PressureTracker {
95   const MCSchedModel &SM;
96 
97   // Resource pressure distribution. There is an element for every processor
98   // resource declared by the scheduling model. Quantities are number of cycles.
99   SmallVector<unsigned, 4> ResourcePressureDistribution;
100 
101   // Each processor resource is associated with a so-called processor resource
102   // mask. This vector allows to correlate processor resource IDs with processor
103   // resource masks. There is exactly one element per each processor resource
104   // declared by the scheduling model.
105   SmallVector<uint64_t, 4> ProcResID2Mask;
106 
107   // Maps processor resource state indices (returned by calls to
108   // `getResourceStateIndex(Mask)` to processor resource identifiers.
109   SmallVector<unsigned, 4> ResIdx2ProcResID;
110 
111   // Maps Processor Resource identifiers to ResourceUsers indices.
112   SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;
113 
114   // Identifies the last user of a processor resource unit.
115   // This vector is updated on every instruction issued event.
116   // There is one entry for every processor resource unit declared by the
117   // processor model. An all_ones value is treated like an invalid instruction
118   // identifier.
119   using User = std::pair<unsigned, unsigned>;
120   SmallVector<User, 4> ResourceUsers;
121 
122   struct InstructionPressureInfo {
123     unsigned RegisterPressureCycles;
124     unsigned MemoryPressureCycles;
125     unsigned ResourcePressureCycles;
126   };
127   DenseMap<unsigned, InstructionPressureInfo> IPI;
128 
129   void updateResourcePressureDistribution(uint64_t CumulativeMask);
130 
131   User getResourceUser(unsigned ProcResID, unsigned UnitID) const {
132     unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];
133     return ResourceUsers[Index + UnitID];
134   }
135 
136 public:
137   PressureTracker(const MCSchedModel &Model);
138 
139   ArrayRef<unsigned> getResourcePressureDistribution() const {
140     return ResourcePressureDistribution;
141   }
142 
143   void getResourceUsers(uint64_t ResourceMask,
144                         SmallVectorImpl<User> &Users) const;
145 
146   unsigned getRegisterPressureCycles(unsigned IID) const {
147     assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
148     const InstructionPressureInfo &Info = IPI.find(IID)->second;
149     return Info.RegisterPressureCycles;
150   }
151 
152   unsigned getMemoryPressureCycles(unsigned IID) const {
153     assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
154     const InstructionPressureInfo &Info = IPI.find(IID)->second;
155     return Info.MemoryPressureCycles;
156   }
157 
158   unsigned getResourcePressureCycles(unsigned IID) const {
159     assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
160     const InstructionPressureInfo &Info = IPI.find(IID)->second;
161     return Info.ResourcePressureCycles;
162   }
163 
164   const char *resolveResourceName(uint64_t ResourceMask) const {
165     unsigned Index = getResourceStateIndex(ResourceMask);
166     unsigned ProcResID = ResIdx2ProcResID[Index];
167     const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
168     return PRDesc.Name;
169   }
170 
171   void onInstructionDispatched(unsigned IID);
172   void onInstructionExecuted(unsigned IID);
173 
174   void handlePressureEvent(const HWPressureEvent &Event);
175   void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
176 };
177 
178 // A dependency edge.
179 struct DependencyEdge {
180   enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
181 
182   // Dependency edge descriptor.
183   //
184   // It specifies the dependency type, as well as the edge cost in cycles.
185   struct Dependency {
186     DependencyType Type;
187     uint64_t ResourceOrRegID;
188     uint64_t Cost;
189   };
190   Dependency Dep;
191 
192   unsigned FromIID;
193   unsigned ToIID;
194 
195   // Used by the bottleneck analysis to compute the interference
196   // probability for processor resources.
197   unsigned Frequency;
198 };
199 
200 // A dependency graph used by the bottleneck analysis to describe data
201 // dependencies and processor resource interferences between instructions.
202 //
203 // There is a node (an instance of struct DGNode) for every instruction in the
204 // input assembly sequence. Edges of the graph represent dependencies between
205 // instructions.
206 //
207 // Each edge of the graph is associated with a cost value which is used
208 // internally to rank dependency based on their impact on the runtime
209 // performance (see field DependencyEdge::Dependency::Cost). In general, the
210 // higher the cost of an edge, the higher the impact on performance.
211 //
212 // The cost of a dependency is a function of both the latency and the number of
213 // cycles where the dependency has been seen as critical (i.e. contributing to
214 // back-pressure increases).
215 //
216 // Loop carried dependencies are carefully expanded by the bottleneck analysis
217 // to guarantee that the graph stays acyclic. To this end, extra nodes are
218 // pre-allocated at construction time to describe instructions from "past and
219 // future" iterations. The graph is kept acyclic mainly because it simplifies the
220 // complexity of the algorithm that computes the critical sequence.
221 class DependencyGraph {
222   struct DGNode {
223     unsigned NumPredecessors;
224     unsigned NumVisitedPredecessors;
225     uint64_t Cost;
226     unsigned Depth;
227 
228     DependencyEdge CriticalPredecessor;
229     SmallVector<DependencyEdge, 8> OutgoingEdges;
230   };
231   SmallVector<DGNode, 16> Nodes;
232 
233   DependencyGraph(const DependencyGraph &) = delete;
234   DependencyGraph &operator=(const DependencyGraph &) = delete;
235 
236   void addDependency(unsigned From, unsigned To,
237                      DependencyEdge::Dependency &&DE);
238 
239   void pruneEdges(unsigned Iterations);
240   void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
241   void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations);
242 
243 #ifndef NDEBUG
244   void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
245                           MCInstPrinter &MCIP) const;
246 #endif
247 
248 public:
249   DependencyGraph(unsigned Size) : Nodes(Size) {}
250 
251   void addRegisterDep(unsigned From, unsigned To, unsigned RegID,
252                       unsigned Cost) {
253     addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost});
254   }
255 
256   void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {
257     addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost});
258   }
259 
260   void addResourceDep(unsigned From, unsigned To, uint64_t Mask,
261                       unsigned Cost) {
262     addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});
263   }
264 
265   // Called by the bottleneck analysis at the end of simulation to propagate
266   // costs through the edges of the graph, and compute a critical path.
267   void finalizeGraph(unsigned Iterations) {
268     SmallVector<unsigned, 16> RootSet;
269     pruneEdges(Iterations);
270     initializeRootSet(RootSet);
271     propagateThroughEdges(RootSet, Iterations);
272   }
273 
274   // Returns a sequence of edges representing the critical sequence based on the
275   // simulated run. It assumes that the graph has already been finalized (i.e.
276   // method `finalizeGraph()` has already been called on this graph).
277   void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
278 
279 #ifndef NDEBUG
280   void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
281 #endif
282 };
283 
284 /// A view that collects and prints a few performance numbers.
285 class BottleneckAnalysis : public View {
286   const MCSubtargetInfo &STI;
287   MCInstPrinter &MCIP;
288   PressureTracker Tracker;
289   DependencyGraph DG;
290 
291   ArrayRef<MCInst> Source;
292   unsigned Iterations;
293   unsigned TotalCycles;
294 
295   bool PressureIncreasedBecauseOfResources;
296   bool PressureIncreasedBecauseOfRegisterDependencies;
297   bool PressureIncreasedBecauseOfMemoryDependencies;
298   // True if throughput was affected by dispatch stalls.
299   bool SeenStallCycles;
300 
301   struct BackPressureInfo {
302     // Cycles where backpressure increased.
303     unsigned PressureIncreaseCycles;
304     // Cycles where backpressure increased because of pipeline pressure.
305     unsigned ResourcePressureCycles;
306     // Cycles where backpressure increased because of data dependencies.
307     unsigned DataDependencyCycles;
308     // Cycles where backpressure increased because of register dependencies.
309     unsigned RegisterDependencyCycles;
310     // Cycles where backpressure increased because of memory dependencies.
311     unsigned MemoryDependencyCycles;
312   };
313   BackPressureInfo BPI;
314 
315   // Used to populate the dependency graph DG.
316   void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);
317   void addMemoryDep(unsigned From, unsigned To, unsigned Cy);
318   void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);
319 
320   // Prints a bottleneck message to OS.
321   void printBottleneckHints(raw_ostream &OS) const;
322   void printCriticalSequence(raw_ostream &OS) const;
323 
324 public:
325   BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
326                      ArrayRef<MCInst> Sequence, unsigned Iterations);
327 
328   void onCycleEnd() override;
329   void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }
330   void onEvent(const HWPressureEvent &Event) override;
331   void onEvent(const HWInstructionEvent &Event) override;
332 
333   void printView(raw_ostream &OS) const override;
334 
335 #ifndef NDEBUG
336   void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }
337 #endif
338 };
339 
340 } // namespace mca
341 } // namespace llvm
342 
343 #endif
344