xref: /freebsd/contrib/llvm-project/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 ///
10 /// This file implements the bottleneck analysis view.
11 ///
12 /// This view internally observes backend pressure increase events in order to
13 /// identify problematic data dependencies and processor resource interferences.
14 ///
15 /// Example of bottleneck analysis report for a dot-product on X86 btver2:
16 ///
17 /// Cycles with backend pressure increase [ 40.76% ]
18 /// Throughput Bottlenecks:
19 ///   Resource Pressure       [ 39.34% ]
20 ///   - JFPA  [ 39.34% ]
21 ///   - JFPU0  [ 39.34% ]
22 ///   Data Dependencies:      [ 1.42% ]
23 ///   - Register Dependencies [ 1.42% ]
24 ///   - Memory Dependencies   [ 0.00% ]
25 ///
26 /// According to the example, backend pressure increased during the 40.76% of
27 /// the simulated cycles.  In particular, the major cause of backend pressure
28 /// increases was the contention on floating point adder JFPA accessible from
29 /// pipeline resource JFPU0.
30 ///
31 /// At the end of each cycle, if pressure on the simulated out-of-order buffers
32 /// has increased, a backend pressure event is reported.
33 /// In particular, this occurs when there is a delta between the number of uOps
34 /// dispatched and the number of uOps issued to the underlying pipelines.
35 ///
36 /// The bottleneck analysis view is also responsible for identifying and
37 /// printing the most "critical" sequence of dependent instructions according to
38 /// the simulated run.
39 ///
40 /// Below is the critical sequence computed for the dot-product example on
41 /// btver2:
42 ///
43 ///              Instruction                     Dependency Information
44 /// +----< 2.    vhaddps %xmm3, %xmm3, %xmm4
45 /// |
46 /// |    < loop carried >
47 /// |
48 /// |      0.    vmulps	 %xmm0, %xmm0, %xmm2
49 /// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
50 /// +----> 2.    vhaddps %xmm3, %xmm3, %xmm4     ## REGISTER dependency:  %xmm3
51 /// |
52 /// |    < loop carried >
53 /// |
54 /// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
55 ///
56 ///
57 /// The algorithm that computes the critical sequence is very similar to a
58 /// critical path analysis.
59 ///
60 /// A dependency graph is used internally to track dependencies between nodes.
61 /// Nodes of the graph represent instructions from the input assembly sequence,
62 /// and edges of the graph represent data dependencies or processor resource
63 /// interferences.
64 ///
65 /// Edges are dynamically 'discovered' by observing instruction state
66 /// transitions and backend pressure increase events. Edges are internally
67 /// ranked based on their "criticality". A dependency is considered to be
68 /// critical if it takes a long time to execute, and if it contributes to
69 /// backend pressure increases. Criticality is internally measured in terms of
70 /// cycles; it is computed for every edge in the graph as a function of the edge
71 /// latency and the number of backend pressure increase cycles contributed by
72 /// that edge.
73 ///
74 /// At the end of simulation, costs are propagated to nodes through the edges of
75 /// the graph, and the most expensive path connecting the root-set (a
76 /// set of nodes with no predecessors) to a leaf node is reported as critical
77 /// sequence.
78 //
79 //===----------------------------------------------------------------------===//
80 
81 #ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
82 #define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
83 
84 #include "Views/InstructionView.h"
85 #include "llvm/ADT/DenseMap.h"
86 #include "llvm/ADT/SmallVector.h"
87 #include "llvm/MC/MCInstPrinter.h"
88 #include "llvm/MC/MCSchedule.h"
89 #include "llvm/MC/MCSubtargetInfo.h"
90 #include "llvm/Support/FormattedStream.h"
91 #include "llvm/Support/raw_ostream.h"
92 
93 namespace llvm {
94 namespace mca {
95 
96 class PressureTracker {
97   const MCSchedModel &SM;
98 
99   // Resource pressure distribution. There is an element for every processor
100   // resource declared by the scheduling model. Quantities are number of cycles.
101   SmallVector<unsigned, 4> ResourcePressureDistribution;
102 
103   // Each processor resource is associated with a so-called processor resource
104   // mask. This vector allows to correlate processor resource IDs with processor
105   // resource masks. There is exactly one element per each processor resource
106   // declared by the scheduling model.
107   SmallVector<uint64_t, 4> ProcResID2Mask;
108 
109   // Maps processor resource state indices (returned by calls to
110   // `getResourceStateIndex(Mask)` to processor resource identifiers.
111   SmallVector<unsigned, 4> ResIdx2ProcResID;
112 
113   // Maps Processor Resource identifiers to ResourceUsers indices.
114   SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;
115 
116   // Identifies the last user of a processor resource unit.
117   // This vector is updated on every instruction issued event.
118   // There is one entry for every processor resource unit declared by the
119   // processor model. An all_ones value is treated like an invalid instruction
120   // identifier.
121   using User = std::pair<unsigned, unsigned>;
122   SmallVector<User, 4> ResourceUsers;
123 
124   struct InstructionPressureInfo {
125     unsigned RegisterPressureCycles;
126     unsigned MemoryPressureCycles;
127     unsigned ResourcePressureCycles;
128   };
129   DenseMap<unsigned, InstructionPressureInfo> IPI;
130 
131   void updateResourcePressureDistribution(uint64_t CumulativeMask);
132 
getResourceUser(unsigned ProcResID,unsigned UnitID)133   User getResourceUser(unsigned ProcResID, unsigned UnitID) const {
134     unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];
135     return ResourceUsers[Index + UnitID];
136   }
137 
138 public:
139   PressureTracker(const MCSchedModel &Model);
140 
getResourcePressureDistribution()141   ArrayRef<unsigned> getResourcePressureDistribution() const {
142     return ResourcePressureDistribution;
143   }
144 
145   void getResourceUsers(uint64_t ResourceMask,
146                         SmallVectorImpl<User> &Users) const;
147 
getRegisterPressureCycles(unsigned IID)148   unsigned getRegisterPressureCycles(unsigned IID) const {
149     assert(IPI.contains(IID) && "Instruction is not tracked!");
150     const InstructionPressureInfo &Info = IPI.find(IID)->second;
151     return Info.RegisterPressureCycles;
152   }
153 
getMemoryPressureCycles(unsigned IID)154   unsigned getMemoryPressureCycles(unsigned IID) const {
155     assert(IPI.contains(IID) && "Instruction is not tracked!");
156     const InstructionPressureInfo &Info = IPI.find(IID)->second;
157     return Info.MemoryPressureCycles;
158   }
159 
getResourcePressureCycles(unsigned IID)160   unsigned getResourcePressureCycles(unsigned IID) const {
161     assert(IPI.contains(IID) && "Instruction is not tracked!");
162     const InstructionPressureInfo &Info = IPI.find(IID)->second;
163     return Info.ResourcePressureCycles;
164   }
165 
resolveResourceName(uint64_t ResourceMask)166   const char *resolveResourceName(uint64_t ResourceMask) const {
167     unsigned Index = getResourceStateIndex(ResourceMask);
168     unsigned ProcResID = ResIdx2ProcResID[Index];
169     const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
170     return PRDesc.Name;
171   }
172 
173   void onInstructionDispatched(unsigned IID);
174   void onInstructionExecuted(unsigned IID);
175 
176   void handlePressureEvent(const HWPressureEvent &Event);
177   void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
178 };
179 
180 // A dependency edge.
181 struct DependencyEdge {
182   enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
183 
184   // Dependency edge descriptor.
185   //
186   // It specifies the dependency type, as well as the edge cost in cycles.
187   struct Dependency {
188     DependencyType Type;
189     uint64_t ResourceOrRegID;
190     uint64_t Cost;
191   };
192   Dependency Dep;
193 
194   unsigned FromIID;
195   unsigned ToIID;
196 
197   // Used by the bottleneck analysis to compute the interference
198   // probability for processor resources.
199   unsigned Frequency;
200 };
201 
202 // A dependency graph used by the bottleneck analysis to describe data
203 // dependencies and processor resource interferences between instructions.
204 //
205 // There is a node (an instance of struct DGNode) for every instruction in the
206 // input assembly sequence. Edges of the graph represent dependencies between
207 // instructions.
208 //
209 // Each edge of the graph is associated with a cost value which is used
210 // internally to rank dependency based on their impact on the runtime
211 // performance (see field DependencyEdge::Dependency::Cost). In general, the
212 // higher the cost of an edge, the higher the impact on performance.
213 //
214 // The cost of a dependency is a function of both the latency and the number of
215 // cycles where the dependency has been seen as critical (i.e. contributing to
216 // back-pressure increases).
217 //
218 // Loop carried dependencies are carefully expanded by the bottleneck analysis
219 // to guarantee that the graph stays acyclic. To this end, extra nodes are
220 // pre-allocated at construction time to describe instructions from "past and
221 // future" iterations. The graph is kept acyclic mainly because it simplifies
222 // the complexity of the algorithm that computes the critical sequence.
223 class DependencyGraph {
224   struct DGNode {
225     unsigned NumPredecessors;
226     unsigned NumVisitedPredecessors;
227     uint64_t Cost;
228     unsigned Depth;
229 
230     DependencyEdge CriticalPredecessor;
231     // Measurements show that more than 90% of nodes have no outgoing edges. To
232     // minimize memory consumption we use SmallVector with zero inline elements
233     // that is preferred version of std::vector.
234     SmallVector<DependencyEdge, 0> OutgoingEdges;
235   };
236   SmallVector<DGNode, 16> Nodes;
237 
238   DependencyGraph(const DependencyGraph &) = delete;
239   DependencyGraph &operator=(const DependencyGraph &) = delete;
240 
241   void addDependency(unsigned From, unsigned To,
242                      DependencyEdge::Dependency &&DE);
243 
244   void pruneEdges(unsigned Iterations);
245   void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
246   void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet,
247                              unsigned Iterations);
248 
249 #ifndef NDEBUG
250   void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
251                           MCInstPrinter &MCIP) const;
252 #endif
253 
254 public:
DependencyGraph(unsigned Size)255   DependencyGraph(unsigned Size) : Nodes(Size) {}
256 
addRegisterDep(unsigned From,unsigned To,unsigned RegID,unsigned Cost)257   void addRegisterDep(unsigned From, unsigned To, unsigned RegID,
258                       unsigned Cost) {
259     addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost});
260   }
261 
addMemoryDep(unsigned From,unsigned To,unsigned Cost)262   void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {
263     addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost});
264   }
265 
addResourceDep(unsigned From,unsigned To,uint64_t Mask,unsigned Cost)266   void addResourceDep(unsigned From, unsigned To, uint64_t Mask,
267                       unsigned Cost) {
268     addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});
269   }
270 
271   // Called by the bottleneck analysis at the end of simulation to propagate
272   // costs through the edges of the graph, and compute a critical path.
finalizeGraph(unsigned Iterations)273   void finalizeGraph(unsigned Iterations) {
274     SmallVector<unsigned, 16> RootSet;
275     pruneEdges(Iterations);
276     initializeRootSet(RootSet);
277     propagateThroughEdges(RootSet, Iterations);
278   }
279 
280   // Returns a sequence of edges representing the critical sequence based on the
281   // simulated run. It assumes that the graph has already been finalized (i.e.
282   // method `finalizeGraph()` has already been called on this graph).
283   void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
284 
285 #ifndef NDEBUG
286   void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
287 #endif
288 };
289 
290 /// A view that collects and prints a few performance numbers.
291 class BottleneckAnalysis : public InstructionView {
292   PressureTracker Tracker;
293   DependencyGraph DG;
294 
295   unsigned Iterations;
296   unsigned TotalCycles;
297 
298   bool PressureIncreasedBecauseOfResources;
299   bool PressureIncreasedBecauseOfRegisterDependencies;
300   bool PressureIncreasedBecauseOfMemoryDependencies;
301   // True if throughput was affected by dispatch stalls.
302   bool SeenStallCycles;
303 
304   struct BackPressureInfo {
305     // Cycles where backpressure increased.
306     unsigned PressureIncreaseCycles;
307     // Cycles where backpressure increased because of pipeline pressure.
308     unsigned ResourcePressureCycles;
309     // Cycles where backpressure increased because of data dependencies.
310     unsigned DataDependencyCycles;
311     // Cycles where backpressure increased because of register dependencies.
312     unsigned RegisterDependencyCycles;
313     // Cycles where backpressure increased because of memory dependencies.
314     unsigned MemoryDependencyCycles;
315   };
316   BackPressureInfo BPI;
317 
318   // Used to populate the dependency graph DG.
319   void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);
320   void addMemoryDep(unsigned From, unsigned To, unsigned Cy);
321   void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);
322 
323   void printInstruction(formatted_raw_ostream &FOS, const MCInst &MCI,
324                         bool UseDifferentColor = false) const;
325 
326   // Prints a bottleneck message to OS.
327   void printBottleneckHints(raw_ostream &OS) const;
328   void printCriticalSequence(raw_ostream &OS) const;
329 
330 public:
331   BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
332                      ArrayRef<MCInst> Sequence, unsigned Iterations);
333 
334   void onCycleEnd() override;
onEvent(const HWStallEvent & Event)335   void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }
336   void onEvent(const HWPressureEvent &Event) override;
337   void onEvent(const HWInstructionEvent &Event) override;
338 
339   void printView(raw_ostream &OS) const override;
getNameAsString()340   StringRef getNameAsString() const override { return "BottleneckAnalysis"; }
isSerializable()341   bool isSerializable() const override { return true; }
342   json::Value toJSON() const override;
343 
344 #ifndef NDEBUG
dump(raw_ostream & OS,MCInstPrinter & MCIP)345   void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }
346 #endif
347 };
348 
349 } // namespace mca
350 } // namespace llvm
351 
352 #endif
353