1 //===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// 10 /// This file implements the bottleneck analysis view. 11 /// 12 /// This view internally observes backend pressure increase events in order to 13 /// identify problematic data dependencies and processor resource interferences. 14 /// 15 /// Example of bottleneck analysis report for a dot-product on X86 btver2: 16 /// 17 /// Cycles with backend pressure increase [ 40.76% ] 18 /// Throughput Bottlenecks: 19 /// Resource Pressure [ 39.34% ] 20 /// - JFPA [ 39.34% ] 21 /// - JFPU0 [ 39.34% ] 22 /// Data Dependencies: [ 1.42% ] 23 /// - Register Dependencies [ 1.42% ] 24 /// - Memory Dependencies [ 0.00% ] 25 /// 26 /// According to the example, backend pressure increased during the 40.76% of 27 /// the simulated cycles. In particular, the major cause of backend pressure 28 /// increases was the contention on floating point adder JFPA accessible from 29 /// pipeline resource JFPU0. 30 /// 31 /// At the end of each cycle, if pressure on the simulated out-of-order buffers 32 /// has increased, a backend pressure event is reported. 33 /// In particular, this occurs when there is a delta between the number of uOps 34 /// dispatched and the number of uOps issued to the underlying pipelines. 35 /// 36 /// The bottleneck analysis view is also responsible for identifying and 37 /// printing the most "critical" sequence of dependent instructions according to 38 /// the simulated run. 39 /// 40 /// Below is the critical sequence computed for the dot-product example on 41 /// btver2: 42 /// 43 /// Instruction Dependency Information 44 /// +----< 2. vhaddps %xmm3, %xmm3, %xmm4 45 /// | 46 /// | < loop carried > 47 /// | 48 /// | 0. vmulps %xmm0, %xmm0, %xmm2 49 /// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] 50 /// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3 51 /// | 52 /// | < loop carried > 53 /// | 54 /// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] 55 /// 56 /// 57 /// The algorithm that computes the critical sequence is very similar to a 58 /// critical path analysis. 59 /// 60 /// A dependency graph is used internally to track dependencies between nodes. 61 /// Nodes of the graph represent instructions from the input assembly sequence, 62 /// and edges of the graph represent data dependencies or processor resource 63 /// interferences. 64 /// 65 /// Edges are dynamically 'discovered' by observing instruction state 66 /// transitions and backend pressure increase events. Edges are internally 67 /// ranked based on their "criticality". A dependency is considered to be 68 /// critical if it takes a long time to execute, and if it contributes to 69 /// backend pressure increases. Criticality is internally measured in terms of 70 /// cycles; it is computed for every edge in the graph as a function of the edge 71 /// latency and the number of backend pressure increase cycles contributed by 72 /// that edge. 73 /// 74 /// At the end of simulation, costs are propagated to nodes through the edges of 75 /// the graph, and the most expensive path connecting the root-set (a 76 /// set of nodes with no predecessors) to a leaf node is reported as critical 77 /// sequence. 78 // 79 //===----------------------------------------------------------------------===// 80 81 #ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H 82 #define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H 83 84 #include "Views/InstructionView.h" 85 #include "llvm/ADT/DenseMap.h" 86 #include "llvm/ADT/SmallVector.h" 87 #include "llvm/MC/MCInstPrinter.h" 88 #include "llvm/MC/MCSchedule.h" 89 #include "llvm/MC/MCSubtargetInfo.h" 90 #include "llvm/Support/FormattedStream.h" 91 #include "llvm/Support/raw_ostream.h" 92 93 namespace llvm { 94 namespace mca { 95 96 class PressureTracker { 97 const MCSchedModel &SM; 98 99 // Resource pressure distribution. There is an element for every processor 100 // resource declared by the scheduling model. Quantities are number of cycles. 101 SmallVector<unsigned, 4> ResourcePressureDistribution; 102 103 // Each processor resource is associated with a so-called processor resource 104 // mask. This vector allows to correlate processor resource IDs with processor 105 // resource masks. There is exactly one element per each processor resource 106 // declared by the scheduling model. 107 SmallVector<uint64_t, 4> ProcResID2Mask; 108 109 // Maps processor resource state indices (returned by calls to 110 // `getResourceStateIndex(Mask)` to processor resource identifiers. 111 SmallVector<unsigned, 4> ResIdx2ProcResID; 112 113 // Maps Processor Resource identifiers to ResourceUsers indices. 114 SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex; 115 116 // Identifies the last user of a processor resource unit. 117 // This vector is updated on every instruction issued event. 118 // There is one entry for every processor resource unit declared by the 119 // processor model. An all_ones value is treated like an invalid instruction 120 // identifier. 121 using User = std::pair<unsigned, unsigned>; 122 SmallVector<User, 4> ResourceUsers; 123 124 struct InstructionPressureInfo { 125 unsigned RegisterPressureCycles; 126 unsigned MemoryPressureCycles; 127 unsigned ResourcePressureCycles; 128 }; 129 DenseMap<unsigned, InstructionPressureInfo> IPI; 130 131 void updateResourcePressureDistribution(uint64_t CumulativeMask); 132 getResourceUser(unsigned ProcResID,unsigned UnitID)133 User getResourceUser(unsigned ProcResID, unsigned UnitID) const { 134 unsigned Index = ProcResID2ResourceUsersIndex[ProcResID]; 135 return ResourceUsers[Index + UnitID]; 136 } 137 138 public: 139 PressureTracker(const MCSchedModel &Model); 140 getResourcePressureDistribution()141 ArrayRef<unsigned> getResourcePressureDistribution() const { 142 return ResourcePressureDistribution; 143 } 144 145 void getResourceUsers(uint64_t ResourceMask, 146 SmallVectorImpl<User> &Users) const; 147 getRegisterPressureCycles(unsigned IID)148 unsigned getRegisterPressureCycles(unsigned IID) const { 149 assert(IPI.contains(IID) && "Instruction is not tracked!"); 150 const InstructionPressureInfo &Info = IPI.find(IID)->second; 151 return Info.RegisterPressureCycles; 152 } 153 getMemoryPressureCycles(unsigned IID)154 unsigned getMemoryPressureCycles(unsigned IID) const { 155 assert(IPI.contains(IID) && "Instruction is not tracked!"); 156 const InstructionPressureInfo &Info = IPI.find(IID)->second; 157 return Info.MemoryPressureCycles; 158 } 159 getResourcePressureCycles(unsigned IID)160 unsigned getResourcePressureCycles(unsigned IID) const { 161 assert(IPI.contains(IID) && "Instruction is not tracked!"); 162 const InstructionPressureInfo &Info = IPI.find(IID)->second; 163 return Info.ResourcePressureCycles; 164 } 165 resolveResourceName(uint64_t ResourceMask)166 const char *resolveResourceName(uint64_t ResourceMask) const { 167 unsigned Index = getResourceStateIndex(ResourceMask); 168 unsigned ProcResID = ResIdx2ProcResID[Index]; 169 const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID); 170 return PRDesc.Name; 171 } 172 173 void onInstructionDispatched(unsigned IID); 174 void onInstructionExecuted(unsigned IID); 175 176 void handlePressureEvent(const HWPressureEvent &Event); 177 void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event); 178 }; 179 180 // A dependency edge. 181 struct DependencyEdge { 182 enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE }; 183 184 // Dependency edge descriptor. 185 // 186 // It specifies the dependency type, as well as the edge cost in cycles. 187 struct Dependency { 188 DependencyType Type; 189 uint64_t ResourceOrRegID; 190 uint64_t Cost; 191 }; 192 Dependency Dep; 193 194 unsigned FromIID; 195 unsigned ToIID; 196 197 // Used by the bottleneck analysis to compute the interference 198 // probability for processor resources. 199 unsigned Frequency; 200 }; 201 202 // A dependency graph used by the bottleneck analysis to describe data 203 // dependencies and processor resource interferences between instructions. 204 // 205 // There is a node (an instance of struct DGNode) for every instruction in the 206 // input assembly sequence. Edges of the graph represent dependencies between 207 // instructions. 208 // 209 // Each edge of the graph is associated with a cost value which is used 210 // internally to rank dependency based on their impact on the runtime 211 // performance (see field DependencyEdge::Dependency::Cost). In general, the 212 // higher the cost of an edge, the higher the impact on performance. 213 // 214 // The cost of a dependency is a function of both the latency and the number of 215 // cycles where the dependency has been seen as critical (i.e. contributing to 216 // back-pressure increases). 217 // 218 // Loop carried dependencies are carefully expanded by the bottleneck analysis 219 // to guarantee that the graph stays acyclic. To this end, extra nodes are 220 // pre-allocated at construction time to describe instructions from "past and 221 // future" iterations. The graph is kept acyclic mainly because it simplifies 222 // the complexity of the algorithm that computes the critical sequence. 223 class DependencyGraph { 224 struct DGNode { 225 unsigned NumPredecessors; 226 unsigned NumVisitedPredecessors; 227 uint64_t Cost; 228 unsigned Depth; 229 230 DependencyEdge CriticalPredecessor; 231 SmallVector<DependencyEdge, 8> OutgoingEdges; 232 }; 233 SmallVector<DGNode, 16> Nodes; 234 235 DependencyGraph(const DependencyGraph &) = delete; 236 DependencyGraph &operator=(const DependencyGraph &) = delete; 237 238 void addDependency(unsigned From, unsigned To, 239 DependencyEdge::Dependency &&DE); 240 241 void pruneEdges(unsigned Iterations); 242 void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const; 243 void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, 244 unsigned Iterations); 245 246 #ifndef NDEBUG 247 void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE, 248 MCInstPrinter &MCIP) const; 249 #endif 250 251 public: DependencyGraph(unsigned Size)252 DependencyGraph(unsigned Size) : Nodes(Size) {} 253 addRegisterDep(unsigned From,unsigned To,unsigned RegID,unsigned Cost)254 void addRegisterDep(unsigned From, unsigned To, unsigned RegID, 255 unsigned Cost) { 256 addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost}); 257 } 258 addMemoryDep(unsigned From,unsigned To,unsigned Cost)259 void addMemoryDep(unsigned From, unsigned To, unsigned Cost) { 260 addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost}); 261 } 262 addResourceDep(unsigned From,unsigned To,uint64_t Mask,unsigned Cost)263 void addResourceDep(unsigned From, unsigned To, uint64_t Mask, 264 unsigned Cost) { 265 addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost}); 266 } 267 268 // Called by the bottleneck analysis at the end of simulation to propagate 269 // costs through the edges of the graph, and compute a critical path. finalizeGraph(unsigned Iterations)270 void finalizeGraph(unsigned Iterations) { 271 SmallVector<unsigned, 16> RootSet; 272 pruneEdges(Iterations); 273 initializeRootSet(RootSet); 274 propagateThroughEdges(RootSet, Iterations); 275 } 276 277 // Returns a sequence of edges representing the critical sequence based on the 278 // simulated run. It assumes that the graph has already been finalized (i.e. 279 // method `finalizeGraph()` has already been called on this graph). 280 void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const; 281 282 #ifndef NDEBUG 283 void dump(raw_ostream &OS, MCInstPrinter &MCIP) const; 284 #endif 285 }; 286 287 /// A view that collects and prints a few performance numbers. 288 class BottleneckAnalysis : public InstructionView { 289 PressureTracker Tracker; 290 DependencyGraph DG; 291 292 unsigned Iterations; 293 unsigned TotalCycles; 294 295 bool PressureIncreasedBecauseOfResources; 296 bool PressureIncreasedBecauseOfRegisterDependencies; 297 bool PressureIncreasedBecauseOfMemoryDependencies; 298 // True if throughput was affected by dispatch stalls. 299 bool SeenStallCycles; 300 301 struct BackPressureInfo { 302 // Cycles where backpressure increased. 303 unsigned PressureIncreaseCycles; 304 // Cycles where backpressure increased because of pipeline pressure. 305 unsigned ResourcePressureCycles; 306 // Cycles where backpressure increased because of data dependencies. 307 unsigned DataDependencyCycles; 308 // Cycles where backpressure increased because of register dependencies. 309 unsigned RegisterDependencyCycles; 310 // Cycles where backpressure increased because of memory dependencies. 311 unsigned MemoryDependencyCycles; 312 }; 313 BackPressureInfo BPI; 314 315 // Used to populate the dependency graph DG. 316 void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy); 317 void addMemoryDep(unsigned From, unsigned To, unsigned Cy); 318 void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy); 319 320 void printInstruction(formatted_raw_ostream &FOS, const MCInst &MCI, 321 bool UseDifferentColor = false) const; 322 323 // Prints a bottleneck message to OS. 324 void printBottleneckHints(raw_ostream &OS) const; 325 void printCriticalSequence(raw_ostream &OS) const; 326 327 public: 328 BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP, 329 ArrayRef<MCInst> Sequence, unsigned Iterations); 330 331 void onCycleEnd() override; onEvent(const HWStallEvent & Event)332 void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; } 333 void onEvent(const HWPressureEvent &Event) override; 334 void onEvent(const HWInstructionEvent &Event) override; 335 336 void printView(raw_ostream &OS) const override; getNameAsString()337 StringRef getNameAsString() const override { return "BottleneckAnalysis"; } isSerializable()338 bool isSerializable() const override { return false; } 339 340 #ifndef NDEBUG dump(raw_ostream & OS,MCInstPrinter & MCIP)341 void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); } 342 #endif 343 }; 344 345 } // namespace mca 346 } // namespace llvm 347 348 #endif 349