//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
#define LLVM_MCA_HARDWAREUNITS_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
/// A memory group identifier is then stored as a "token" in field
/// Instruction::LSUTokenID of each dispatched instruction. That token is used
/// internally by the LSUnit to track memory dependencies.
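///
/// As a minimal illustration of the dependency bookkeeping (groups are
/// normally created and wired up by the LSUnit rather than by hand; this
/// sketch only uses the public API declared below):
/// \code
///   MemoryGroup A, B;
///   A.addInstruction();                            // A holds one memory op.
///   A.addSuccessor(&B, /*IsDataDependent=*/false); // B is ordered after A.
///   B.addInstruction();
///   assert(B.isWaiting()); // B waits until A's instructions start executing.
/// \endcode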
class MemoryGroup {
  unsigned NumPredecessors = 0;
  unsigned NumExecutingPredecessors = 0;
  unsigned NumExecutedPredecessors = 0;

  unsigned NumInstructions = 0;
  unsigned NumExecuting = 0;
  unsigned NumExecuted = 0;
  // Successors that are in an order dependency with this group.
  SmallVector<MemoryGroup *, 4> OrderSucc;
  // Successors that are in a data dependency with this group.
  SmallVector<MemoryGroup *, 4> DataSucc;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup() = default;
  MemoryGroup(MemoryGroup &&) = default;

  size_t getNumSuccessors() const {
    return OrderSucc.size() + DataSucc.size();
  }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
    // There is no need to add a dependency if this is not a data dependency
    // and all instructions from this group have already been issued.
    if (!IsDataDependent && isExecuting())
      return;

    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

    if (IsDataDependent)
      DataSucc.emplace_back(Group);
    else
      OrderSucc.emplace_back(Group);
  }

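  // Summary of the state predicates below (derived from the counters above):
  // a group is "waiting" while at least one predecessor group has not issued
  // yet; it is "pending" when every predecessor has at least issued but some
  // are still executing; it is "ready" once every predecessor has fully
  // executed. Independently, the group is "executing" while it has in-flight
  // instructions and every not-yet-executed instruction has been issued, and
  // it is "executed" once all of its instructions have completed.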
  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    if (!ShouldUpdateCriticalDep)
      return;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // Update the critical memory instruction for this group.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : OrderSucc) {
      MG->onGroupIssued(CriticalMemoryInstruction, false);
      // Release the order dependency with this group.
      MG->onGroupExecuted();
    }

    for (MemoryGroup *MG : DataSucc)
      MG->onGroupIssued(CriticalMemoryInstruction, true);
  }

  void onInstructionExecuted(const InstRef &IR) {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (CriticalMemoryInstruction &&
        CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
      CriticalMemoryInstruction.invalidate();
    }

    if (!isExecuted())
      return;

    // Notify data-dependent successors that this group has finished execution.
    for (MemoryGroup *MG : DataSucc)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR is
  /// not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with a LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;
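
  // A sketch of how this interface is expected to be driven (the actual driver
  // lives in the llvm-mca pipeline stages; everything below other than the
  // members declared in this class is illustrative):
  //
  //   LSUnitBase &LSU = ...;  // some concrete load/store unit.
  //   if (LSU.isAvailable(IR) == LSU_AVAILABLE) {
  //     unsigned TokenID = LSU.dispatch(IR);
  //     // TokenID is saved as the instruction's LSUTokenID and later used by
  //     // isReady(IR), isPending(IR) and isWaiting(IR) to query the state of
  //     // the memory group associated with IR.
  //   }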

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && Groups.contains(Index);
  }

  /// Check if a previously dispatched instruction IR is now ready for execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

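  // Group identifiers handed out by createMemoryGroup() grow monotonically
  // from the initial value of NextGroupID (set by the constructor);
  // isValidGroupID() treats 0 as an invalid identifier.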
  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }

  virtual void onInstructionExecuted(const InstRef &IR);

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
  // By default we conservatively assume that the LDQ receives a load at
  // dispatch. Loads leave the LDQ at retirement stage.
  virtual void onInstructionRetired(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// provide different sets of rules for loads and stores by overriding method
/// `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load barrier. That means it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory
/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
/// effects. This is obviously inaccurate, but this is the best that we can do
/// at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
///  - A younger load cannot pass a load barrier.
///  - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all the older loads/stores
/// have already been executed.
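///
/// A minimal construction sketch (the queue sizes and the subtarget variable
/// STI are illustrative assumptions, not values taken from a real model):
/// \code
///   const MCSchedModel &SM = STI.getSchedModel();
///   // Bounded load/store queues; optimistically assume loads never alias
///   // stores.
///   LSUnit LSU(SM, /*LQ=*/44, /*SQ=*/22, /*AssumeNoAlias=*/true);
/// \endcode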
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to
  // be a better predictor of load latency than instruction latency. This is
  // particularly true when simulating code with temporal/spatial locality of
  // memory accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction that is both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that is both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, the rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H