//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
#define LLVM_MCA_HARDWAREUNITS_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"
#include "llvm/Support/Compiler.h"

namespace llvm {
namespace mca {

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LLVM_ABI LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with an LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  virtual bool isReady(const InstRef &IR) const = 0;

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  virtual bool isPending(const InstRef &IR) const = 0;

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  virtual bool isWaiting(const InstRef &IR) const = 0;

  virtual bool hasDependentUsers(const InstRef &IR) const = 0;

  virtual const CriticalDependency getCriticalPredecessor(unsigned GroupId) = 0;

  virtual void onInstructionExecuted(const InstRef &IR) = 0;

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
  // By default, we conservatively assume that the LDQ receives a load at
  // dispatch. Loads leave the LDQ at the retirement stage.
  virtual void onInstructionRetired(const InstRef &IR) = 0;

  virtual void onInstructionIssued(const InstRef &IR) = 0;

  virtual void cycleEvent() = 0;

#ifndef NDEBUG
  virtual void dump() const = 0;
#endif
};
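
// A minimal usage sketch (not part of the interface above): how a dispatch
// stage might drive an LSUnitBase implementation. The surrounding objects
// (`LSU`, `IR`) and the overall control flow are hypothetical; only the
// member functions called here are declared by this class.
//
//   LSUnitBase::Status S = LSU.isAvailable(IR);
//   if (S != LSUnitBase::LSU_AVAILABLE)
//     return S; // Stall: the load and/or store queue is full.
//   unsigned TokenID = LSU.dispatch(IR);
//   IR.getInstruction()->setLSUTokenID(TokenID);
//   // Later, when the instruction's memory dependencies are resolved:
//   if (LSU.isReady(IR))
//     LSU.onInstructionIssued(IR);
//   // ... and on completion and retirement:
//   LSU.onInstructionExecuted(IR);
//   LSU.onInstructionRetired(IR);
//   LSU.cycleEvent(); // Invoked once per simulated cycle.
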
/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores or barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// for providing different sets of rules for loads and stores by overriding
/// method `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// the cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That is, it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory barrier is a
/// 'mayLoad' and 'mayStore' instruction with unmodeled side effects. This is
/// obviously inaccurate, but it is the best that we can do at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
///  - A younger load cannot pass a load barrier.
///  - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all the older loads/stores
/// have already been executed.
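///
/// For illustration, a small hedged sketch of how these rules play out on an
/// instruction sequence (the instruction names are hypothetical):
///
/// \code
///   ld1    // older load
///   ld2    // may pass ld1: no stores or barriers in between (rule 1)
///   st1    // may not pass ld1/ld2 (rule 3), nor an older store (rule 2)
///   ld3    // may pass st1 only if it doesn't alias st1 (rule 4); with
///          // AssumeNoAlias, ld3 is always allowed to pass st1
/// \endcode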
class LLVM_ABI LSUnit : public LSUnitBase {

  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to
  // be a better predictor of load latency than instruction latency. This is
  // particularly true when simulating code with temporal/spatial locality of
  // memory accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.

protected:
  /// A node of a memory dependency graph. A MemoryGroup describes a set of
  /// instructions with the same memory dependencies.
  ///
  /// By construction, instructions of a MemoryGroup don't depend on each other.
  /// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
  /// A MemoryGroup identifier is then stored as a "token" in field
  /// Instruction::LSUTokenID of each dispatched instruction. That token is
  /// used internally by the LSUnit to track memory dependencies.
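  ///
  /// As a hedged sketch, clients observe group state only indirectly, through
  /// the LSUnit queries declared below (the scheduling decisions shown here
  /// are hypothetical):
  ///
  /// \code
  ///   if (LSU.isReady(IR)) {
  ///     // All predecessor groups have fully executed: IR can issue.
  ///   } else if (LSU.isPending(IR)) {
  ///     // All remaining predecessors are executing; wait time is known.
  ///   } else if (LSU.isWaiting(IR)) {
  ///     // Some predecessor has not started; wait time is still unknown.
  ///   }
  /// \endcode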
  class MemoryGroup {
    unsigned NumPredecessors = 0;
    unsigned NumExecutingPredecessors = 0;
    unsigned NumExecutedPredecessors = 0;

    unsigned NumInstructions = 0;
    unsigned NumExecuting = 0;
    unsigned NumExecuted = 0;
    // Successors that are in an order dependency with this group.
    SmallVector<MemoryGroup *, 4> OrderSucc;
    // Successors that are in a data dependency with this group.
    SmallVector<MemoryGroup *, 4> DataSucc;

    CriticalDependency CriticalPredecessor;
    InstRef CriticalMemoryInstruction;

    MemoryGroup(const MemoryGroup &) = delete;
    MemoryGroup &operator=(const MemoryGroup &) = delete;

  public:
    MemoryGroup() = default;
    MemoryGroup(MemoryGroup &&) = default;

    size_t getNumSuccessors() const {
      return OrderSucc.size() + DataSucc.size();
    }
    unsigned getNumPredecessors() const { return NumPredecessors; }
    unsigned getNumExecutingPredecessors() const {
      return NumExecutingPredecessors;
    }
    unsigned getNumExecutedPredecessors() const {
      return NumExecutedPredecessors;
    }
    unsigned getNumInstructions() const { return NumInstructions; }
    unsigned getNumExecuting() const { return NumExecuting; }
    unsigned getNumExecuted() const { return NumExecuted; }

    const InstRef &getCriticalMemoryInstruction() const {
      return CriticalMemoryInstruction;
    }
    const CriticalDependency &getCriticalPredecessor() const {
      return CriticalPredecessor;
    }

    void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
      // There is no need to add a dependency if there is no data dependency
      // and all instructions from this group have already been issued.
      if (!IsDataDependent && isExecuting())
        return;

      Group->NumPredecessors++;
      assert(!isExecuted() && "Should have been removed!");
      if (isExecuting())
        Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

      if (IsDataDependent)
        DataSucc.emplace_back(Group);
      else
        OrderSucc.emplace_back(Group);
    }

    bool isWaiting() const {
      return NumPredecessors >
             (NumExecutingPredecessors + NumExecutedPredecessors);
    }
    bool isPending() const {
      return NumExecutingPredecessors &&
             ((NumExecutedPredecessors + NumExecutingPredecessors) ==
              NumPredecessors);
    }
    bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
    bool isExecuting() const {
      return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
    }
    bool isExecuted() const { return NumInstructions == NumExecuted; }

    void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
      assert(!isReady() && "Unexpected group-start event!");
      NumExecutingPredecessors++;

      if (!ShouldUpdateCriticalDep)
        return;

      unsigned Cycles = IR.getInstruction()->getCyclesLeft();
      if (CriticalPredecessor.Cycles < Cycles) {
        CriticalPredecessor.IID = IR.getSourceIndex();
        CriticalPredecessor.Cycles = Cycles;
      }
    }

    void onGroupExecuted() {
      assert(!isReady() && "Inconsistent state found!");
      NumExecutingPredecessors--;
      NumExecutedPredecessors++;
    }

    void onInstructionIssued(const InstRef &IR) {
      assert(!isExecuting() && "Invalid internal state!");
      ++NumExecuting;

      // Update the critical memory instruction.
      const Instruction &IS = *IR.getInstruction();
      if ((bool)CriticalMemoryInstruction) {
        const Instruction &OtherIS =
            *CriticalMemoryInstruction.getInstruction();
        if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
          CriticalMemoryInstruction = IR;
      } else {
        CriticalMemoryInstruction = IR;
      }

      if (!isExecuting())
        return;

      // Notify successors that this group started execution.
      for (MemoryGroup *MG : OrderSucc) {
        MG->onGroupIssued(CriticalMemoryInstruction, false);
        // Release the order dependency with this group.
        MG->onGroupExecuted();
      }

      for (MemoryGroup *MG : DataSucc)
        MG->onGroupIssued(CriticalMemoryInstruction, true);
    }

    void onInstructionExecuted(const InstRef &IR) {
      assert(isReady() && !isExecuted() && "Invalid internal state!");
      --NumExecuting;
      ++NumExecuted;

      if (CriticalMemoryInstruction &&
          CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
        CriticalMemoryInstruction.invalidate();
      }

      if (!isExecuted())
        return;

      // Notify data dependent successors that this group has finished
      // execution.
      for (MemoryGroup *MG : DataSucc)
        MG->onGroupExecuted();
    }

    void addInstruction() {
      assert(!getNumSuccessors() && "Cannot add instructions to this group!");
      ++NumInstructions;
    }

    void cycleEvent() {
      if (isWaiting() && CriticalPredecessor.Cycles)
        CriticalPredecessor.Cycles--;
    }
  };
  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID = 1;

  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  bool isReady(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  bool isPending(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  bool isWaiting(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const CriticalDependency getCriticalPredecessor(unsigned GroupId) override {
    const MemoryGroup &Group = getGroup(GroupId);
    return Group.getCriticalPredecessor();
  }

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, the rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
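  ///
  /// A hedged example of these rules on a short (hypothetical) sequence:
  ///
  /// \code
  ///   st1; ld1; ld2; st2; ld3
  ///   // st2 must wait for st1 (rule 1) and for ld1/ld2 (rule 4).
  ///   // ld1 and ld2 may pass st1 only when 'NoAlias' is set (rule 2).
  ///   // ld2 may pass ld1 (rule 3); ld3 may pass ld1/ld2 (rule 3), but may
  ///   // pass st2 only when 'NoAlias' is set (rule 2).
  /// \endcode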
  unsigned dispatch(const InstRef &IR) override;

  virtual void onInstructionIssued(const InstRef &IR) override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void onInstructionRetired(const InstRef &IR) override;

  virtual void onInstructionExecuted(const InstRef &IR) override;

  virtual void cycleEvent() override;

#ifndef NDEBUG
  virtual void dump() const override;
#endif

private:
  bool isValidGroupID(unsigned Index) const {
    return Index && Groups.contains(Index);
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }
};
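
// A minimal sketch of the extension point described in the class comment
// above: a derived unit could override `isReady()` to refine the ordering
// rules (for example, to model a write-combining memory type by relaxing
// store-to-store ordering). `WriteCombiningLSUnit` is hypothetical and not
// part of llvm-mca; as written, it simply defers to LSUnit.
//
//   class WriteCombiningLSUnit : public LSUnit {
//   public:
//     using LSUnit::LSUnit;
//     bool isReady(const InstRef &IR) const override {
//       // A custom policy would go here (e.g., allow non-aliasing stores to
//       // reorder). For now, defer to the base implementation.
//       return LSUnit::isReady(IR);
//     }
//   };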

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H