//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
#define LLVM_MCA_HARDWAREUNITS_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
/// A memory group identifier is then stored as a "token" in field
/// Instruction::LSUTokenID of each dispatched instruction. That token is used
/// internally by the LSUnit to track memory dependencies.
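///
/// A minimal usage sketch (the `LSU` object and the surrounding dispatch
/// logic are assumed; only calls declared in this header are shown):
/// \code
///   unsigned Token = LSU.dispatch(IR); // Map IR to a MemoryGroup.
///   // The caller records Token in the instruction's LSUTokenID field.
///   // Later queries resolve that token back to its MemoryGroup:
///   bool Ready = LSU.isReady(IR);      // True if all predecessors of the
///                                      // group have finished executing.
/// \endcode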
class MemoryGroup {
  unsigned NumPredecessors = 0;
  unsigned NumExecutingPredecessors = 0;
  unsigned NumExecutedPredecessors = 0;

  unsigned NumInstructions = 0;
  unsigned NumExecuting = 0;
  unsigned NumExecuted = 0;
  // Successors that are in an order dependency with this group.
  SmallVector<MemoryGroup *, 4> OrderSucc;
  // Successors that are in a data dependency with this group.
  SmallVector<MemoryGroup *, 4> DataSucc;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup() = default;
  MemoryGroup(MemoryGroup &&) = default;

  size_t getNumSuccessors() const {
    return OrderSucc.size() + DataSucc.size();
  }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
    // There is no need to add a dependency if there is no data dependency and
    // all instructions from this group have already been issued.
    if (!IsDataDependent && isExecuting())
      return;

    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

    if (IsDataDependent)
      DataSucc.emplace_back(Group);
    else
      OrderSucc.emplace_back(Group);
  }

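  // State summary, as encoded by the predicates below:
  //  - waiting:   some predecessors have not started executing yet.
  //  - pending:   all predecessors have started; at least one is still
  //               executing.
  //  - ready:     all predecessors have finished executing.
  //  - executing: every not-yet-executed instruction of this group has been
  //               issued.
  //  - executed:  every instruction of this group has finished executing.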
  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    if (!ShouldUpdateCriticalDep)
      return;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // Update the critical memory instruction.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : OrderSucc) {
      MG->onGroupIssued(CriticalMemoryInstruction, false);
      // Release the order dependency with this group.
      MG->onGroupExecuted();
    }

    for (MemoryGroup *MG : DataSucc)
      MG->onGroupIssued(CriticalMemoryInstruction, true);
  }

  void onInstructionExecuted(const InstRef &IR) {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (CriticalMemoryInstruction &&
        CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
      CriticalMemoryInstruction.invalidate();
    }

    if (!isExecuted())
      return;

    // Notify data-dependent successors that this group has finished execution.
    for (MemoryGroup *MG : DataSucc)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with a LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
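  ///
  /// A typical availability check followed by a dispatch looks like this
  /// sketch (the `LSU` variable and surrounding logic are illustrative, not
  /// part of this interface):
  /// \code
  ///   if (LSU.isAvailable(IR) == LSUnitBase::LSU_AVAILABLE) {
  ///     unsigned GroupID = LSU.dispatch(IR);
  ///     // GroupID is then recorded as the instruction's LSUTokenID.
  ///   }
  /// \endcode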
  virtual unsigned dispatch(const InstRef &IR) = 0;

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && Groups.contains(Index);
  }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }

  virtual void onInstructionExecuted(const InstRef &IR);

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
  // By default we conservatively assume that the LDQ receives a load at
  // dispatch. Loads leave the LDQ at the retirement stage.
  virtual void onInstructionRetired(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
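/// For example, a minimal construction sketch (the `SchedModel` variable and
/// the queue sizes below are purely illustrative):
/// \code
///   LSUnit LSU(SchedModel, /*LQ=*/36, /*SQ=*/24, /*AssumeNoAlias=*/false);
/// \endcode
///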
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// provide different sets of rules for loads and stores by overriding method
/// `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows whether an instruction "mayLoad" and/or "mayStore". For
/// loads, the scheduling model provides an "optimistic" load-to-use latency
/// (which usually matches the load-to-use latency for when there is a hit in
/// the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier-like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory barrier is a
/// `mayLoad` and `mayStore` instruction with unmodeled side effects. This is
/// obviously inaccurate, but it is the best that we can do at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
///  - A younger load cannot pass a load barrier.
///  - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all the older loads/stores
/// have already been executed.
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to be a better
  // predictor of load latency than instruction latency. This is particularly
  // true when simulating code with temporal/spatial locality of memory
  // accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (see PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (see
  // PR39830).
  //
  // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H