//===------------------------- LSUnit.h --------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
#define LLVM_MCA_HARDWAREUNITS_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"
#include "llvm/Support/Compiler.h"
#include <memory>

namespace llvm {
namespace mca {

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LLVM_ABI LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;
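
  // A typical interaction with this interface (an illustrative sketch; the
  // exact call sites live in the simulator stages that own the LS unit):
  //
  //   if (LSU.isAvailable(IR) == LSUnitBase::LSU_AVAILABLE) {
  //     unsigned TokenID = LSU.dispatch(IR);
  //     IR.getInstruction()->setLSUTokenID(TokenID);
  //   }
  //
  // Later on, the owning stages are expected to notify this unit via
  // onInstructionIssued(IR), onInstructionExecuted(IR) and
  // onInstructionRetired(IR) as the instruction moves through the pipeline.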

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with a LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  virtual bool isReady(const InstRef &IR) const = 0;

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  virtual bool isPending(const InstRef &IR) const = 0;

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  virtual bool isWaiting(const InstRef &IR) const = 0;

  virtual bool hasDependentUsers(const InstRef &IR) const = 0;

  virtual const CriticalDependency getCriticalPredecessor(unsigned GroupId) = 0;

  virtual void onInstructionExecuted(const InstRef &IR) = 0;

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until
  // commitment. By default, we conservatively assume that the LDQ receives a
  // load at dispatch. Loads leave the LDQ at the retirement stage.
  virtual void onInstructionRetired(const InstRef &IR) = 0;

  virtual void onInstructionIssued(const InstRef &IR) = 0;

  virtual void cycleEvent() = 0;

#ifndef NDEBUG
  virtual void dump() const = 0;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit, as shown in the example below.
///
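/// For example, given a scheduling model `SM` (an illustrative sketch; the
/// queue sizes are made-up numbers, and a value of zero means "unbounded"):
///
/// \code
///   // Bounded queues; loads are conservatively assumed to alias stores,
///   // so rule 4 is enforced.
///   LSUnit ConservativeLSU(SM, /*LQ=*/36, /*SQ=*/24, /*AssumeNoAlias=*/false);
///
///   // Unbounded queues; loads never alias stores, so rule 4 never blocks
///   // a younger load.
///   LSUnit OptimisticLSU(SM, /*LQ=*/0, /*SQ=*/0, /*AssumeNoAlias=*/true);
/// \endcode
///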
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining,
/// etc.). Derived classes are responsible for implementing that extra
/// knowledge, and for providing different sets of rules for loads and stores
/// by overriding method `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer.
/// This class doesn't know how to identify cases where store-to-load
/// forwarding may occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which
/// usually matches the load-to-use latency for when there is a hit in the
/// L1D). Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means, it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory barrier is a
/// `mayLoad` and `mayStore` instruction with unmodeled side effects. This is
/// obviously inaccurate, but this is the best that we can do at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
/// - A younger load cannot pass a load barrier.
/// - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all the older loads/stores
/// have already been executed.
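///
/// In terms of MCInstrDesc flags, the classification described above boils
/// down to the following predicates, where `MCID` is the instruction's
/// MCInstrDesc (an illustrative sketch, not the actual implementation):
///
/// \code
///   bool IsLoadBarrier  = MCID.mayLoad()  && MCID.hasUnmodeledSideEffects();
///   bool IsStoreBarrier = MCID.mayStore() && MCID.hasUnmodeledSideEffects();
///   bool IsFullBarrier  = IsLoadBarrier && IsStoreBarrier;
/// \endcode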
class LLVM_ABI LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e., L1I/L1D/iTLB/dTLB misses)
  // and load/store conflicts, the latency of a load is determined by the
  // depth of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called "load-to-use" latency
  // from L1D, and it usually already accounts for any extra latency due to
  // data forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to be a better
  // predictor of load latency than instruction latency. This is particularly
  // true when simulating code with temporal/spatial locality of memory
  // accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (see PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types,
  // but not 256-bit data types. So, a 256-bit load is effectively split into
  // two 128-bit loads, and each split load consumes one 'LoadQueue' entry.
  // For simplicity, this class optimistically assumes that a load instruction
  // only consumes one entry in the LoadQueue. Similarly, store instructions
  // only consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and
  // consider alternative approaches that let instructions specify the number
  // of load/store queue entries which they consume at dispatch stage (see
  // PR39830).
  //
  // An instruction that both 'mayStore' and 'hasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that both 'mayLoad' and 'hasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to
  // execute before newer loads are issued.

protected:
  /// A node of a memory dependency graph. A MemoryGroup describes a set of
  /// instructions with the same memory dependencies.
  ///
  /// By construction, instructions of a MemoryGroup don't depend on each
  /// other. At dispatch stage, instructions are mapped by the LSUnit to
  /// MemoryGroups. A MemoryGroup identifier is then stored as a "token" in
  /// field Instruction::LSUTokenID of each dispatched instruction. That token
  /// is used internally by the LSUnit to track memory dependencies.
  class MemoryGroup {
    unsigned NumPredecessors = 0;
    unsigned NumExecutingPredecessors = 0;
    unsigned NumExecutedPredecessors = 0;

    unsigned NumInstructions = 0;
    unsigned NumExecuting = 0;
    unsigned NumExecuted = 0;
    // Successors that are in an order dependency with this group.
    SmallVector<MemoryGroup *, 4> OrderSucc;
    // Successors that are in a data dependency with this group.
    SmallVector<MemoryGroup *, 4> DataSucc;

    CriticalDependency CriticalPredecessor;
    InstRef CriticalMemoryInstruction;

    MemoryGroup(const MemoryGroup &) = delete;
    MemoryGroup &operator=(const MemoryGroup &) = delete;

  public:
    MemoryGroup() = default;
    MemoryGroup(MemoryGroup &&) = default;

    size_t getNumSuccessors() const {
      return OrderSucc.size() + DataSucc.size();
    }
    unsigned getNumPredecessors() const { return NumPredecessors; }
    unsigned getNumExecutingPredecessors() const {
      return NumExecutingPredecessors;
    }
    unsigned getNumExecutedPredecessors() const {
      return NumExecutedPredecessors;
    }
    unsigned getNumInstructions() const { return NumInstructions; }
    unsigned getNumExecuting() const { return NumExecuting; }
    unsigned getNumExecuted() const { return NumExecuted; }

    const InstRef &getCriticalMemoryInstruction() const {
      return CriticalMemoryInstruction;
    }
    const CriticalDependency &getCriticalPredecessor() const {
      return CriticalPredecessor;
    }
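
    // Lifecycle of a group, as encoded by the state predicates further down
    // in this class (a summary added for clarity):
    //   isWaiting()  : at least one predecessor group has not started
    //                  executing yet.
    //   isPending()  : every predecessor has at least started executing, and
    //                  at least one of them is still in flight.
    //   isReady()    : every predecessor has finished executing; this group's
    //                  instructions may now issue.
    //   isExecuting(): all of this group's not-yet-executed instructions are
    //                  currently in flight.
    //   isExecuted() : every instruction of this group has finished.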

    void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
      // No need to add an order dependency if all instructions from this
      // group have already been issued.
      if (!IsDataDependent && isExecuting())
        return;

      Group->NumPredecessors++;
      assert(!isExecuted() && "Should have been removed!");
      if (isExecuting())
        Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

      if (IsDataDependent)
        DataSucc.emplace_back(Group);
      else
        OrderSucc.emplace_back(Group);
    }

    bool isWaiting() const {
      return NumPredecessors >
             (NumExecutingPredecessors + NumExecutedPredecessors);
    }
    bool isPending() const {
      return NumExecutingPredecessors &&
             ((NumExecutedPredecessors + NumExecutingPredecessors) ==
              NumPredecessors);
    }
    bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
    bool isExecuting() const {
      return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
    }
    bool isExecuted() const { return NumInstructions == NumExecuted; }

    void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
      assert(!isReady() && "Unexpected group-start event!");
      NumExecutingPredecessors++;

      if (!ShouldUpdateCriticalDep)
        return;

      unsigned Cycles = IR.getInstruction()->getCyclesLeft();
      if (CriticalPredecessor.Cycles < Cycles) {
        CriticalPredecessor.IID = IR.getSourceIndex();
        CriticalPredecessor.Cycles = Cycles;
      }
    }

    void onGroupExecuted() {
      assert(!isReady() && "Inconsistent state found!");
      NumExecutingPredecessors--;
      NumExecutedPredecessors++;
    }

    void onInstructionIssued(const InstRef &IR) {
      assert(!isExecuting() && "Invalid internal state!");
      ++NumExecuting;

      // Update the critical memory instruction.
      const Instruction &IS = *IR.getInstruction();
      if ((bool)CriticalMemoryInstruction) {
        const Instruction &OtherIS =
            *CriticalMemoryInstruction.getInstruction();
        if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
          CriticalMemoryInstruction = IR;
      } else {
        CriticalMemoryInstruction = IR;
      }

      if (!isExecuting())
        return;

      // Notify successors that this group started execution.
      for (MemoryGroup *MG : OrderSucc) {
        MG->onGroupIssued(CriticalMemoryInstruction, false);
        // Release the order dependency with this group.
        MG->onGroupExecuted();
      }

      for (MemoryGroup *MG : DataSucc)
        MG->onGroupIssued(CriticalMemoryInstruction, true);
    }

    void onInstructionExecuted(const InstRef &IR) {
      assert(isReady() && !isExecuted() && "Invalid internal state!");
      --NumExecuting;
      ++NumExecuted;

      if (CriticalMemoryInstruction &&
          CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
        CriticalMemoryInstruction.invalidate();
      }

      if (!isExecuted())
        return;

      // Notify data dependent successors that this group has finished
      // execution.
      for (MemoryGroup *MG : DataSucc)
        MG->onGroupExecuted();
    }

    void addInstruction() {
      assert(!getNumSuccessors() && "Cannot add instructions to this group!");
      ++NumInstructions;
    }

    void cycleEvent() {
      if (isWaiting() && CriticalPredecessor.Cycles)
        CriticalPredecessor.Cycles--;
    }
  };
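
  // How groups are typically created and wired together at dispatch stage
  // (an illustrative sketch; the actual logic lives in LSUnit::dispatch, in
  // LSUnit.cpp, and also handles barriers and the NoAlias flag):
  //
  //   unsigned NewGID = createMemoryGroup();
  //   MemoryGroup &NewGroup = getGroup(NewGID);
  //   NewGroup.addInstruction();
  //   if (unsigned PrevGID = CurrentStoreGroupID)
  //     getGroup(PrevGID).addSuccessor(&NewGroup, /*IsDataDependent=*/true);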

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID = 1;

  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  bool isReady(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  bool isPending(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  bool isWaiting(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const CriticalDependency getCriticalPredecessor(unsigned GroupId) override {
    const MemoryGroup &Group = getGroup(GroupId);
    return Group.getCriticalPredecessor();
  }

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, the rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;
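
  // How a simulator stage might query the state of a dispatched memory
  // operation (an illustrative sketch; `LSU` and `IR` are assumed to exist):
  //
  //   if (LSU.isReady(IR)) {
  //     // All memory dependencies are resolved; IR can be issued.
  //   } else if (LSU.isPending(IR)) {
  //     unsigned Token = IR.getInstruction()->getLSUTokenID();
  //     CriticalDependency CD = LSU.getCriticalPredecessor(Token);
  //     // CD identifies the in-flight memory instruction that IR is most
  //     // likely stalled on, and for how many more cycles.
  //   }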

  virtual void onInstructionIssued(const InstRef &IR) override {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void onInstructionRetired(const InstRef &IR) override;

  virtual void onInstructionExecuted(const InstRef &IR) override;

  virtual void cycleEvent() override;

#ifndef NDEBUG
  virtual void dump() const override;
#endif

private:
  bool isValidGroupID(unsigned Index) const {
    return Index && Groups.contains(Index);
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H