//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns Returns true if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope
  /// \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

  virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) const {
    return false;
  }
};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }

public:

  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                           MachineBasicBlock::iterator &MI) const override {
    bool Changed = false;
    if (ST.hasForceStoreSC0SC1() &&
        (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
                                    SIAtomicAddrSpace::GLOBAL |
                                    SIAtomicAddrSpace::OTHER)) !=
            SIAtomicAddrSpace::NONE) {
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
    }
    return Changed;
  }
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;
};

class SIGfx12CacheControl : public SIGfx11CacheControl {
protected:
  // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
  // \returns Returns true if \p MI is modified, false otherwise.
  bool setTH(const MachineBasicBlock::iterator MI,
             AMDGPU::CPol::CPol Value) const;
  // Sets Scope policy to \p Value if CPol operand is present in instruction \p
  // MI. \returns Returns true if \p MI is modified, false otherwise.
  bool setScope(const MachineBasicBlock::iterator MI,
                AMDGPU::CPol::CPol Value) const;

public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI.
  /// Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end namespace anonymous

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getAgentSSID())
    return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getWavefrontSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == SyncScope::SingleThread)
    return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SYSTEM,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::AGENT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WORKGROUP,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SINGLETHREAD,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  return std::nullopt;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
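  // Cache the AMDGPU machine module info up front; it owns the target
  // synchronization scope IDs (agent, workgroup, wavefront, and their
  // one-address-space variants) queried by toSIAtomicScope() above.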
720 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); 721 } 722 723 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( 724 const MachineBasicBlock::iterator &MI) const { 725 assert(MI->getNumMemOperands() > 0); 726 727 SyncScope::ID SSID = SyncScope::SingleThread; 728 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 729 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 730 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 731 bool IsNonTemporal = true; 732 bool IsVolatile = false; 733 734 // Validator should check whether or not MMOs cover the entire set of 735 // locations accessed by the memory instruction. 736 for (const auto &MMO : MI->memoperands()) { 737 IsNonTemporal &= MMO->isNonTemporal(); 738 IsVolatile |= MMO->isVolatile(); 739 InstrAddrSpace |= 740 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); 741 AtomicOrdering OpOrdering = MMO->getSuccessOrdering(); 742 if (OpOrdering != AtomicOrdering::NotAtomic) { 743 const auto &IsSyncScopeInclusion = 744 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); 745 if (!IsSyncScopeInclusion) { 746 reportUnsupported(MI, 747 "Unsupported non-inclusive atomic synchronization scope"); 748 return std::nullopt; 749 } 750 751 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID(); 752 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering); 753 assert(MMO->getFailureOrdering() != AtomicOrdering::Release && 754 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); 755 FailureOrdering = 756 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering()); 757 } 758 } 759 760 SIAtomicScope Scope = SIAtomicScope::NONE; 761 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 762 bool IsCrossAddressSpaceOrdering = false; 763 if (Ordering != AtomicOrdering::NotAtomic) { 764 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); 765 if (!ScopeOrNone) { 766 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 767 return std::nullopt; 768 } 769 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 770 *ScopeOrNone; 771 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 772 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || 773 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { 774 reportUnsupported(MI, "Unsupported atomic address space"); 775 return std::nullopt; 776 } 777 } 778 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, 779 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, 780 IsNonTemporal); 781 } 782 783 std::optional<SIMemOpInfo> 784 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const { 785 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 786 787 if (!(MI->mayLoad() && !MI->mayStore())) 788 return std::nullopt; 789 790 // Be conservative if there are no memory operands. 791 if (MI->getNumMemOperands() == 0) 792 return SIMemOpInfo(); 793 794 return constructFromMIWithMMO(MI); 795 } 796 797 std::optional<SIMemOpInfo> 798 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const { 799 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 800 801 if (!(!MI->mayLoad() && MI->mayStore())) 802 return std::nullopt; 803 804 // Be conservative if there are no memory operands. 
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return std::nullopt;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return std::nullopt;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      *ScopeOrNone;

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return std::nullopt;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX11)
    return std::make_unique<SIGfx10CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX12)
    return std::make_unique<SIGfx11CacheControl>(ST);
  return std::make_unique<SIGfx12CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache
      // policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is no
  /// bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // be handled here as we do not want to pessimize all atomics. Also they do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ?
                                0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.
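
  // Restore the insertion point if it was advanced above to insert after MI.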
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      // RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // be handled here as we do not want to pessimize all atomics. Also they do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory operations
    // to complete to ensure they are visible to waves in the other CUs.
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
    // on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by the
      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
      // vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  Changed |=
      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
                                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx940CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC bits to indicate system scope.
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
      // Set SC bits to indicate agent scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed. Setting SC
      // bits to indicate work-group scope will do this automatically.
      Changed |= enableSC0Bit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Leave SC bits unset to indicate wavefront scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx940CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC bits to indicate system scope.
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
      // Set SC bits to indicate agent scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // Set SC bits to indicate workgroup scope.
      Changed |= enableSC0Bit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Leave SC bits unset to indicate wavefront scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx940CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC1 bit to indicate system scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // RMW atomic operations implicitly bypass the L1 cache and only use SC1
      // to indicate system or agent scope. The SC0 bit is used to indicate if
      // they are return or no-return. Leave SC1 bit unset to indicate agent
      // scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // be handled here as we do not want to pessimize all atomics. Also they do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set SC bits to indicate system scope.
    Changed |= enableSC0Bit(MI);
    Changed |= enableSC1Bit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    Changed |= enableNTBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Ensures that following loads will not see stale remote data or local
      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
      // due to the memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);
      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
      // does not reorder memory operations with respect to a preceding buffer
      // invalidate. The invalidate is guaranteed to remove any cache lines of
      // earlier writes and ensures later reads will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Ensures L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, but there is no point
        // generating it if we know we are not in that mode.
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
            // Set SC bits to indicate work-group scope.
            .addImm(AMDGPU::CPol::SC0);
        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
        // does not reorder memory operations with respect to a preceding
        // buffer invalidate. The invalidate is guaranteed to remove any cache
        // lines of earlier writes and ensures later reads will refetch the
        // cache lines.
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Could generate "BUFFER_INV" but it would do nothing as there are no
      // caches to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by
      // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::SYSTEM, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);

      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::AGENT, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)".
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Do not generate "BUFFER_WBL2" as there are no caches it would write
      // back, and it would require an otherwise unnecessary
      // "S_WAITCNT vmcnt(0)".
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  // Insert the "S_WAITCNT" needed by any "BUFFER_WBL2", as well as any other
  // "S_WAITCNT" that is required.
  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
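      // For example (illustrative), a system- or agent-scope acquire load is
      // expected to carry both "glc" and "dlc", which the two calls below
      // arrange.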
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so no need to cause a waitcnt
    // for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
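    // In other words (illustrative): a nontemporal load ends up with only
    // "slc" set, while a nontemporal store ends up with both "glc" and "slc".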
    if (Op == SIMemOp::STORE)
      Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);

    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise, in CU mode all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
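      // So no additional "S_WAITCNT lgkmcnt(0)" is required at these scopes.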
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise, in CU mode all waves of a work-group are on the same CU,
      // and so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx11CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Set MALL NOALLOC for load and store instructions.
    Changed |= enableDLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so no need to cause a waitcnt
    // for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
    if (Op == SIMemOp::STORE)
      Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);

    // Set MALL NOALLOC for load and store instructions.
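    // Note (illustrative): unlike GFX10, where DLC contributes to the L0/L1
    // bypass policy, on GFX11 the DLC bit encodes MALL NOALLOC, so it is set
    // here for nontemporal accesses just as in the volatile case above.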
    Changed |= enableDLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
                                AMDGPU::CPol::CPol Value) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
  if (!CPol)
    return false;

  uint64_t NewTH = Value & AMDGPU::CPol::TH;
  if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
    CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
    return true;
  }

  return false;
}

bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
                                   AMDGPU::CPol::CPol Value) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
  if (!CPol)
    return false;

  uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
  if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
    CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
    return true;
  }

  return false;
}

bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  bool LOADCnt = false;
  bool DSCnt = false;
  bool STORECnt = false;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        LOADCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        STORECnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise, in CU mode all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          LOADCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          STORECnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      DSCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (LOADCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
    Changed = true;
  }

  if (STORECnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
    Changed = true;
  }

  if (DSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.
  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
    return false;

  AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
  switch (Scope) {
  case SIAtomicScope::SYSTEM:
    ScopeImm = AMDGPU::CPol::SCOPE_SYS;
    break;
  case SIAtomicScope::AGENT:
    ScopeImm = AMDGPU::CPol::SCOPE_DEV;
    break;
  case SIAtomicScope::WORKGROUP:
    // In WGP mode the waves of a work-group can be executing on either CU of
    // the WGP. Therefore we need to invalidate the L0 which is per CU.
    // Otherwise, in CU mode all waves of a work-group are on the same CU, and
    // so the L0 does not need to be invalidated.
    if (ST.isCuModeEnabled())
      return false;

    ScopeImm = AMDGPU::CPol::SCOPE_SE;
    break;
  case SIAtomicScope::WAVEFRONT:
  case SIAtomicScope::SINGLETHREAD:
    // No cache to invalidate.
    return false;
  default:
    llvm_unreachable("Unsupported synchronization scope");
  }

  if (Pos == Position::AFTER)
    ++MI;

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);

  if (Pos == Position::AFTER)
    --MI;

  return true;
}

bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsNonTemporal) {
    // Set non-temporal hint for all cache levels.
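    // For example (illustrative), an affected load or store would carry the
    // TH_NT temporal hint in its cache-policy (cpol) operand after the setTH
    // call below.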
    Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
  }

  if (IsVolatile) {
    Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so no need to cause a waitcnt
    // for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
  }

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
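  // This mirrors the non-atomic handling at the end of expandLoad above, but
  // passes SIMemOp::STORE so the target hook can apply store-specific cache
  // policy.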
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire)
      Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBINV*" instruction. This could be
    // done by reorganizing this code or as part of optimizing the
    // SIInsertWaitcnt pass to track cache invalidate and write back
    // instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
  MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI,
                                MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(*MOI, MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI)) {
        Changed |= expandStore(*MOI, MI);
        Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
      } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(*MOI, MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}