//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
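
// Note (illustrative, not part of the upstream comments): SIMemOp and
// SIAtomicAddrSpace are bitmask enums, so the rest of this file tests
// membership with bitwise AND against NONE, for example
//
//   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE)
//     ...; // the operation touches the global address space
//
// and combines categories with OR, e.g. FLAT above is GLOBAL | LDS | SCRATCH.
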
class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;
  bool IsLastUse = false;

  SIMemOpInfo(
      AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
      SIAtomicScope Scope = SIAtomicScope::SYSTEM,
      SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
      SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
      bool IsCrossAddressSpaceOrdering = true,
      AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
      bool IsVolatile = false, bool IsNonTemporal = false,
      bool IsLastUse = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
        OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
        IsLastUse(IsLastUse) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is last use, false otherwise.
  bool isLastUse() const { return IsLastUse; }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};
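
// Example (illustrative) of the scope clamping performed by the SIMemOpInfo
// constructor above: an atomic whose only memory operand is in the LDS (local)
// address space but which requests syncscope("agent") ends up with
// Scope == SIAtomicScope::WORKGROUP, because LDS is not visible beyond the
// work-group and the constructor limits the scope to what the instruction's
// address spaces can support.
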
class SIMemOpAccess final {
private:
  const AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p BitName to "true" if present in instruction \p MI.
  /// \returns Returns true if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or
  /// nontemporal/last-use. Return true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal,
                                              bool IsLastUse = false) const = 0;

  virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
    return false;
  };

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

  virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) const {
    return false;
  }
};
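
// How these hooks compose (illustrative sketch, not upstream documentation):
// the expand* routines of SIMemoryLegalizer (declared below) combine them per
// memory operation. For example, a load with acquire semantics is roughly
// handled by calling enableLoadCacheBypass on the load itself, then insertWait
// after it so the load has completed, then insertAcquire after that to
// invalidate any caches that could hold stale data for the requested scope.
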
class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }

public:

  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                           MachineBasicBlock::iterator &MI) const override {
    bool Changed = false;
    if (ST.hasForceStoreSC0SC1() &&
        (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
                                    SIAtomicAddrSpace::GLOBAL |
                                    SIAtomicAddrSpace::OTHER)) !=
            SIAtomicAddrSpace::NONE) {
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
    }
    return Changed;
  }
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;
};

class SIGfx12CacheControl : public SIGfx11CacheControl {
protected:
  // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
  // \returns Returns true if \p MI is modified, false otherwise.
  bool setTH(const MachineBasicBlock::iterator MI,
             AMDGPU::CPol::CPol Value) const;
  // Sets Scope policy to \p Value if CPol operand is present in instruction \p
  // MI. \returns Returns true if \p MI is modified, false otherwise.
  bool setScope(const MachineBasicBlock::iterator MI,
                AMDGPU::CPol::CPol Value) const;

  // Stores with system scope (SCOPE_SYS) need to wait for:
  // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
  // - non-returning-atomics - wait for STORECNT==0
  // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
  // since it does not distinguish atomics-with-return from regular stores.
  // There is no need to wait if memory is cached (mtype != UC).
  bool
  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;

  bool setAtomicScope(const MachineBasicBlock::iterator &MI,
                      SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;

public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

static const StringMap<SIAtomicAddrSpace> ASNames = {{
    {"global", SIAtomicAddrSpace::GLOBAL},
    {"local", SIAtomicAddrSpace::LDS},
}};

void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
  const MachineFunction *MF = MI.getMF();
  const Function &Fn = MF->getFunction();
  SmallString<128> Str;
  raw_svector_ostream OS(Str);
  OS << "unknown address space '" << AS << "'; expected one of ";
  ListSeparator LS;
  for (const auto &[Name, Val] : ASNames)
    OS << LS << '\'' << Name << '\'';
  DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning);
  Fn.getContext().diagnose(BadTag);
}

/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
/// If this tag isn't present, or if it has no meaningful values, returns \p
/// Default. Otherwise returns all the address spaces concerned by the MMRA.
static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
                                               SIAtomicAddrSpace Default) {
  static constexpr StringLiteral FenceASPrefix = "amdgpu-as";

  auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
  if (!MMRA)
    return Default;

  SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
  for (const auto &[Prefix, Suffix] : MMRA) {
    if (Prefix != FenceASPrefix)
      continue;

    if (auto It = ASNames.find(Suffix); It != ASNames.end())
      Result |= It->second;
    else
      diagnoseUnknownMMRAASName(MI, Suffix);
  }

  return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
}
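
// Illustrative IR-level example (assumed syntax, for documentation only): a
// fence tagged with the "amdgpu-as" MMRA such as
//
//   fence syncscope("workgroup") release, !mmra !0
//   ...
//   !0 = !{!"amdgpu-as", !"local"}
//
// is narrowed by getFenceAddrSpaceMMRA to SIAtomicAddrSpace::LDS only, while a
// fence without the tag, or with only unknown suffixes (which are diagnosed
// above), keeps the passed-in Default address spaces.
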

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getAgentSSID())
    return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getWavefrontSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == SyncScope::SingleThread)
    return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SYSTEM,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::AGENT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WORKGROUP,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SINGLETHREAD,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  return std::nullopt;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
    : MMI(&MMI_) {}

std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;
  bool IsLastUse = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    IsLastUse |= MMO->getFlags() & MOLastUse;
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return std::nullopt;
      }

      SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return std::nullopt;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        *ScopeOrNone;
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
         OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) ==
         SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return std::nullopt;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal, IsLastUse);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return std::nullopt;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return std::nullopt;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      *ScopeOrNone;

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return std::nullopt;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX11)
    return std::make_unique<SIGfx10CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX12)
    return std::make_unique<SIGfx11CacheControl>(ST);
  return std::make_unique<SIGfx12CacheControl>(ST);
}
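
// Note (illustrative): the feature checks above deliberately precede the
// generation checks. For example, gfx90a and gfx940/gfx942 report a GFX9
// generation but still get SIGfx90ACacheControl / SIGfx940CacheControl rather
// than SIGfx7CacheControl, while a plain GFX9 target such as gfx906 falls
// through to SIGfx7CacheControl and gfx10/gfx11/gfx12 targets get their
// respective classes.
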
bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is no
  /// bypass control for the L2 cache at the isa level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so we cannot
  // sensibly handle them without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
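
// Illustrative example of the code emitted above: for an agent-scope ordering
// over global and LDS memory with cross-address-space ordering, both VMCnt and
// LGKMCnt are requested, so the inserted instruction disassembles roughly as
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//
// (expcnt is left at its maximum, i.e. not waited on). The _soft form marks
// the wait as one SIInsertWaitcnts may later relax or remove if it proves
// redundant.
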

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so we cannot
  // sensibly handle them without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory
    // operations to complete to ensure they are visible to waves in the other
    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
    // are on the same CU, so no need to wait for global memory as all waves in
    // the work-group access the same L1, nor wait for GDS as accesses are
    // ordered on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}
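
// Illustrative sequence produced by the function above for a system-scope
// acquire on gfx90a: BUFFER_INVL2 (to drop stale L2 lines with MTYPE NC),
// followed by the GFX7 path's L1 invalidate (buffer_wbinvl1_vol, or
// buffer_wbinvl1 on PAL/Mesa), with the preceding s_waitcnt supplied
// separately by insertWait.
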
1560 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1561 // Set SC bits to indicate system scope. 1562 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1563 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT 1564 // vmcnt(0)" needed by the "BUFFER_WBL2". 1565 Changed = true; 1566 break; 1567 case SIAtomicScope::AGENT: 1568 case SIAtomicScope::WORKGROUP: 1569 case SIAtomicScope::WAVEFRONT: 1570 case SIAtomicScope::SINGLETHREAD: 1571 // Same as GFX7. 1572 break; 1573 default: 1574 llvm_unreachable("Unsupported synchronization scope"); 1575 } 1576 } 1577 1578 if (Pos == Position::AFTER) 1579 --MI; 1580 1581 Changed |= 1582 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, 1583 IsCrossAddrSpaceOrdering, Pos); 1584 1585 return Changed; 1586 } 1587 1588 bool SIGfx940CacheControl::enableLoadCacheBypass( 1589 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 1590 SIAtomicAddrSpace AddrSpace) const { 1591 assert(MI->mayLoad() && !MI->mayStore()); 1592 bool Changed = false; 1593 1594 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1595 switch (Scope) { 1596 case SIAtomicScope::SYSTEM: 1597 // Set SC bits to indicate system scope. 1598 Changed |= enableSC0Bit(MI); 1599 Changed |= enableSC1Bit(MI); 1600 break; 1601 case SIAtomicScope::AGENT: 1602 // Set SC bits to indicate agent scope. 1603 Changed |= enableSC1Bit(MI); 1604 break; 1605 case SIAtomicScope::WORKGROUP: 1606 // In threadgroup split mode the waves of a work-group can be executing on 1607 // different CUs. Therefore need to bypass the L1 which is per CU. 1608 // Otherwise in non-threadgroup split mode all waves of a work-group are 1609 // on the same CU, and so the L1 does not need to be bypassed. Setting SC 1610 // bits to indicate work-group scope will do this automatically. 1611 Changed |= enableSC0Bit(MI); 1612 break; 1613 case SIAtomicScope::WAVEFRONT: 1614 case SIAtomicScope::SINGLETHREAD: 1615 // Leave SC bits unset to indicate wavefront scope. 1616 break; 1617 default: 1618 llvm_unreachable("Unsupported synchronization scope"); 1619 } 1620 } 1621 1622 /// The scratch address space does not need the global memory caches 1623 /// to be bypassed as all memory operations by the same thread are 1624 /// sequentially consistent, and no other thread can access scratch 1625 /// memory. 1626 1627 /// Other address spaces do not have a cache. 1628 1629 return Changed; 1630 } 1631 1632 bool SIGfx940CacheControl::enableStoreCacheBypass( 1633 const MachineBasicBlock::iterator &MI, 1634 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { 1635 assert(!MI->mayLoad() && MI->mayStore()); 1636 bool Changed = false; 1637 1638 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1639 switch (Scope) { 1640 case SIAtomicScope::SYSTEM: 1641 // Set SC bits to indicate system scope. 1642 Changed |= enableSC0Bit(MI); 1643 Changed |= enableSC1Bit(MI); 1644 break; 1645 case SIAtomicScope::AGENT: 1646 // Set SC bits to indicate agent scope. 1647 Changed |= enableSC1Bit(MI); 1648 break; 1649 case SIAtomicScope::WORKGROUP: 1650 // Set SC bits to indicate workgroup scope. 1651 Changed |= enableSC0Bit(MI); 1652 break; 1653 case SIAtomicScope::WAVEFRONT: 1654 case SIAtomicScope::SINGLETHREAD: 1655 // Leave SC bits unset to indicate wavefront scope. 
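      // For reference, the SC encoding used by this switch is: SC0=0 SC1=0
      // wavefront/single-thread, SC0=1 SC1=0 work-group, SC0=0 SC1=1 agent,
      // and SC0=1 SC1=1 system (a summary of the cases above; illustrative).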
1656 break; 1657 default: 1658 llvm_unreachable("Unsupported synchronization scope"); 1659 } 1660 } 1661 1662 /// The scratch address space does not need the global memory caches 1663 /// to be bypassed as all memory operations by the same thread are 1664 /// sequentially consistent, and no other thread can access scratch 1665 /// memory. 1666 1667 /// Other address spaces do not have a cache. 1668 1669 return Changed; 1670 } 1671 1672 bool SIGfx940CacheControl::enableRMWCacheBypass( 1673 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 1674 SIAtomicAddrSpace AddrSpace) const { 1675 assert(MI->mayLoad() && MI->mayStore()); 1676 bool Changed = false; 1677 1678 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1679 switch (Scope) { 1680 case SIAtomicScope::SYSTEM: 1681 // Set SC1 bit to indicate system scope. 1682 Changed |= enableSC1Bit(MI); 1683 break; 1684 case SIAtomicScope::AGENT: 1685 case SIAtomicScope::WORKGROUP: 1686 case SIAtomicScope::WAVEFRONT: 1687 case SIAtomicScope::SINGLETHREAD: 1688 // RMW atomic operations implicitly bypass the L1 cache and only use SC1 1689 // to indicate system or agent scope. The SC0 bit is used to indicate if 1690 // they are return or no-return. Leave SC1 bit unset to indicate agent 1691 // scope. 1692 break; 1693 default: 1694 llvm_unreachable("Unsupported synchronization scope"); 1695 } 1696 } 1697 1698 return Changed; 1699 } 1700 1701 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( 1702 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 1703 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { 1704 // Only handle load and store, not atomic read-modify-write insructions. The 1705 // latter use glc to indicate if the atomic returns a result and so must not 1706 // be used for cache control. 1707 assert(MI->mayLoad() ^ MI->mayStore()); 1708 1709 // Only update load and store, not LLVM IR atomic read-modify-write 1710 // instructions. The latter are always marked as volatile so cannot sensibly 1711 // handle it as do not want to pessimize all atomics. Also they do not support 1712 // the nontemporal attribute. 1713 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 1714 1715 bool Changed = false; 1716 1717 if (IsVolatile) { 1718 // Set SC bits to indicate system scope. 1719 Changed |= enableSC0Bit(MI); 1720 Changed |= enableSC1Bit(MI); 1721 1722 // Ensure operation has completed at system scope to cause all volatile 1723 // operations to be visible outside the program in a global order. Do not 1724 // request cross address space as only the global address space can be 1725 // observable outside the program, so no need to cause a waitcnt for LDS 1726 // address space operations. 
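    // For example (illustrative): a volatile global load on gfx940 has both SC
    // bits set above and is then followed by an "s_waitcnt vmcnt(0)" produced
    // by the call below, so it completes at system scope before later
    // operations.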
1727 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 1728 Position::AFTER); 1729 1730 return Changed; 1731 } 1732 1733 if (IsNonTemporal) { 1734 Changed |= enableNTBit(MI); 1735 return Changed; 1736 } 1737 1738 return Changed; 1739 } 1740 1741 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1742 SIAtomicScope Scope, 1743 SIAtomicAddrSpace AddrSpace, 1744 Position Pos) const { 1745 if (!InsertCacheInv) 1746 return false; 1747 1748 bool Changed = false; 1749 1750 MachineBasicBlock &MBB = *MI->getParent(); 1751 DebugLoc DL = MI->getDebugLoc(); 1752 1753 if (Pos == Position::AFTER) 1754 ++MI; 1755 1756 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1757 switch (Scope) { 1758 case SIAtomicScope::SYSTEM: 1759 // Ensures that following loads will not see stale remote VMEM data or 1760 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and 1761 // CC will never be stale due to the local memory probes. 1762 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) 1763 // Set SC bits to indicate system scope. 1764 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1765 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the 1766 // hardware does not reorder memory operations by the same wave with 1767 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to 1768 // remove any cache lines of earlier writes by the same wave and ensures 1769 // later reads by the same wave will refetch the cache lines. 1770 Changed = true; 1771 break; 1772 case SIAtomicScope::AGENT: 1773 // Ensures that following loads will not see stale remote data or local 1774 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale 1775 // due to the memory probes. 1776 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) 1777 // Set SC bits to indicate agent scope. 1778 .addImm(AMDGPU::CPol::SC1); 1779 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware 1780 // does not reorder memory operations with respect to a preceding buffer 1781 // invalidate. The invalidate is guaranteed to remove any cache lines of 1782 // earlier writes and ensures later reads will refetch the cache lines. 1783 Changed = true; 1784 break; 1785 case SIAtomicScope::WORKGROUP: 1786 // In threadgroup split mode the waves of a work-group can be executing on 1787 // different CUs. Therefore need to invalidate the L1 which is per CU. 1788 // Otherwise in non-threadgroup split mode all waves of a work-group are 1789 // on the same CU, and so the L1 does not need to be invalidated. 1790 if (ST.isTgSplitEnabled()) { 1791 // Ensures L1 is invalidated if in threadgroup split mode. In 1792 // non-threadgroup split mode it is a NOP, but there is no point 1793 // generating it if we know we are not in that mode. 1794 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) 1795 // Set SC bits to indicate work-group scope. 1796 .addImm(AMDGPU::CPol::SC0); 1797 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware 1798 // does not reorder memory operations with respect to a preceding buffer 1799 // invalidate. The invalidate is guaranteed to remove any cache lines of 1800 // earlier writes and ensures later reads will refetch the cache lines. 1801 Changed = true; 1802 } 1803 break; 1804 case SIAtomicScope::WAVEFRONT: 1805 case SIAtomicScope::SINGLETHREAD: 1806 // Could generate "BUFFER_INV" but it would do nothing as there are no 1807 // caches to invalidate.
1808 break; 1809 default: 1810 llvm_unreachable("Unsupported synchronization scope"); 1811 } 1812 } 1813 1814 /// The scratch address space does not need the global memory cache 1815 /// to be flushed as all memory operations by the same thread are 1816 /// sequentially consistent, and no other thread can access scratch 1817 /// memory. 1818 1819 /// Other address spaces do not have a cache. 1820 1821 if (Pos == Position::AFTER) 1822 --MI; 1823 1824 return Changed; 1825 } 1826 1827 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1828 SIAtomicScope Scope, 1829 SIAtomicAddrSpace AddrSpace, 1830 bool IsCrossAddrSpaceOrdering, 1831 Position Pos) const { 1832 bool Changed = false; 1833 1834 MachineBasicBlock &MBB = *MI->getParent(); 1835 DebugLoc DL = MI->getDebugLoc(); 1836 1837 if (Pos == Position::AFTER) 1838 ++MI; 1839 1840 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1841 switch (Scope) { 1842 case SIAtomicScope::SYSTEM: 1843 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1844 // hardware does not reorder memory operations by the same wave with 1845 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1846 // to initiate writeback of any dirty cache lines of earlier writes by the 1847 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1848 // writeback has completed. 1849 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1850 // Set SC bits to indicate system scope. 1851 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1852 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 1853 // SIAtomicScope::SYSTEM, the following insertWait will generate the 1854 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". 1855 Changed = true; 1856 break; 1857 case SIAtomicScope::AGENT: 1858 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1859 // Set SC bits to indicate agent scope. 1860 .addImm(AMDGPU::CPol::SC1); 1861 1862 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 1863 // SIAtomicScope::AGENT, the following insertWait will generate the 1864 // required "S_WAITCNT vmcnt(0)". 1865 Changed = true; 1866 break; 1867 case SIAtomicScope::WORKGROUP: 1868 case SIAtomicScope::WAVEFRONT: 1869 case SIAtomicScope::SINGLETHREAD: 1870 // Do not generate "BUFFER_WBL2" as there are no caches it would 1871 // writeback, and would require an otherwise unnecessary 1872 // "S_WAITCNT vmcnt(0)". 1873 break; 1874 default: 1875 llvm_unreachable("Unsupported synchronization scope"); 1876 } 1877 } 1878 1879 if (Pos == Position::AFTER) 1880 --MI; 1881 1882 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other 1883 // S_WAITCNT needed. 1884 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1885 IsCrossAddrSpaceOrdering, Pos); 1886 1887 return Changed; 1888 } 1889 1890 bool SIGfx10CacheControl::enableLoadCacheBypass( 1891 const MachineBasicBlock::iterator &MI, 1892 SIAtomicScope Scope, 1893 SIAtomicAddrSpace AddrSpace) const { 1894 assert(MI->mayLoad() && !MI->mayStore()); 1895 bool Changed = false; 1896 1897 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1898 switch (Scope) { 1899 case SIAtomicScope::SYSTEM: 1900 case SIAtomicScope::AGENT: 1901 // Set the L0 and L1 cache policies to MISS_EVICT. 1902 // Note: there is no L2 cache coherent bypass control at the ISA level. 
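      // For example (illustrative syntax): an agent-scope acquire load then
      // carries both bits, e.g. "global_load_dword v0, v[0:1], off glc dlc".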
1903 Changed |= enableGLCBit(MI); 1904 Changed |= enableDLCBit(MI); 1905 break; 1906 case SIAtomicScope::WORKGROUP: 1907 // In WGP mode the waves of a work-group can be executing on either CU of 1908 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in 1909 // CU mode all waves of a work-group are on the same CU, and so the L0 1910 // does not need to be bypassed. 1911 if (!ST.isCuModeEnabled()) 1912 Changed |= enableGLCBit(MI); 1913 break; 1914 case SIAtomicScope::WAVEFRONT: 1915 case SIAtomicScope::SINGLETHREAD: 1916 // No cache to bypass. 1917 break; 1918 default: 1919 llvm_unreachable("Unsupported synchronization scope"); 1920 } 1921 } 1922 1923 /// The scratch address space does not need the global memory caches 1924 /// to be bypassed as all memory operations by the same thread are 1925 /// sequentially consistent, and no other thread can access scratch 1926 /// memory. 1927 1928 /// Other address spaces do not have a cache. 1929 1930 return Changed; 1931 } 1932 1933 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( 1934 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 1935 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { 1936 1937 // Only handle load and store, not atomic read-modify-write insructions. The 1938 // latter use glc to indicate if the atomic returns a result and so must not 1939 // be used for cache control. 1940 assert(MI->mayLoad() ^ MI->mayStore()); 1941 1942 // Only update load and store, not LLVM IR atomic read-modify-write 1943 // instructions. The latter are always marked as volatile so cannot sensibly 1944 // handle it as do not want to pessimize all atomics. Also they do not support 1945 // the nontemporal attribute. 1946 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 1947 1948 bool Changed = false; 1949 1950 if (IsVolatile) { 1951 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions 1952 // and MISS_LRU for store instructions. 1953 // Note: there is no L2 cache coherent bypass control at the ISA level. 1954 if (Op == SIMemOp::LOAD) { 1955 Changed |= enableGLCBit(MI); 1956 Changed |= enableDLCBit(MI); 1957 } 1958 1959 // Ensure operation has completed at system scope to cause all volatile 1960 // operations to be visible outside the program in a global order. Do not 1961 // request cross address space as only the global address space can be 1962 // observable outside the program, so no need to cause a waitcnt for LDS 1963 // address space operations. 1964 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 1965 Position::AFTER); 1966 return Changed; 1967 } 1968 1969 if (IsNonTemporal) { 1970 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT 1971 // and L2 cache policy to STREAM. 1972 // For stores setting both GLC and SLC configures L0 and L1 cache policy 1973 // to MISS_EVICT and the L2 cache policy to STREAM. 
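    // Illustrative outcome of the code below: a nontemporal load gets only
    // slc, while a nontemporal store gets both glc and slc.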
1974 if (Op == SIMemOp::STORE) 1975 Changed |= enableGLCBit(MI); 1976 Changed |= enableSLCBit(MI); 1977 1978 return Changed; 1979 } 1980 1981 return Changed; 1982 } 1983 1984 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, 1985 SIAtomicScope Scope, 1986 SIAtomicAddrSpace AddrSpace, 1987 SIMemOp Op, 1988 bool IsCrossAddrSpaceOrdering, 1989 Position Pos) const { 1990 bool Changed = false; 1991 1992 MachineBasicBlock &MBB = *MI->getParent(); 1993 DebugLoc DL = MI->getDebugLoc(); 1994 1995 if (Pos == Position::AFTER) 1996 ++MI; 1997 1998 bool VMCnt = false; 1999 bool VSCnt = false; 2000 bool LGKMCnt = false; 2001 2002 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 2003 SIAtomicAddrSpace::NONE) { 2004 switch (Scope) { 2005 case SIAtomicScope::SYSTEM: 2006 case SIAtomicScope::AGENT: 2007 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 2008 VMCnt |= true; 2009 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2010 VSCnt |= true; 2011 break; 2012 case SIAtomicScope::WORKGROUP: 2013 // In WGP mode the waves of a work-group can be executing on either CU of 2014 // the WGP. Therefore need to wait for operations to complete to ensure 2015 // they are visible to waves in the other CU as the L0 is per CU. 2016 // Otherwise in CU mode and all waves of a work-group are on the same CU 2017 // which shares the same L0. 2018 if (!ST.isCuModeEnabled()) { 2019 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 2020 VMCnt |= true; 2021 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2022 VSCnt |= true; 2023 } 2024 break; 2025 case SIAtomicScope::WAVEFRONT: 2026 case SIAtomicScope::SINGLETHREAD: 2027 // The L0 cache keeps all memory operations in order for 2028 // work-items in the same wavefront. 2029 break; 2030 default: 2031 llvm_unreachable("Unsupported synchronization scope"); 2032 } 2033 } 2034 2035 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 2036 switch (Scope) { 2037 case SIAtomicScope::SYSTEM: 2038 case SIAtomicScope::AGENT: 2039 case SIAtomicScope::WORKGROUP: 2040 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 2041 // not needed as LDS operations for all waves are executed in a total 2042 // global ordering as observed by all waves. Required if also 2043 // synchronizing with global/GDS memory as LDS operations could be 2044 // reordered with respect to later global/GDS memory operations of the 2045 // same wave. 2046 LGKMCnt |= IsCrossAddrSpaceOrdering; 2047 break; 2048 case SIAtomicScope::WAVEFRONT: 2049 case SIAtomicScope::SINGLETHREAD: 2050 // The LDS keeps all memory operations in order for 2051 // the same wavefront. 2052 break; 2053 default: 2054 llvm_unreachable("Unsupported synchronization scope"); 2055 } 2056 } 2057 2058 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 2059 switch (Scope) { 2060 case SIAtomicScope::SYSTEM: 2061 case SIAtomicScope::AGENT: 2062 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" 2063 // is not needed as GDS operations for all waves are executed in a total 2064 // global ordering as observed by all waves. Required if also 2065 // synchronizing with global/LDS memory as GDS operations could be 2066 // reordered with respect to later global/LDS memory operations of the 2067 // same wave. 2068 LGKMCnt |= IsCrossAddrSpaceOrdering; 2069 break; 2070 case SIAtomicScope::WORKGROUP: 2071 case SIAtomicScope::WAVEFRONT: 2072 case SIAtomicScope::SINGLETHREAD: 2073 // The GDS keeps all memory operations in order for 2074 // the same work-group. 
2075 break; 2076 default: 2077 llvm_unreachable("Unsupported synchronization scope"); 2078 } 2079 } 2080 2081 if (VMCnt || LGKMCnt) { 2082 unsigned WaitCntImmediate = 2083 AMDGPU::encodeWaitcnt(IV, 2084 VMCnt ? 0 : getVmcntBitMask(IV), 2085 getExpcntBitMask(IV), 2086 LGKMCnt ? 0 : getLgkmcntBitMask(IV)); 2087 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) 2088 .addImm(WaitCntImmediate); 2089 Changed = true; 2090 } 2091 2092 if (VSCnt) { 2093 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) 2094 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 2095 .addImm(0); 2096 Changed = true; 2097 } 2098 2099 if (Pos == Position::AFTER) 2100 --MI; 2101 2102 return Changed; 2103 } 2104 2105 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 2106 SIAtomicScope Scope, 2107 SIAtomicAddrSpace AddrSpace, 2108 Position Pos) const { 2109 if (!InsertCacheInv) 2110 return false; 2111 2112 bool Changed = false; 2113 2114 MachineBasicBlock &MBB = *MI->getParent(); 2115 DebugLoc DL = MI->getDebugLoc(); 2116 2117 if (Pos == Position::AFTER) 2118 ++MI; 2119 2120 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2121 switch (Scope) { 2122 case SIAtomicScope::SYSTEM: 2123 case SIAtomicScope::AGENT: 2124 // The order of invalidates matter here. We must invalidate "outer in" 2125 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is 2126 // invalidated. 2127 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); 2128 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 2129 Changed = true; 2130 break; 2131 case SIAtomicScope::WORKGROUP: 2132 // In WGP mode the waves of a work-group can be executing on either CU of 2133 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise 2134 // in CU mode and all waves of a work-group are on the same CU, and so the 2135 // L0 does not need to be invalidated. 2136 if (!ST.isCuModeEnabled()) { 2137 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 2138 Changed = true; 2139 } 2140 break; 2141 case SIAtomicScope::WAVEFRONT: 2142 case SIAtomicScope::SINGLETHREAD: 2143 // No cache to invalidate. 2144 break; 2145 default: 2146 llvm_unreachable("Unsupported synchronization scope"); 2147 } 2148 } 2149 2150 /// The scratch address space does not need the global memory cache 2151 /// to be flushed as all memory operations by the same thread are 2152 /// sequentially consistent, and no other thread can access scratch 2153 /// memory. 2154 2155 /// Other address spaces do not have a cache. 2156 2157 if (Pos == Position::AFTER) 2158 --MI; 2159 2160 return Changed; 2161 } 2162 2163 bool SIGfx11CacheControl::enableLoadCacheBypass( 2164 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 2165 SIAtomicAddrSpace AddrSpace) const { 2166 assert(MI->mayLoad() && !MI->mayStore()); 2167 bool Changed = false; 2168 2169 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2170 switch (Scope) { 2171 case SIAtomicScope::SYSTEM: 2172 case SIAtomicScope::AGENT: 2173 // Set the L0 and L1 cache policies to MISS_EVICT. 2174 // Note: there is no L2 cache coherent bypass control at the ISA level. 2175 Changed |= enableGLCBit(MI); 2176 break; 2177 case SIAtomicScope::WORKGROUP: 2178 // In WGP mode the waves of a work-group can be executing on either CU of 2179 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in 2180 // CU mode all waves of a work-group are on the same CU, and so the L0 2181 // does not need to be bypassed. 
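      // Illustrative (assuming the "cumode" subtarget feature controls this):
      // compiling with -mattr=+cumode leaves the bit clear here, while the
      // default WGP mode sets glc on the load below.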
2182 if (!ST.isCuModeEnabled()) 2183 Changed |= enableGLCBit(MI); 2184 break; 2185 case SIAtomicScope::WAVEFRONT: 2186 case SIAtomicScope::SINGLETHREAD: 2187 // No cache to bypass. 2188 break; 2189 default: 2190 llvm_unreachable("Unsupported synchronization scope"); 2191 } 2192 } 2193 2194 /// The scratch address space does not need the global memory caches 2195 /// to be bypassed as all memory operations by the same thread are 2196 /// sequentially consistent, and no other thread can access scratch 2197 /// memory. 2198 2199 /// Other address spaces do not have a cache. 2200 2201 return Changed; 2202 } 2203 2204 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( 2205 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 2206 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { 2207 2208 // Only handle load and store, not atomic read-modify-write insructions. The 2209 // latter use glc to indicate if the atomic returns a result and so must not 2210 // be used for cache control. 2211 assert(MI->mayLoad() ^ MI->mayStore()); 2212 2213 // Only update load and store, not LLVM IR atomic read-modify-write 2214 // instructions. The latter are always marked as volatile so cannot sensibly 2215 // handle it as do not want to pessimize all atomics. Also they do not support 2216 // the nontemporal attribute. 2217 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 2218 2219 bool Changed = false; 2220 2221 if (IsVolatile) { 2222 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions 2223 // and MISS_LRU for store instructions. 2224 // Note: there is no L2 cache coherent bypass control at the ISA level. 2225 if (Op == SIMemOp::LOAD) 2226 Changed |= enableGLCBit(MI); 2227 2228 // Set MALL NOALLOC for load and store instructions. 2229 Changed |= enableDLCBit(MI); 2230 2231 // Ensure operation has completed at system scope to cause all volatile 2232 // operations to be visible outside the program in a global order. Do not 2233 // request cross address space as only the global address space can be 2234 // observable outside the program, so no need to cause a waitcnt for LDS 2235 // address space operations. 2236 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 2237 Position::AFTER); 2238 return Changed; 2239 } 2240 2241 if (IsNonTemporal) { 2242 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT 2243 // and L2 cache policy to STREAM. 2244 // For stores setting both GLC and SLC configures L0 and L1 cache policy 2245 // to MISS_EVICT and the L2 cache policy to STREAM. 2246 if (Op == SIMemOp::STORE) 2247 Changed |= enableGLCBit(MI); 2248 Changed |= enableSLCBit(MI); 2249 2250 // Set MALL NOALLOC for load and store instructions. 
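    // Illustrative combined effect on gfx11: a nontemporal store ends up with
    // glc, slc and dlc set, i.e. L0/L1 MISS_EVICT, L2 STREAM and MALL NOALLOC.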
2251 Changed |= enableDLCBit(MI); 2252 return Changed; 2253 } 2254 2255 return Changed; 2256 } 2257 2258 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, 2259 AMDGPU::CPol::CPol Value) const { 2260 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); 2261 if (!CPol) 2262 return false; 2263 2264 uint64_t NewTH = Value & AMDGPU::CPol::TH; 2265 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) { 2266 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH); 2267 return true; 2268 } 2269 2270 return false; 2271 } 2272 2273 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, 2274 AMDGPU::CPol::CPol Value) const { 2275 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); 2276 if (!CPol) 2277 return false; 2278 2279 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE; 2280 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) { 2281 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope); 2282 return true; 2283 } 2284 2285 return false; 2286 } 2287 2288 bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore( 2289 const MachineBasicBlock::iterator MI) const { 2290 // TODO: implement flag for frontend to give us a hint not to insert waits. 2291 2292 MachineBasicBlock &MBB = *MI->getParent(); 2293 const DebugLoc &DL = MI->getDebugLoc(); 2294 2295 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0); 2296 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0); 2297 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0); 2298 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0); 2299 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0); 2300 2301 return true; 2302 } 2303 2304 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, 2305 SIAtomicScope Scope, 2306 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 2307 bool IsCrossAddrSpaceOrdering, 2308 Position Pos) const { 2309 bool Changed = false; 2310 2311 MachineBasicBlock &MBB = *MI->getParent(); 2312 DebugLoc DL = MI->getDebugLoc(); 2313 2314 bool LOADCnt = false; 2315 bool DSCnt = false; 2316 bool STORECnt = false; 2317 2318 if (Pos == Position::AFTER) 2319 ++MI; 2320 2321 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 2322 SIAtomicAddrSpace::NONE) { 2323 switch (Scope) { 2324 case SIAtomicScope::SYSTEM: 2325 case SIAtomicScope::AGENT: 2326 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 2327 LOADCnt |= true; 2328 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2329 STORECnt |= true; 2330 break; 2331 case SIAtomicScope::WORKGROUP: 2332 // In WGP mode the waves of a work-group can be executing on either CU of 2333 // the WGP. Therefore need to wait for operations to complete to ensure 2334 // they are visible to waves in the other CU as the L0 is per CU. 2335 // Otherwise in CU mode and all waves of a work-group are on the same CU 2336 // which shares the same L0. 2337 if (!ST.isCuModeEnabled()) { 2338 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 2339 LOADCnt |= true; 2340 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 2341 STORECnt |= true; 2342 } 2343 break; 2344 case SIAtomicScope::WAVEFRONT: 2345 case SIAtomicScope::SINGLETHREAD: 2346 // The L0 cache keeps all memory operations in order for 2347 // work-items in the same wavefront. 
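      // (For the wider scopes above, the flags set there expand below into
      // separate "s_wait_bvhcnt 0", "s_wait_samplecnt 0", "s_wait_loadcnt 0"
      // and "s_wait_storecnt 0" instructions rather than a single s_waitcnt;
      // illustrative of the gfx12 split counters.)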
2348 break; 2349 default: 2350 llvm_unreachable("Unsupported synchronization scope"); 2351 } 2352 } 2353 2354 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 2355 switch (Scope) { 2356 case SIAtomicScope::SYSTEM: 2357 case SIAtomicScope::AGENT: 2358 case SIAtomicScope::WORKGROUP: 2359 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 2360 // not needed as LDS operations for all waves are executed in a total 2361 // global ordering as observed by all waves. Required if also 2362 // synchronizing with global/GDS memory as LDS operations could be 2363 // reordered with respect to later global/GDS memory operations of the 2364 // same wave. 2365 DSCnt |= IsCrossAddrSpaceOrdering; 2366 break; 2367 case SIAtomicScope::WAVEFRONT: 2368 case SIAtomicScope::SINGLETHREAD: 2369 // The LDS keeps all memory operations in order for 2370 // the same wavefront. 2371 break; 2372 default: 2373 llvm_unreachable("Unsupported synchronization scope"); 2374 } 2375 } 2376 2377 if (LOADCnt) { 2378 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); 2379 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); 2380 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0); 2381 Changed = true; 2382 } 2383 2384 if (STORECnt) { 2385 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0); 2386 Changed = true; 2387 } 2388 2389 if (DSCnt) { 2390 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0); 2391 Changed = true; 2392 } 2393 2394 if (Pos == Position::AFTER) 2395 --MI; 2396 2397 return Changed; 2398 } 2399 2400 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 2401 SIAtomicScope Scope, 2402 SIAtomicAddrSpace AddrSpace, 2403 Position Pos) const { 2404 if (!InsertCacheInv) 2405 return false; 2406 2407 MachineBasicBlock &MBB = *MI->getParent(); 2408 DebugLoc DL = MI->getDebugLoc(); 2409 2410 /// The scratch address space does not need the global memory cache 2411 /// to be flushed as all memory operations by the same thread are 2412 /// sequentially consistent, and no other thread can access scratch 2413 /// memory. 2414 2415 /// Other address spaces do not have a cache. 2416 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) 2417 return false; 2418 2419 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2420 switch (Scope) { 2421 case SIAtomicScope::SYSTEM: 2422 ScopeImm = AMDGPU::CPol::SCOPE_SYS; 2423 break; 2424 case SIAtomicScope::AGENT: 2425 ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2426 break; 2427 case SIAtomicScope::WORKGROUP: 2428 // In WGP mode the waves of a work-group can be executing on either CU of 2429 // the WGP. Therefore we need to invalidate the L0 which is per CU. 2430 // Otherwise in CU mode all waves of a work-group are on the same CU, and so 2431 // the L0 does not need to be invalidated. 2432 if (ST.isCuModeEnabled()) 2433 return false; 2434 2435 ScopeImm = AMDGPU::CPol::SCOPE_SE; 2436 break; 2437 case SIAtomicScope::WAVEFRONT: 2438 case SIAtomicScope::SINGLETHREAD: 2439 // No cache to invalidate. 
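    // (For the scopes handled above, the acquire lowers below to a single
    // "global_inv" with the chosen scope, e.g. scope:SCOPE_SYS for a
    // system-scope acquire; illustrative syntax.)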
2440 return false; 2441 default: 2442 llvm_unreachable("Unsupported synchronization scope"); 2443 } 2444 2445 if (Pos == Position::AFTER) 2446 ++MI; 2447 2448 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm); 2449 2450 if (Pos == Position::AFTER) 2451 --MI; 2452 2453 return true; 2454 } 2455 2456 bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 2457 SIAtomicScope Scope, 2458 SIAtomicAddrSpace AddrSpace, 2459 bool IsCrossAddrSpaceOrdering, 2460 Position Pos) const { 2461 MachineBasicBlock &MBB = *MI->getParent(); 2462 DebugLoc DL = MI->getDebugLoc(); 2463 2464 // The scratch address space does not need the global memory cache 2465 // writeback as all memory operations by the same thread are 2466 // sequentially consistent, and no other thread can access scratch 2467 // memory. 2468 2469 // Other address spaces do not have a cache. 2470 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) 2471 return false; 2472 2473 if (Pos == Position::AFTER) 2474 ++MI; 2475 2476 // GLOBAL_WB is always needed, even for write-through caches, as it 2477 // additionally ensures all operations have reached the desired cache level. 2478 bool SkipWB = false; 2479 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2480 switch (Scope) { 2481 case SIAtomicScope::SYSTEM: 2482 ScopeImm = AMDGPU::CPol::SCOPE_SYS; 2483 break; 2484 case SIAtomicScope::AGENT: 2485 ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2486 break; 2487 case SIAtomicScope::WORKGROUP: 2488 // In WGP mode the waves of a work-group can be executing on either CU of 2489 // the WGP. Therefore we need to ensure all operations have reached L1, 2490 // hence the SCOPE_SE WB. 2491 // For CU mode, we need operations to reach L0, so the wait is enough - 2492 // there are no ways for an operation to report completion without reaching 2493 // at least L0. 2494 if (ST.isCuModeEnabled()) 2495 SkipWB = true; 2496 else 2497 ScopeImm = AMDGPU::CPol::SCOPE_SE; 2498 break; 2499 case SIAtomicScope::WAVEFRONT: 2500 case SIAtomicScope::SINGLETHREAD: 2501 // No cache to invalidate. 2502 return false; 2503 default: 2504 llvm_unreachable("Unsupported synchronization scope"); 2505 } 2506 2507 if (!SkipWB) 2508 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm); 2509 2510 if (Pos == Position::AFTER) 2511 --MI; 2512 2513 // We always have to wait for previous memory operations (load/store) to 2514 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), 2515 // we of course need to wait for that as well. 2516 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 2517 IsCrossAddrSpaceOrdering, Pos); 2518 2519 return true; 2520 } 2521 2522 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( 2523 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 2524 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { 2525 2526 // Only handle load and store, not atomic read-modify-write instructions. 2527 assert(MI->mayLoad() ^ MI->mayStore()); 2528 2529 // Only update load and store, not LLVM IR atomic read-modify-write 2530 // instructions. The latter are always marked as volatile so cannot sensibly 2531 // handle it as do not want to pessimize all atomics. Also they do not support 2532 // the nontemporal attribute. 2533 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 2534 2535 bool Changed = false; 2536 2537 if (IsLastUse) { 2538 // Set last-use hint. 
2539 Changed |= setTH(MI, AMDGPU::CPol::TH_LU); 2540 } else if (IsNonTemporal) { 2541 // Set non-temporal hint for all cache levels. 2542 Changed |= setTH(MI, AMDGPU::CPol::TH_NT); 2543 } 2544 2545 if (IsVolatile) { 2546 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); 2547 2548 if (Op == SIMemOp::STORE) 2549 Changed |= insertWaitsBeforeSystemScopeStore(MI); 2550 2551 // Ensure operation has completed at system scope to cause all volatile 2552 // operations to be visible outside the program in a global order. Do not 2553 // request cross address space as only the global address space can be 2554 // observable outside the program, so no need to cause a waitcnt for LDS 2555 // address space operations. 2556 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 2557 Position::AFTER); 2558 } 2559 2560 return Changed; 2561 } 2562 2563 bool SIGfx12CacheControl::expandSystemScopeStore( 2564 MachineBasicBlock::iterator &MI) const { 2565 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); 2566 if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) 2567 return insertWaitsBeforeSystemScopeStore(MI); 2568 2569 return false; 2570 } 2571 2572 bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI, 2573 SIAtomicScope Scope, 2574 SIAtomicAddrSpace AddrSpace) const { 2575 bool Changed = false; 2576 2577 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2578 switch (Scope) { 2579 case SIAtomicScope::SYSTEM: 2580 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); 2581 break; 2582 case SIAtomicScope::AGENT: 2583 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV); 2584 break; 2585 case SIAtomicScope::WORKGROUP: 2586 // In workgroup mode, SCOPE_SE is needed as waves can executes on 2587 // different CUs that access different L0s. 2588 if (!ST.isCuModeEnabled()) 2589 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE); 2590 break; 2591 case SIAtomicScope::WAVEFRONT: 2592 case SIAtomicScope::SINGLETHREAD: 2593 // No cache to bypass. 2594 break; 2595 default: 2596 llvm_unreachable("Unsupported synchronization scope"); 2597 } 2598 } 2599 2600 // The scratch address space does not need the global memory caches 2601 // to be bypassed as all memory operations by the same thread are 2602 // sequentially consistent, and no other thread can access scratch 2603 // memory. 2604 2605 // Other address spaces do not have a cache. 
2606 2607 return Changed; 2608 } 2609 2610 bool SIMemoryLegalizer::removeAtomicPseudoMIs() { 2611 if (AtomicPseudoMIs.empty()) 2612 return false; 2613 2614 for (auto &MI : AtomicPseudoMIs) 2615 MI->eraseFromParent(); 2616 2617 AtomicPseudoMIs.clear(); 2618 return true; 2619 } 2620 2621 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, 2622 MachineBasicBlock::iterator &MI) { 2623 assert(MI->mayLoad() && !MI->mayStore()); 2624 2625 bool Changed = false; 2626 2627 if (MOI.isAtomic()) { 2628 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2629 MOI.getOrdering() == AtomicOrdering::Acquire || 2630 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2631 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(), 2632 MOI.getOrderingAddrSpace()); 2633 } 2634 2635 if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2636 Changed |= CC->insertWait(MI, MOI.getScope(), 2637 MOI.getOrderingAddrSpace(), 2638 SIMemOp::LOAD | SIMemOp::STORE, 2639 MOI.getIsCrossAddressSpaceOrdering(), 2640 Position::BEFORE); 2641 2642 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2643 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2644 Changed |= CC->insertWait(MI, MOI.getScope(), 2645 MOI.getInstrAddrSpace(), 2646 SIMemOp::LOAD, 2647 MOI.getIsCrossAddressSpaceOrdering(), 2648 Position::AFTER); 2649 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2650 MOI.getOrderingAddrSpace(), 2651 Position::AFTER); 2652 } 2653 2654 return Changed; 2655 } 2656 2657 // Atomic instructions already bypass caches to the scope specified by the 2658 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use 2659 // instructions need additional treatment. 2660 Changed |= CC->enableVolatileAndOrNonTemporal( 2661 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(), 2662 MOI.isNonTemporal(), MOI.isLastUse()); 2663 2664 return Changed; 2665 } 2666 2667 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, 2668 MachineBasicBlock::iterator &MI) { 2669 assert(!MI->mayLoad() && MI->mayStore()); 2670 2671 bool Changed = false; 2672 2673 if (MOI.isAtomic()) { 2674 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2675 MOI.getOrdering() == AtomicOrdering::Release || 2676 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2677 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(), 2678 MOI.getOrderingAddrSpace()); 2679 } 2680 2681 if (MOI.getOrdering() == AtomicOrdering::Release || 2682 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2683 Changed |= CC->insertRelease(MI, MOI.getScope(), 2684 MOI.getOrderingAddrSpace(), 2685 MOI.getIsCrossAddressSpaceOrdering(), 2686 Position::BEFORE); 2687 2688 return Changed; 2689 } 2690 2691 // Atomic instructions already bypass caches to the scope specified by the 2692 // SyncScope operand. Only non-atomic volatile and nontemporal instructions 2693 // need additional treatment. 2694 Changed |= CC->enableVolatileAndOrNonTemporal( 2695 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(), 2696 MOI.isNonTemporal()); 2697 2698 // GFX12 specific, scope(desired coherence domain in cache hierarchy) is 2699 // instruction field, do not confuse it with atomic scope. 
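  // For example (illustrative): if the store's cpol operand already carries
  // scope:SCOPE_SYS, the call below prepends the full "s_wait_*cnt 0"
  // sequence so all prior memory operations complete before the store.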
2700 Changed |= CC->expandSystemScopeStore(MI); 2701 return Changed; 2702 } 2703 2704 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, 2705 MachineBasicBlock::iterator &MI) { 2706 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE); 2707 2708 AtomicPseudoMIs.push_back(MI); 2709 bool Changed = false; 2710 2711 // Refine fenced address space based on MMRAs. 2712 // 2713 // TODO: Should we support this MMRA on other atomic operations? 2714 auto OrderingAddrSpace = 2715 getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace()); 2716 2717 if (MOI.isAtomic()) { 2718 if (MOI.getOrdering() == AtomicOrdering::Acquire) 2719 Changed |= CC->insertWait( 2720 MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 2721 MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE); 2722 2723 if (MOI.getOrdering() == AtomicOrdering::Release || 2724 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2725 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2726 /// TODO: This relies on a barrier always generating a waitcnt 2727 /// for LDS to ensure it is not reordered with the completion of 2728 /// the preceding LDS operations. If the barrier had a memory 2729 /// ordering and memory scope, then the library would not need to 2730 /// generate a fence. Could add support in this file for 2731 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally 2732 /// adding an S_WAITCNT before an S_BARRIER. 2733 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace, 2734 MOI.getIsCrossAddressSpaceOrdering(), 2735 Position::BEFORE); 2736 2737 // TODO: If both release and invalidate are happening they could be combined 2738 // to use the single "BUFFER_WBINV*" instruction. This could be done by 2739 // reorganizing this code or as part of optimizing the SIInsertWaitcnt pass 2740 // to track cache invalidate and write back instructions.
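  // For example (illustrative, assuming gfx90a and agent scope): the IR fence
  //   fence syncscope("agent") seq_cst
  // is expanded here to roughly
  //   s_waitcnt vmcnt(0) lgkmcnt(0)   ; insertRelease (wait only at this scope)
  //   buffer_wbinvl1_vol              ; insertAcquire via the GFX7 handling
  // in place of the ATOMIC_FENCE pseudo, which is deleted afterwards.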
2741 2742 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2743 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2744 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2745 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace, 2746 Position::BEFORE); 2747 2748 return Changed; 2749 } 2750 2751 return Changed; 2752 } 2753 2754 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 2755 MachineBasicBlock::iterator &MI) { 2756 assert(MI->mayLoad() && MI->mayStore()); 2757 2758 bool Changed = false; 2759 2760 if (MOI.isAtomic()) { 2761 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2762 MOI.getOrdering() == AtomicOrdering::Acquire || 2763 MOI.getOrdering() == AtomicOrdering::Release || 2764 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2765 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2766 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(), 2767 MOI.getInstrAddrSpace()); 2768 } 2769 2770 if (MOI.getOrdering() == AtomicOrdering::Release || 2771 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2772 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 2773 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) 2774 Changed |= CC->insertRelease(MI, MOI.getScope(), 2775 MOI.getOrderingAddrSpace(), 2776 MOI.getIsCrossAddressSpaceOrdering(), 2777 Position::BEFORE); 2778 2779 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2780 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2781 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 2782 MOI.getFailureOrdering() == AtomicOrdering::Acquire || 2783 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { 2784 Changed |= CC->insertWait(MI, MOI.getScope(), 2785 MOI.getInstrAddrSpace(), 2786 isAtomicRet(*MI) ? SIMemOp::LOAD : 2787 SIMemOp::STORE, 2788 MOI.getIsCrossAddressSpaceOrdering(), 2789 Position::AFTER); 2790 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2791 MOI.getOrderingAddrSpace(), 2792 Position::AFTER); 2793 } 2794 2795 return Changed; 2796 } 2797 2798 return Changed; 2799 } 2800 2801 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { 2802 bool Changed = false; 2803 2804 const MachineModuleInfo &MMI = 2805 getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); 2806 2807 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>()); 2808 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>()); 2809 2810 for (auto &MBB : MF) { 2811 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { 2812 2813 // Unbundle instructions after the post-RA scheduler. 
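    // (Illustrative rationale: the post-RA scheduler can bundle memory
    // instructions together; the per-instruction cache-policy and wait
    // insertion below needs to see each memory instruction individually, so
    // such bundles are flattened first.)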
2814 if (MI->isBundle() && MI->mayLoadOrStore()) { 2815 MachineBasicBlock::instr_iterator II(MI->getIterator()); 2816 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); 2817 I != E && I->isBundledWithPred(); ++I) { 2818 I->unbundleFromPred(); 2819 for (MachineOperand &MO : I->operands()) 2820 if (MO.isReg()) 2821 MO.setIsInternalRead(false); 2822 } 2823 2824 MI->eraseFromParent(); 2825 MI = II->getIterator(); 2826 } 2827 2828 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) 2829 continue; 2830 2831 if (const auto &MOI = MOA.getLoadInfo(MI)) 2832 Changed |= expandLoad(*MOI, MI); 2833 else if (const auto &MOI = MOA.getStoreInfo(MI)) { 2834 Changed |= expandStore(*MOI, MI); 2835 Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); 2836 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) 2837 Changed |= expandAtomicFence(*MOI, MI); 2838 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) 2839 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); 2840 } 2841 } 2842 2843 Changed |= removeAtomicPseudoMIs(); 2844 return Changed; 2845 } 2846 2847 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false) 2848 2849 char SIMemoryLegalizer::ID = 0; 2850 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID; 2851 2852 FunctionPass *llvm::createSIMemoryLegalizerPass() { 2853 return new SIMemoryLegalizer(); 2854 } 2855
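// A minimal way to exercise just this pass (illustrative; assumes an
// AMDGPU-enabled build of llc and a suitable MIR input file):
//   llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=si-memory-legalizer \
//       -verify-machineinstrs -o - memory-legalizer.mir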