1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Memory legalizer - implements memory model. More information can be 11 /// found here: 12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPU.h" 17 #include "AMDGPUMachineModuleInfo.h" 18 #include "GCNSubtarget.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "llvm/ADT/BitmaskEnum.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/CodeGen/MachineFunctionPass.h" 23 #include "llvm/IR/DiagnosticInfo.h" 24 #include "llvm/Support/AtomicOrdering.h" 25 #include "llvm/TargetParser/TargetParser.h" 26 27 using namespace llvm; 28 using namespace llvm::AMDGPU; 29 30 #define DEBUG_TYPE "si-memory-legalizer" 31 #define PASS_NAME "SI Memory Legalizer" 32 33 static cl::opt<bool> AmdgcnSkipCacheInvalidations( 34 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, 35 cl::desc("Use this to skip inserting cache invalidating instructions.")); 36 37 namespace { 38 39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); 40 41 /// Memory operation flags. Can be ORed together. 42 enum class SIMemOp { 43 NONE = 0u, 44 LOAD = 1u << 0, 45 STORE = 1u << 1, 46 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) 47 }; 48 49 /// Position to insert a new instruction relative to an existing 50 /// instruction. 51 enum class Position { 52 BEFORE, 53 AFTER 54 }; 55 56 /// The atomic synchronization scopes supported by the AMDGPU target. 57 enum class SIAtomicScope { 58 NONE, 59 SINGLETHREAD, 60 WAVEFRONT, 61 WORKGROUP, 62 AGENT, 63 SYSTEM 64 }; 65 66 /// The distinct address spaces supported by the AMDGPU target for 67 /// atomic memory operation. Can be ORed together. 68 enum class SIAtomicAddrSpace { 69 NONE = 0u, 70 GLOBAL = 1u << 0, 71 LDS = 1u << 1, 72 SCRATCH = 1u << 2, 73 GDS = 1u << 3, 74 OTHER = 1u << 4, 75 76 /// The address spaces that can be accessed by a FLAT instruction. 77 FLAT = GLOBAL | LDS | SCRATCH, 78 79 /// The address spaces that support atomic instructions. 80 ATOMIC = GLOBAL | LDS | SCRATCH | GDS, 81 82 /// All address spaces. 
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsVolatile(IsVolatile),
      IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
176 SIAtomicAddrSpace getInstrAddrSpace() const { 177 return InstrAddrSpace; 178 } 179 180 /// \returns The address spaces that must be ordered by the machine 181 /// instruction used to create this SIMemOpInfo. 182 SIAtomicAddrSpace getOrderingAddrSpace() const { 183 return OrderingAddrSpace; 184 } 185 186 /// \returns Return true iff memory ordering of operations on 187 /// different address spaces is required. 188 bool getIsCrossAddressSpaceOrdering() const { 189 return IsCrossAddressSpaceOrdering; 190 } 191 192 /// \returns True if memory access of the machine instruction used to 193 /// create this SIMemOpInfo is volatile, false otherwise. 194 bool isVolatile() const { 195 return IsVolatile; 196 } 197 198 /// \returns True if memory access of the machine instruction used to 199 /// create this SIMemOpInfo is nontemporal, false otherwise. 200 bool isNonTemporal() const { 201 return IsNonTemporal; 202 } 203 204 /// \returns True if ordering constraint of the machine instruction used to 205 /// create this SIMemOpInfo is unordered or higher, false otherwise. 206 bool isAtomic() const { 207 return Ordering != AtomicOrdering::NotAtomic; 208 } 209 210 }; 211 212 class SIMemOpAccess final { 213 private: 214 AMDGPUMachineModuleInfo *MMI = nullptr; 215 216 /// Reports unsupported message \p Msg for \p MI to LLVM context. 217 void reportUnsupported(const MachineBasicBlock::iterator &MI, 218 const char *Msg) const; 219 220 /// Inspects the target synchronization scope \p SSID and determines 221 /// the SI atomic scope it corresponds to, the address spaces it 222 /// covers, and whether the memory ordering applies between address 223 /// spaces. 224 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 225 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const; 226 227 /// \return Return a bit set of the address spaces accessed by \p AS. 228 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; 229 230 /// \returns Info constructed from \p MI, which has at least machine memory 231 /// operand. 232 std::optional<SIMemOpInfo> 233 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const; 234 235 public: 236 /// Construct class to support accessing the machine memory operands 237 /// of instructions in the machine function \p MF. 238 SIMemOpAccess(MachineFunction &MF); 239 240 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise. 241 std::optional<SIMemOpInfo> 242 getLoadInfo(const MachineBasicBlock::iterator &MI) const; 243 244 /// \returns Store info if \p MI is a store operation, "std::nullopt" 245 /// otherwise. 246 std::optional<SIMemOpInfo> 247 getStoreInfo(const MachineBasicBlock::iterator &MI) const; 248 249 /// \returns Atomic fence info if \p MI is an atomic fence operation, 250 /// "std::nullopt" otherwise. 251 std::optional<SIMemOpInfo> 252 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const; 253 254 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or 255 /// rmw operation, "std::nullopt" otherwise. 256 std::optional<SIMemOpInfo> 257 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const; 258 }; 259 260 class SICacheControl { 261 protected: 262 263 /// AMDGPU subtarget info. 264 const GCNSubtarget &ST; 265 266 /// Instruction info. 267 const SIInstrInfo *TII = nullptr; 268 269 IsaVersion IV; 270 271 /// Whether to insert cache invalidating instructions. 
272 bool InsertCacheInv; 273 274 SICacheControl(const GCNSubtarget &ST); 275 276 /// Sets named bit \p BitName to "true" if present in instruction \p MI. 277 /// \returns Returns true if \p MI is modified, false otherwise. 278 bool enableNamedBit(const MachineBasicBlock::iterator MI, 279 AMDGPU::CPol::CPol Bit) const; 280 281 public: 282 283 /// Create a cache control for the subtarget \p ST. 284 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); 285 286 /// Update \p MI memory load instruction to bypass any caches up to 287 /// the \p Scope memory scope for address spaces \p 288 /// AddrSpace. Return true iff the instruction was modified. 289 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 290 SIAtomicScope Scope, 291 SIAtomicAddrSpace AddrSpace) const = 0; 292 293 /// Update \p MI memory store instruction to bypass any caches up to 294 /// the \p Scope memory scope for address spaces \p 295 /// AddrSpace. Return true iff the instruction was modified. 296 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 297 SIAtomicScope Scope, 298 SIAtomicAddrSpace AddrSpace) const = 0; 299 300 /// Update \p MI memory read-modify-write instruction to bypass any caches up 301 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true 302 /// iff the instruction was modified. 303 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 304 SIAtomicScope Scope, 305 SIAtomicAddrSpace AddrSpace) const = 0; 306 307 /// Update \p MI memory instruction of kind \p Op associated with address 308 /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return 309 /// true iff the instruction was modified. 310 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 311 SIAtomicAddrSpace AddrSpace, 312 SIMemOp Op, bool IsVolatile, 313 bool IsNonTemporal) const = 0; 314 315 /// Inserts any necessary instructions at position \p Pos relative 316 /// to instruction \p MI to ensure memory instructions before \p Pos of kind 317 /// \p Op associated with address spaces \p AddrSpace have completed. Used 318 /// between memory instructions to enforce the order they become visible as 319 /// observed by other memory instructions executing in memory scope \p Scope. 320 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between 321 /// address spaces. Returns true iff any instructions inserted. 322 virtual bool insertWait(MachineBasicBlock::iterator &MI, 323 SIAtomicScope Scope, 324 SIAtomicAddrSpace AddrSpace, 325 SIMemOp Op, 326 bool IsCrossAddrSpaceOrdering, 327 Position Pos) const = 0; 328 329 /// Inserts any necessary instructions at position \p Pos relative to 330 /// instruction \p MI to ensure any subsequent memory instructions of this 331 /// thread with address spaces \p AddrSpace will observe the previous memory 332 /// operations by any thread for memory scopes up to memory scope \p Scope . 333 /// Returns true iff any instructions inserted. 334 virtual bool insertAcquire(MachineBasicBlock::iterator &MI, 335 SIAtomicScope Scope, 336 SIAtomicAddrSpace AddrSpace, 337 Position Pos) const = 0; 338 339 /// Inserts any necessary instructions at position \p Pos relative to 340 /// instruction \p MI to ensure previous memory instructions by this thread 341 /// with address spaces \p AddrSpace have completed and can be observed by 342 /// subsequent memory instructions by any thread executing in memory scope \p 343 /// Scope. 
\p IsCrossAddrSpaceOrdering indicates if the memory ordering is 344 /// between address spaces. Returns true iff any instructions inserted. 345 virtual bool insertRelease(MachineBasicBlock::iterator &MI, 346 SIAtomicScope Scope, 347 SIAtomicAddrSpace AddrSpace, 348 bool IsCrossAddrSpaceOrdering, 349 Position Pos) const = 0; 350 351 /// Virtual destructor to allow derivations to be deleted. 352 virtual ~SICacheControl() = default; 353 354 virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, 355 MachineBasicBlock::iterator &MI) const { 356 return false; 357 } 358 }; 359 360 class SIGfx6CacheControl : public SICacheControl { 361 protected: 362 363 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI 364 /// is modified, false otherwise. 365 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { 366 return enableNamedBit(MI, AMDGPU::CPol::GLC); 367 } 368 369 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI 370 /// is modified, false otherwise. 371 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { 372 return enableNamedBit(MI, AMDGPU::CPol::SLC); 373 } 374 375 public: 376 377 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} 378 379 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 380 SIAtomicScope Scope, 381 SIAtomicAddrSpace AddrSpace) const override; 382 383 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 384 SIAtomicScope Scope, 385 SIAtomicAddrSpace AddrSpace) const override; 386 387 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 388 SIAtomicScope Scope, 389 SIAtomicAddrSpace AddrSpace) const override; 390 391 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 392 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 393 bool IsVolatile, 394 bool IsNonTemporal) const override; 395 396 bool insertWait(MachineBasicBlock::iterator &MI, 397 SIAtomicScope Scope, 398 SIAtomicAddrSpace AddrSpace, 399 SIMemOp Op, 400 bool IsCrossAddrSpaceOrdering, 401 Position Pos) const override; 402 403 bool insertAcquire(MachineBasicBlock::iterator &MI, 404 SIAtomicScope Scope, 405 SIAtomicAddrSpace AddrSpace, 406 Position Pos) const override; 407 408 bool insertRelease(MachineBasicBlock::iterator &MI, 409 SIAtomicScope Scope, 410 SIAtomicAddrSpace AddrSpace, 411 bool IsCrossAddrSpaceOrdering, 412 Position Pos) const override; 413 }; 414 415 class SIGfx7CacheControl : public SIGfx6CacheControl { 416 public: 417 418 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} 419 420 bool insertAcquire(MachineBasicBlock::iterator &MI, 421 SIAtomicScope Scope, 422 SIAtomicAddrSpace AddrSpace, 423 Position Pos) const override; 424 425 }; 426 427 class SIGfx90ACacheControl : public SIGfx7CacheControl { 428 public: 429 430 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} 431 432 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 433 SIAtomicScope Scope, 434 SIAtomicAddrSpace AddrSpace) const override; 435 436 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 437 SIAtomicScope Scope, 438 SIAtomicAddrSpace AddrSpace) const override; 439 440 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 441 SIAtomicScope Scope, 442 SIAtomicAddrSpace AddrSpace) const override; 443 444 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 445 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 446 bool IsVolatile, 447 bool IsNonTemporal) const override; 448 449 bool insertWait(MachineBasicBlock::iterator &MI, 
450 SIAtomicScope Scope, 451 SIAtomicAddrSpace AddrSpace, 452 SIMemOp Op, 453 bool IsCrossAddrSpaceOrdering, 454 Position Pos) const override; 455 456 bool insertAcquire(MachineBasicBlock::iterator &MI, 457 SIAtomicScope Scope, 458 SIAtomicAddrSpace AddrSpace, 459 Position Pos) const override; 460 461 bool insertRelease(MachineBasicBlock::iterator &MI, 462 SIAtomicScope Scope, 463 SIAtomicAddrSpace AddrSpace, 464 bool IsCrossAddrSpaceOrdering, 465 Position Pos) const override; 466 }; 467 468 class SIGfx940CacheControl : public SIGfx90ACacheControl { 469 protected: 470 471 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI 472 /// is modified, false otherwise. 473 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { 474 return enableNamedBit(MI, AMDGPU::CPol::SC0); 475 } 476 477 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI 478 /// is modified, false otherwise. 479 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { 480 return enableNamedBit(MI, AMDGPU::CPol::SC1); 481 } 482 483 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI 484 /// is modified, false otherwise. 485 bool enableNTBit(const MachineBasicBlock::iterator &MI) const { 486 return enableNamedBit(MI, AMDGPU::CPol::NT); 487 } 488 489 public: 490 491 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; 492 493 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 494 SIAtomicScope Scope, 495 SIAtomicAddrSpace AddrSpace) const override; 496 497 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 498 SIAtomicScope Scope, 499 SIAtomicAddrSpace AddrSpace) const override; 500 501 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 502 SIAtomicScope Scope, 503 SIAtomicAddrSpace AddrSpace) const override; 504 505 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 506 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 507 bool IsVolatile, 508 bool IsNonTemporal) const override; 509 510 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 511 SIAtomicAddrSpace AddrSpace, Position Pos) const override; 512 513 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 514 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, 515 Position Pos) const override; 516 517 bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, 518 MachineBasicBlock::iterator &MI) const override { 519 bool Changed = false; 520 if (ST.hasForceStoreSC0SC1() && 521 (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH | 522 SIAtomicAddrSpace::GLOBAL | 523 SIAtomicAddrSpace::OTHER)) != 524 SIAtomicAddrSpace::NONE) { 525 Changed |= enableSC0Bit(MI); 526 Changed |= enableSC1Bit(MI); 527 } 528 return Changed; 529 } 530 }; 531 532 class SIGfx10CacheControl : public SIGfx7CacheControl { 533 protected: 534 535 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI 536 /// is modified, false otherwise. 
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;
};

class SIGfx12CacheControl : public SIGfx11CacheControl {
public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
622 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 623 MachineBasicBlock::iterator &MI); 624 625 public: 626 static char ID; 627 628 SIMemoryLegalizer() : MachineFunctionPass(ID) {} 629 630 void getAnalysisUsage(AnalysisUsage &AU) const override { 631 AU.setPreservesCFG(); 632 MachineFunctionPass::getAnalysisUsage(AU); 633 } 634 635 StringRef getPassName() const override { 636 return PASS_NAME; 637 } 638 639 bool runOnMachineFunction(MachineFunction &MF) override; 640 }; 641 642 } // end namespace anonymous 643 644 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, 645 const char *Msg) const { 646 const Function &Func = MI->getParent()->getParent()->getFunction(); 647 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc()); 648 Func.getContext().diagnose(Diag); 649 } 650 651 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 652 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, 653 SIAtomicAddrSpace InstrAddrSpace) const { 654 if (SSID == SyncScope::System) 655 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true); 656 if (SSID == MMI->getAgentSSID()) 657 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true); 658 if (SSID == MMI->getWorkgroupSSID()) 659 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC, 660 true); 661 if (SSID == MMI->getWavefrontSSID()) 662 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC, 663 true); 664 if (SSID == SyncScope::SingleThread) 665 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC, 666 true); 667 if (SSID == MMI->getSystemOneAddressSpaceSSID()) 668 return std::tuple(SIAtomicScope::SYSTEM, 669 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 670 if (SSID == MMI->getAgentOneAddressSpaceSSID()) 671 return std::tuple(SIAtomicScope::AGENT, 672 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 673 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) 674 return std::tuple(SIAtomicScope::WORKGROUP, 675 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 676 if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) 677 return std::tuple(SIAtomicScope::WAVEFRONT, 678 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 679 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) 680 return std::tuple(SIAtomicScope::SINGLETHREAD, 681 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 682 return std::nullopt; 683 } 684 685 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { 686 if (AS == AMDGPUAS::FLAT_ADDRESS) 687 return SIAtomicAddrSpace::FLAT; 688 if (AS == AMDGPUAS::GLOBAL_ADDRESS) 689 return SIAtomicAddrSpace::GLOBAL; 690 if (AS == AMDGPUAS::LOCAL_ADDRESS) 691 return SIAtomicAddrSpace::LDS; 692 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 693 return SIAtomicAddrSpace::SCRATCH; 694 if (AS == AMDGPUAS::REGION_ADDRESS) 695 return SIAtomicAddrSpace::GDS; 696 697 return SIAtomicAddrSpace::OTHER; 698 } 699 700 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) { 701 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); 702 } 703 704 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( 705 const MachineBasicBlock::iterator &MI) const { 706 assert(MI->getNumMemOperands() > 0); 707 708 SyncScope::ID SSID = SyncScope::SingleThread; 709 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 710 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 711 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 712 bool IsNonTemporal = true; 713 bool IsVolatile = false; 714 715 // Validator 
should check whether or not MMOs cover the entire set of 716 // locations accessed by the memory instruction. 717 for (const auto &MMO : MI->memoperands()) { 718 IsNonTemporal &= MMO->isNonTemporal(); 719 IsVolatile |= MMO->isVolatile(); 720 InstrAddrSpace |= 721 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); 722 AtomicOrdering OpOrdering = MMO->getSuccessOrdering(); 723 if (OpOrdering != AtomicOrdering::NotAtomic) { 724 const auto &IsSyncScopeInclusion = 725 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); 726 if (!IsSyncScopeInclusion) { 727 reportUnsupported(MI, 728 "Unsupported non-inclusive atomic synchronization scope"); 729 return std::nullopt; 730 } 731 732 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID(); 733 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering); 734 assert(MMO->getFailureOrdering() != AtomicOrdering::Release && 735 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); 736 FailureOrdering = 737 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering()); 738 } 739 } 740 741 SIAtomicScope Scope = SIAtomicScope::NONE; 742 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 743 bool IsCrossAddressSpaceOrdering = false; 744 if (Ordering != AtomicOrdering::NotAtomic) { 745 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); 746 if (!ScopeOrNone) { 747 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 748 return std::nullopt; 749 } 750 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 751 *ScopeOrNone; 752 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 753 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || 754 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { 755 reportUnsupported(MI, "Unsupported atomic address space"); 756 return std::nullopt; 757 } 758 } 759 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, 760 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, 761 IsNonTemporal); 762 } 763 764 std::optional<SIMemOpInfo> 765 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const { 766 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 767 768 if (!(MI->mayLoad() && !MI->mayStore())) 769 return std::nullopt; 770 771 // Be conservative if there are no memory operands. 772 if (MI->getNumMemOperands() == 0) 773 return SIMemOpInfo(); 774 775 return constructFromMIWithMMO(MI); 776 } 777 778 std::optional<SIMemOpInfo> 779 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const { 780 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 781 782 if (!(!MI->mayLoad() && MI->mayStore())) 783 return std::nullopt; 784 785 // Be conservative if there are no memory operands. 
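  // A default-constructed SIMemOpInfo is the most conservative choice here:
  // per the constructor defaults above it assumes sequentially consistent
  // ordering, system scope, all atomic address spaces, and cross address
  // space ordering.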
786 if (MI->getNumMemOperands() == 0) 787 return SIMemOpInfo(); 788 789 return constructFromMIWithMMO(MI); 790 } 791 792 std::optional<SIMemOpInfo> 793 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { 794 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 795 796 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) 797 return std::nullopt; 798 799 AtomicOrdering Ordering = 800 static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); 801 802 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); 803 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC); 804 if (!ScopeOrNone) { 805 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 806 return std::nullopt; 807 } 808 809 SIAtomicScope Scope = SIAtomicScope::NONE; 810 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 811 bool IsCrossAddressSpaceOrdering = false; 812 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 813 *ScopeOrNone; 814 815 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 816 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { 817 reportUnsupported(MI, "Unsupported atomic address space"); 818 return std::nullopt; 819 } 820 821 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, 822 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); 823 } 824 825 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( 826 const MachineBasicBlock::iterator &MI) const { 827 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 828 829 if (!(MI->mayLoad() && MI->mayStore())) 830 return std::nullopt; 831 832 // Be conservative if there are no memory operands. 833 if (MI->getNumMemOperands() == 0) 834 return SIMemOpInfo(); 835 836 return constructFromMIWithMMO(MI); 837 } 838 839 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { 840 TII = ST.getInstrInfo(); 841 IV = getIsaVersion(ST.getCPU()); 842 InsertCacheInv = !AmdgcnSkipCacheInvalidations; 843 } 844 845 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, 846 AMDGPU::CPol::CPol Bit) const { 847 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); 848 if (!CPol) 849 return false; 850 851 CPol->setImm(CPol->getImm() | Bit); 852 return true; 853 } 854 855 /* static */ 856 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { 857 GCNSubtarget::Generation Generation = ST.getGeneration(); 858 if (ST.hasGFX940Insts()) 859 return std::make_unique<SIGfx940CacheControl>(ST); 860 if (ST.hasGFX90AInsts()) 861 return std::make_unique<SIGfx90ACacheControl>(ST); 862 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) 863 return std::make_unique<SIGfx6CacheControl>(ST); 864 if (Generation < AMDGPUSubtarget::GFX10) 865 return std::make_unique<SIGfx7CacheControl>(ST); 866 if (Generation < AMDGPUSubtarget::GFX11) 867 return std::make_unique<SIGfx10CacheControl>(ST); 868 if (Generation < AMDGPUSubtarget::GFX12) 869 return std::make_unique<SIGfx11CacheControl>(ST); 870 return std::make_unique<SIGfx12CacheControl>(ST); 871 } 872 873 bool SIGfx6CacheControl::enableLoadCacheBypass( 874 const MachineBasicBlock::iterator &MI, 875 SIAtomicScope Scope, 876 SIAtomicAddrSpace AddrSpace) const { 877 assert(MI->mayLoad() && !MI->mayStore()); 878 bool Changed = false; 879 880 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 881 switch (Scope) { 882 case SIAtomicScope::SYSTEM: 883 case SIAtomicScope::AGENT: 884 // Set L1 cache 
      // policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is no
  /// bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // be handled here, as we do not want to pessimize all atomics. Also they do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
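    // Illustrative effect only (assembly syntax approximate): the access keeps
    // its normal encoding and is simply emitted with both cache policy
    // modifiers set, e.g. a global load becomes roughly
    // "buffer_load_dword ... glc slc".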
973 Changed |= enableGLCBit(MI); 974 Changed |= enableSLCBit(MI); 975 return Changed; 976 } 977 978 return Changed; 979 } 980 981 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, 982 SIAtomicScope Scope, 983 SIAtomicAddrSpace AddrSpace, 984 SIMemOp Op, 985 bool IsCrossAddrSpaceOrdering, 986 Position Pos) const { 987 bool Changed = false; 988 989 MachineBasicBlock &MBB = *MI->getParent(); 990 DebugLoc DL = MI->getDebugLoc(); 991 992 if (Pos == Position::AFTER) 993 ++MI; 994 995 bool VMCnt = false; 996 bool LGKMCnt = false; 997 998 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 999 SIAtomicAddrSpace::NONE) { 1000 switch (Scope) { 1001 case SIAtomicScope::SYSTEM: 1002 case SIAtomicScope::AGENT: 1003 VMCnt |= true; 1004 break; 1005 case SIAtomicScope::WORKGROUP: 1006 case SIAtomicScope::WAVEFRONT: 1007 case SIAtomicScope::SINGLETHREAD: 1008 // The L1 cache keeps all memory operations in order for 1009 // wavefronts in the same work-group. 1010 break; 1011 default: 1012 llvm_unreachable("Unsupported synchronization scope"); 1013 } 1014 } 1015 1016 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 1017 switch (Scope) { 1018 case SIAtomicScope::SYSTEM: 1019 case SIAtomicScope::AGENT: 1020 case SIAtomicScope::WORKGROUP: 1021 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 1022 // not needed as LDS operations for all waves are executed in a total 1023 // global ordering as observed by all waves. Required if also 1024 // synchronizing with global/GDS memory as LDS operations could be 1025 // reordered with respect to later global/GDS memory operations of the 1026 // same wave. 1027 LGKMCnt |= IsCrossAddrSpaceOrdering; 1028 break; 1029 case SIAtomicScope::WAVEFRONT: 1030 case SIAtomicScope::SINGLETHREAD: 1031 // The LDS keeps all memory operations in order for 1032 // the same wavefront. 1033 break; 1034 default: 1035 llvm_unreachable("Unsupported synchronization scope"); 1036 } 1037 } 1038 1039 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 1040 switch (Scope) { 1041 case SIAtomicScope::SYSTEM: 1042 case SIAtomicScope::AGENT: 1043 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" 1044 // is not needed as GDS operations for all waves are executed in a total 1045 // global ordering as observed by all waves. Required if also 1046 // synchronizing with global/LDS memory as GDS operations could be 1047 // reordered with respect to later global/LDS memory operations of the 1048 // same wave. 1049 LGKMCnt |= IsCrossAddrSpaceOrdering; 1050 break; 1051 case SIAtomicScope::WORKGROUP: 1052 case SIAtomicScope::WAVEFRONT: 1053 case SIAtomicScope::SINGLETHREAD: 1054 // The GDS keeps all memory operations in order for 1055 // the same work-group. 1056 break; 1057 default: 1058 llvm_unreachable("Unsupported synchronization scope"); 1059 } 1060 } 1061 1062 if (VMCnt || LGKMCnt) { 1063 unsigned WaitCntImmediate = 1064 AMDGPU::encodeWaitcnt(IV, 1065 VMCnt ? 0 : getVmcntBitMask(IV), 1066 getExpcntBitMask(IV), 1067 LGKMCnt ? 
0 : getLgkmcntBitMask(IV)); 1068 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) 1069 .addImm(WaitCntImmediate); 1070 Changed = true; 1071 } 1072 1073 if (Pos == Position::AFTER) 1074 --MI; 1075 1076 return Changed; 1077 } 1078 1079 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1080 SIAtomicScope Scope, 1081 SIAtomicAddrSpace AddrSpace, 1082 Position Pos) const { 1083 if (!InsertCacheInv) 1084 return false; 1085 1086 bool Changed = false; 1087 1088 MachineBasicBlock &MBB = *MI->getParent(); 1089 DebugLoc DL = MI->getDebugLoc(); 1090 1091 if (Pos == Position::AFTER) 1092 ++MI; 1093 1094 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1095 switch (Scope) { 1096 case SIAtomicScope::SYSTEM: 1097 case SIAtomicScope::AGENT: 1098 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); 1099 Changed = true; 1100 break; 1101 case SIAtomicScope::WORKGROUP: 1102 case SIAtomicScope::WAVEFRONT: 1103 case SIAtomicScope::SINGLETHREAD: 1104 // No cache to invalidate. 1105 break; 1106 default: 1107 llvm_unreachable("Unsupported synchronization scope"); 1108 } 1109 } 1110 1111 /// The scratch address space does not need the global memory cache 1112 /// to be flushed as all memory operations by the same thread are 1113 /// sequentially consistent, and no other thread can access scratch 1114 /// memory. 1115 1116 /// Other address spaces do not have a cache. 1117 1118 if (Pos == Position::AFTER) 1119 --MI; 1120 1121 return Changed; 1122 } 1123 1124 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1125 SIAtomicScope Scope, 1126 SIAtomicAddrSpace AddrSpace, 1127 bool IsCrossAddrSpaceOrdering, 1128 Position Pos) const { 1129 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1130 IsCrossAddrSpaceOrdering, Pos); 1131 } 1132 1133 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1134 SIAtomicScope Scope, 1135 SIAtomicAddrSpace AddrSpace, 1136 Position Pos) const { 1137 if (!InsertCacheInv) 1138 return false; 1139 1140 bool Changed = false; 1141 1142 MachineBasicBlock &MBB = *MI->getParent(); 1143 DebugLoc DL = MI->getDebugLoc(); 1144 1145 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); 1146 1147 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() 1148 ? AMDGPU::BUFFER_WBINVL1 1149 : AMDGPU::BUFFER_WBINVL1_VOL; 1150 1151 if (Pos == Position::AFTER) 1152 ++MI; 1153 1154 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1155 switch (Scope) { 1156 case SIAtomicScope::SYSTEM: 1157 case SIAtomicScope::AGENT: 1158 BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); 1159 Changed = true; 1160 break; 1161 case SIAtomicScope::WORKGROUP: 1162 case SIAtomicScope::WAVEFRONT: 1163 case SIAtomicScope::SINGLETHREAD: 1164 // No cache to invalidate. 1165 break; 1166 default: 1167 llvm_unreachable("Unsupported synchronization scope"); 1168 } 1169 } 1170 1171 /// The scratch address space does not need the global memory cache 1172 /// to be flushed as all memory operations by the same thread are 1173 /// sequentially consistent, and no other thread can access scratch 1174 /// memory. 1175 1176 /// Other address spaces do not have a cache. 
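  // Sketch of the typical overall acquire expansion (not produced by this
  // function alone): for an agent-scope acquire load, the load is followed by
  // an "S_WAITCNT vmcnt(0)" from insertWait and then the L1 invalidate emitted
  // above, so later loads cannot hit stale L1 cache lines.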
1177 1178 if (Pos == Position::AFTER) 1179 --MI; 1180 1181 return Changed; 1182 } 1183 1184 bool SIGfx90ACacheControl::enableLoadCacheBypass( 1185 const MachineBasicBlock::iterator &MI, 1186 SIAtomicScope Scope, 1187 SIAtomicAddrSpace AddrSpace) const { 1188 assert(MI->mayLoad() && !MI->mayStore()); 1189 bool Changed = false; 1190 1191 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1192 switch (Scope) { 1193 case SIAtomicScope::SYSTEM: 1194 case SIAtomicScope::AGENT: 1195 // Set the L1 cache policy to MISS_LRU. 1196 // Note: there is no L2 cache bypass policy at the ISA level. 1197 Changed |= enableGLCBit(MI); 1198 break; 1199 case SIAtomicScope::WORKGROUP: 1200 // In threadgroup split mode the waves of a work-group can be executing on 1201 // different CUs. Therefore need to bypass the L1 which is per CU. 1202 // Otherwise in non-threadgroup split mode all waves of a work-group are 1203 // on the same CU, and so the L1 does not need to be bypassed. 1204 if (ST.isTgSplitEnabled()) 1205 Changed |= enableGLCBit(MI); 1206 break; 1207 case SIAtomicScope::WAVEFRONT: 1208 case SIAtomicScope::SINGLETHREAD: 1209 // No cache to bypass. 1210 break; 1211 default: 1212 llvm_unreachable("Unsupported synchronization scope"); 1213 } 1214 } 1215 1216 /// The scratch address space does not need the global memory caches 1217 /// to be bypassed as all memory operations by the same thread are 1218 /// sequentially consistent, and no other thread can access scratch 1219 /// memory. 1220 1221 /// Other address spaces do not have a cache. 1222 1223 return Changed; 1224 } 1225 1226 bool SIGfx90ACacheControl::enableStoreCacheBypass( 1227 const MachineBasicBlock::iterator &MI, 1228 SIAtomicScope Scope, 1229 SIAtomicAddrSpace AddrSpace) const { 1230 assert(!MI->mayLoad() && MI->mayStore()); 1231 bool Changed = false; 1232 1233 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1234 switch (Scope) { 1235 case SIAtomicScope::SYSTEM: 1236 case SIAtomicScope::AGENT: 1237 /// Do not set glc for store atomic operations as they implicitly write 1238 /// through the L1 cache. 1239 break; 1240 case SIAtomicScope::WORKGROUP: 1241 case SIAtomicScope::WAVEFRONT: 1242 case SIAtomicScope::SINGLETHREAD: 1243 // No cache to bypass. Store atomics implicitly write through the L1 1244 // cache. 1245 break; 1246 default: 1247 llvm_unreachable("Unsupported synchronization scope"); 1248 } 1249 } 1250 1251 /// The scratch address space does not need the global memory caches 1252 /// to be bypassed as all memory operations by the same thread are 1253 /// sequentially consistent, and no other thread can access scratch 1254 /// memory. 1255 1256 /// Other address spaces do not have a cache. 1257 1258 return Changed; 1259 } 1260 1261 bool SIGfx90ACacheControl::enableRMWCacheBypass( 1262 const MachineBasicBlock::iterator &MI, 1263 SIAtomicScope Scope, 1264 SIAtomicAddrSpace AddrSpace) const { 1265 assert(MI->mayLoad() && MI->mayStore()); 1266 bool Changed = false; 1267 1268 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1269 switch (Scope) { 1270 case SIAtomicScope::SYSTEM: 1271 case SIAtomicScope::AGENT: 1272 /// Do not set glc for RMW atomic operations as they implicitly bypass 1273 /// the L1 cache, and the glc bit is instead used to indicate if they are 1274 /// return or no-return. 1275 break; 1276 case SIAtomicScope::WORKGROUP: 1277 case SIAtomicScope::WAVEFRONT: 1278 case SIAtomicScope::SINGLETHREAD: 1279 // No cache to bypass. 
      // RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // be handled here, as we do not want to pessimize all atomics. Also they do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory operations
    // to complete to ensure they are visible to waves in the other CUs.
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
    // on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
1356 AddrSpace &= ~SIAtomicAddrSpace::LDS; 1357 } 1358 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, 1359 IsCrossAddrSpaceOrdering, Pos); 1360 } 1361 1362 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1363 SIAtomicScope Scope, 1364 SIAtomicAddrSpace AddrSpace, 1365 Position Pos) const { 1366 if (!InsertCacheInv) 1367 return false; 1368 1369 bool Changed = false; 1370 1371 MachineBasicBlock &MBB = *MI->getParent(); 1372 DebugLoc DL = MI->getDebugLoc(); 1373 1374 if (Pos == Position::AFTER) 1375 ++MI; 1376 1377 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1378 switch (Scope) { 1379 case SIAtomicScope::SYSTEM: 1380 // Ensures that following loads will not see stale remote VMEM data or 1381 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and 1382 // CC will never be stale due to the local memory probes. 1383 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); 1384 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the 1385 // hardware does not reorder memory operations by the same wave with 1386 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to 1387 // remove any cache lines of earlier writes by the same wave and ensures 1388 // later reads by the same wave will refetch the cache lines. 1389 Changed = true; 1390 break; 1391 case SIAtomicScope::AGENT: 1392 // Same as GFX7. 1393 break; 1394 case SIAtomicScope::WORKGROUP: 1395 // In threadgroup split mode the waves of a work-group can be executing on 1396 // different CUs. Therefore need to invalidate the L1 which is per CU. 1397 // Otherwise in non-threadgroup split mode all waves of a work-group are 1398 // on the same CU, and so the L1 does not need to be invalidated. 1399 if (ST.isTgSplitEnabled()) { 1400 // Same as GFX7 using agent scope. 1401 Scope = SIAtomicScope::AGENT; 1402 } 1403 break; 1404 case SIAtomicScope::WAVEFRONT: 1405 case SIAtomicScope::SINGLETHREAD: 1406 // Same as GFX7. 1407 break; 1408 default: 1409 llvm_unreachable("Unsupported synchronization scope"); 1410 } 1411 } 1412 1413 /// The scratch address space does not need the global memory cache 1414 /// to be flushed as all memory operations by the same thread are 1415 /// sequentially consistent, and no other thread can access scratch 1416 /// memory. 1417 1418 /// Other address spaces do not have a cache. 1419 1420 if (Pos == Position::AFTER) 1421 --MI; 1422 1423 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); 1424 1425 return Changed; 1426 } 1427 1428 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1429 SIAtomicScope Scope, 1430 SIAtomicAddrSpace AddrSpace, 1431 bool IsCrossAddrSpaceOrdering, 1432 Position Pos) const { 1433 bool Changed = false; 1434 1435 MachineBasicBlock &MBB = *MI->getParent(); 1436 const DebugLoc &DL = MI->getDebugLoc(); 1437 1438 if (Pos == Position::AFTER) 1439 ++MI; 1440 1441 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1442 switch (Scope) { 1443 case SIAtomicScope::SYSTEM: 1444 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1445 // hardware does not reorder memory operations by the same wave with 1446 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1447 // to initiate writeback of any dirty cache lines of earlier writes by the 1448 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1449 // writeback has completed. 
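      // Sketch of the resulting system-scope release sequence: the
      // "BUFFER_WBL2" built below, then the "S_WAITCNT vmcnt(0)" produced by
      // the GFX7 insertRelease path invoked at the end of this function.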
1450 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1451 // Set SC bits to indicate system scope. 1452 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1453 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT 1454 // vmcnt(0)" needed by the "BUFFER_WBL2". 1455 Changed = true; 1456 break; 1457 case SIAtomicScope::AGENT: 1458 case SIAtomicScope::WORKGROUP: 1459 case SIAtomicScope::WAVEFRONT: 1460 case SIAtomicScope::SINGLETHREAD: 1461 // Same as GFX7. 1462 break; 1463 default: 1464 llvm_unreachable("Unsupported synchronization scope"); 1465 } 1466 } 1467 1468 if (Pos == Position::AFTER) 1469 --MI; 1470 1471 Changed |= 1472 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, 1473 IsCrossAddrSpaceOrdering, Pos); 1474 1475 return Changed; 1476 } 1477 1478 bool SIGfx940CacheControl::enableLoadCacheBypass( 1479 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 1480 SIAtomicAddrSpace AddrSpace) const { 1481 assert(MI->mayLoad() && !MI->mayStore()); 1482 bool Changed = false; 1483 1484 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1485 switch (Scope) { 1486 case SIAtomicScope::SYSTEM: 1487 // Set SC bits to indicate system scope. 1488 Changed |= enableSC0Bit(MI); 1489 Changed |= enableSC1Bit(MI); 1490 break; 1491 case SIAtomicScope::AGENT: 1492 // Set SC bits to indicate agent scope. 1493 Changed |= enableSC1Bit(MI); 1494 break; 1495 case SIAtomicScope::WORKGROUP: 1496 // In threadgroup split mode the waves of a work-group can be executing on 1497 // different CUs. Therefore need to bypass the L1 which is per CU. 1498 // Otherwise in non-threadgroup split mode all waves of a work-group are 1499 // on the same CU, and so the L1 does not need to be bypassed. Setting SC 1500 // bits to indicate work-group scope will do this automatically. 1501 Changed |= enableSC0Bit(MI); 1502 break; 1503 case SIAtomicScope::WAVEFRONT: 1504 case SIAtomicScope::SINGLETHREAD: 1505 // Leave SC bits unset to indicate wavefront scope. 1506 break; 1507 default: 1508 llvm_unreachable("Unsupported synchronization scope"); 1509 } 1510 } 1511 1512 /// The scratch address space does not need the global memory caches 1513 /// to be bypassed as all memory operations by the same thread are 1514 /// sequentially consistent, and no other thread can access scratch 1515 /// memory. 1516 1517 /// Other address spaces do not have a cache. 1518 1519 return Changed; 1520 } 1521 1522 bool SIGfx940CacheControl::enableStoreCacheBypass( 1523 const MachineBasicBlock::iterator &MI, 1524 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { 1525 assert(!MI->mayLoad() && MI->mayStore()); 1526 bool Changed = false; 1527 1528 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1529 switch (Scope) { 1530 case SIAtomicScope::SYSTEM: 1531 // Set SC bits to indicate system scope. 1532 Changed |= enableSC0Bit(MI); 1533 Changed |= enableSC1Bit(MI); 1534 break; 1535 case SIAtomicScope::AGENT: 1536 // Set SC bits to indicate agent scope. 1537 Changed |= enableSC1Bit(MI); 1538 break; 1539 case SIAtomicScope::WORKGROUP: 1540 // Set SC bits to indicate workgroup scope. 1541 Changed |= enableSC0Bit(MI); 1542 break; 1543 case SIAtomicScope::WAVEFRONT: 1544 case SIAtomicScope::SINGLETHREAD: 1545 // Leave SC bits unset to indicate wavefront scope. 
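      // (Summary of the gfx940 SC encoding used in this switch: SC0|SC1 =
      // system scope, SC1 = agent scope, SC0 = work-group scope, and neither
      // bit set = wavefront/single-thread scope.)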
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx940CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC1 bit to indicate system scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // RMW atomic operations implicitly bypass the L1 cache and only use SC1
      // to indicate system or agent scope. The SC0 bit is used to indicate if
      // they are return or no-return. Leave SC1 bit unset to indicate agent
      // scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // be handled here, as we do not want to pessimize all atomics. Also they do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set SC bits to indicate system scope.
    Changed |= enableSC0Bit(MI);
    Changed |= enableSC1Bit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
1617     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1618                           Position::AFTER);
1619
1620     return Changed;
1621   }
1622
1623   if (IsNonTemporal) {
1624     Changed |= enableNTBit(MI);
1625     return Changed;
1626   }
1627
1628   return Changed;
1629 }
1630
1631 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1632                                          SIAtomicScope Scope,
1633                                          SIAtomicAddrSpace AddrSpace,
1634                                          Position Pos) const {
1635   if (!InsertCacheInv)
1636     return false;
1637
1638   bool Changed = false;
1639
1640   MachineBasicBlock &MBB = *MI->getParent();
1641   DebugLoc DL = MI->getDebugLoc();
1642
1643   if (Pos == Position::AFTER)
1644     ++MI;
1645
1646   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1647     switch (Scope) {
1648     case SIAtomicScope::SYSTEM:
1649       // Ensures that following loads will not see stale remote VMEM data or
1650       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1651       // CC will never be stale due to the local memory probes.
1652       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1653           // Set SC bits to indicate system scope.
1654           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1655       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1656       // hardware does not reorder memory operations by the same wave with
1657       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1658       // remove any cache lines of earlier writes by the same wave and ensures
1659       // later reads by the same wave will refetch the cache lines.
1660       Changed = true;
1661       break;
1662     case SIAtomicScope::AGENT:
1663       // Ensures that following loads will not see stale remote data or local
1664       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1665       // due to the memory probes.
1666       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1667           // Set SC bits to indicate agent scope.
1668           .addImm(AMDGPU::CPol::SC1);
1669       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1670       // does not reorder memory operations with respect to a preceding buffer
1671       // invalidate. The invalidate is guaranteed to remove any cache lines of
1672       // earlier writes and ensures later reads will refetch the cache lines.
1673       Changed = true;
1674       break;
1675     case SIAtomicScope::WORKGROUP:
1676       // In threadgroup split mode the waves of a work-group can be executing on
1677       // different CUs. Therefore need to invalidate the L1 which is per CU.
1678       // Otherwise in non-threadgroup split mode all waves of a work-group are
1679       // on the same CU, and so the L1 does not need to be invalidated.
1680       if (ST.isTgSplitEnabled()) {
1681         // Ensures L1 is invalidated if in threadgroup split mode. In
1682         // non-threadgroup split mode it is a NOP, but there is no point
1683         // generating it when that mode is known not to be in use.
1684         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1685             // Set SC bits to indicate work-group scope.
1686             .addImm(AMDGPU::CPol::SC0);
1687         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1688         // does not reorder memory operations with respect to a preceding buffer
1689         // invalidate. The invalidate is guaranteed to remove any cache lines of
1690         // earlier writes and ensures later reads will refetch the cache lines.
1691         Changed = true;
1692       }
1693       break;
1694     case SIAtomicScope::WAVEFRONT:
1695     case SIAtomicScope::SINGLETHREAD:
1696       // Could generate "BUFFER_INV" but it would do nothing as there are no
1697       // caches to invalidate.
1698 break; 1699 default: 1700 llvm_unreachable("Unsupported synchronization scope"); 1701 } 1702 } 1703 1704 /// The scratch address space does not need the global memory cache 1705 /// to be flushed as all memory operations by the same thread are 1706 /// sequentially consistent, and no other thread can access scratch 1707 /// memory. 1708 1709 /// Other address spaces do not have a cache. 1710 1711 if (Pos == Position::AFTER) 1712 --MI; 1713 1714 return Changed; 1715 } 1716 1717 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1718 SIAtomicScope Scope, 1719 SIAtomicAddrSpace AddrSpace, 1720 bool IsCrossAddrSpaceOrdering, 1721 Position Pos) const { 1722 bool Changed = false; 1723 1724 MachineBasicBlock &MBB = *MI->getParent(); 1725 DebugLoc DL = MI->getDebugLoc(); 1726 1727 if (Pos == Position::AFTER) 1728 ++MI; 1729 1730 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1731 switch (Scope) { 1732 case SIAtomicScope::SYSTEM: 1733 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1734 // hardware does not reorder memory operations by the same wave with 1735 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1736 // to initiate writeback of any dirty cache lines of earlier writes by the 1737 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1738 // writeback has completed. 1739 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1740 // Set SC bits to indicate system scope. 1741 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1742 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 1743 // SIAtomicScope::SYSTEM, the following insertWait will generate the 1744 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". 1745 Changed = true; 1746 break; 1747 case SIAtomicScope::AGENT: 1748 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1749 // Set SC bits to indicate agent scope. 1750 .addImm(AMDGPU::CPol::SC1); 1751 1752 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 1753 // SIAtomicScope::AGENT, the following insertWait will generate the 1754 // required "S_WAITCNT vmcnt(0)". 1755 Changed = true; 1756 break; 1757 case SIAtomicScope::WORKGROUP: 1758 case SIAtomicScope::WAVEFRONT: 1759 case SIAtomicScope::SINGLETHREAD: 1760 // Do not generate "BUFFER_WBL2" as there are no caches it would 1761 // writeback, and would require an otherwise unnecessary 1762 // "S_WAITCNT vmcnt(0)". 1763 break; 1764 default: 1765 llvm_unreachable("Unsupported synchronization scope"); 1766 } 1767 } 1768 1769 if (Pos == Position::AFTER) 1770 --MI; 1771 1772 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other 1773 // S_WAITCNT needed. 1774 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1775 IsCrossAddrSpaceOrdering, Pos); 1776 1777 return Changed; 1778 } 1779 1780 bool SIGfx10CacheControl::enableLoadCacheBypass( 1781 const MachineBasicBlock::iterator &MI, 1782 SIAtomicScope Scope, 1783 SIAtomicAddrSpace AddrSpace) const { 1784 assert(MI->mayLoad() && !MI->mayStore()); 1785 bool Changed = false; 1786 1787 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1788 switch (Scope) { 1789 case SIAtomicScope::SYSTEM: 1790 case SIAtomicScope::AGENT: 1791 // Set the L0 and L1 cache policies to MISS_EVICT. 1792 // Note: there is no L2 cache coherent bypass control at the ISA level. 
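      // For example, an agent- or system-scope atomic load is expected to end
      // up roughly as "global_load_dword ... glc dlc" (illustrative; the exact
      // opcode depends on the selected instruction).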
1793       Changed |= enableGLCBit(MI);
1794       Changed |= enableDLCBit(MI);
1795       break;
1796     case SIAtomicScope::WORKGROUP:
1797       // In WGP mode the waves of a work-group can be executing on either CU of
1798       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1799       // CU mode all waves of a work-group are on the same CU, and so the L0
1800       // does not need to be bypassed.
1801       if (!ST.isCuModeEnabled())
1802         Changed |= enableGLCBit(MI);
1803       break;
1804     case SIAtomicScope::WAVEFRONT:
1805     case SIAtomicScope::SINGLETHREAD:
1806       // No cache to bypass.
1807       break;
1808     default:
1809       llvm_unreachable("Unsupported synchronization scope");
1810     }
1811   }
1812
1813   /// The scratch address space does not need the global memory caches
1814   /// to be bypassed as all memory operations by the same thread are
1815   /// sequentially consistent, and no other thread can access scratch
1816   /// memory.
1817
1818   /// Other address spaces do not have a cache.
1819
1820   return Changed;
1821 }
1822
1823 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1824     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1825     bool IsVolatile, bool IsNonTemporal) const {
1826
1827   // Only handle load and store, not atomic read-modify-write instructions. The
1828   // latter use glc to indicate if the atomic returns a result and so must not
1829   // be used for cache control.
1830   assert(MI->mayLoad() ^ MI->mayStore());
1831
1832   // Only update load and store, not LLVM IR atomic read-modify-write
1833   // instructions. The latter are always marked as volatile, so they cannot
1834   // sensibly be handled here without pessimizing all atomics. They also do not
1835   // support the nontemporal attribute.
1836   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1837
1838   bool Changed = false;
1839
1840   if (IsVolatile) {
1841     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1842     // and MISS_LRU for store instructions.
1843     // Note: there is no L2 cache coherent bypass control at the ISA level.
1844     if (Op == SIMemOp::LOAD) {
1845       Changed |= enableGLCBit(MI);
1846       Changed |= enableDLCBit(MI);
1847     }
1848
1849     // Ensure operation has completed at system scope to cause all volatile
1850     // operations to be visible outside the program in a global order. Do not
1851     // request cross address space as only the global address space can be
1852     // observable outside the program, so no need to cause a waitcnt for LDS
1853     // address space operations.
1854     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1855                           Position::AFTER);
1856     return Changed;
1857   }
1858
1859   if (IsNonTemporal) {
1860     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1861     // and L2 cache policy to STREAM.
1862     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1863     // to MISS_EVICT and the L2 cache policy to STREAM.
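    // For example, a nontemporal global store is expected to end up roughly as
    // "global_store_dword ... glc slc" (illustrative only).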
1864 if (Op == SIMemOp::STORE) 1865 Changed |= enableGLCBit(MI); 1866 Changed |= enableSLCBit(MI); 1867 1868 return Changed; 1869 } 1870 1871 return Changed; 1872 } 1873 1874 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, 1875 SIAtomicScope Scope, 1876 SIAtomicAddrSpace AddrSpace, 1877 SIMemOp Op, 1878 bool IsCrossAddrSpaceOrdering, 1879 Position Pos) const { 1880 bool Changed = false; 1881 1882 MachineBasicBlock &MBB = *MI->getParent(); 1883 DebugLoc DL = MI->getDebugLoc(); 1884 1885 if (Pos == Position::AFTER) 1886 ++MI; 1887 1888 bool VMCnt = false; 1889 bool VSCnt = false; 1890 bool LGKMCnt = false; 1891 1892 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 1893 SIAtomicAddrSpace::NONE) { 1894 switch (Scope) { 1895 case SIAtomicScope::SYSTEM: 1896 case SIAtomicScope::AGENT: 1897 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 1898 VMCnt |= true; 1899 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 1900 VSCnt |= true; 1901 break; 1902 case SIAtomicScope::WORKGROUP: 1903 // In WGP mode the waves of a work-group can be executing on either CU of 1904 // the WGP. Therefore need to wait for operations to complete to ensure 1905 // they are visible to waves in the other CU as the L0 is per CU. 1906 // Otherwise in CU mode and all waves of a work-group are on the same CU 1907 // which shares the same L0. 1908 if (!ST.isCuModeEnabled()) { 1909 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 1910 VMCnt |= true; 1911 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 1912 VSCnt |= true; 1913 } 1914 break; 1915 case SIAtomicScope::WAVEFRONT: 1916 case SIAtomicScope::SINGLETHREAD: 1917 // The L0 cache keeps all memory operations in order for 1918 // work-items in the same wavefront. 1919 break; 1920 default: 1921 llvm_unreachable("Unsupported synchronization scope"); 1922 } 1923 } 1924 1925 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 1926 switch (Scope) { 1927 case SIAtomicScope::SYSTEM: 1928 case SIAtomicScope::AGENT: 1929 case SIAtomicScope::WORKGROUP: 1930 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 1931 // not needed as LDS operations for all waves are executed in a total 1932 // global ordering as observed by all waves. Required if also 1933 // synchronizing with global/GDS memory as LDS operations could be 1934 // reordered with respect to later global/GDS memory operations of the 1935 // same wave. 1936 LGKMCnt |= IsCrossAddrSpaceOrdering; 1937 break; 1938 case SIAtomicScope::WAVEFRONT: 1939 case SIAtomicScope::SINGLETHREAD: 1940 // The LDS keeps all memory operations in order for 1941 // the same wavefront. 1942 break; 1943 default: 1944 llvm_unreachable("Unsupported synchronization scope"); 1945 } 1946 } 1947 1948 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 1949 switch (Scope) { 1950 case SIAtomicScope::SYSTEM: 1951 case SIAtomicScope::AGENT: 1952 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" 1953 // is not needed as GDS operations for all waves are executed in a total 1954 // global ordering as observed by all waves. Required if also 1955 // synchronizing with global/LDS memory as GDS operations could be 1956 // reordered with respect to later global/LDS memory operations of the 1957 // same wave. 1958 LGKMCnt |= IsCrossAddrSpaceOrdering; 1959 break; 1960 case SIAtomicScope::WORKGROUP: 1961 case SIAtomicScope::WAVEFRONT: 1962 case SIAtomicScope::SINGLETHREAD: 1963 // The GDS keeps all memory operations in order for 1964 // the same work-group. 
1965 break; 1966 default: 1967 llvm_unreachable("Unsupported synchronization scope"); 1968 } 1969 } 1970 1971 if (VMCnt || LGKMCnt) { 1972 unsigned WaitCntImmediate = 1973 AMDGPU::encodeWaitcnt(IV, 1974 VMCnt ? 0 : getVmcntBitMask(IV), 1975 getExpcntBitMask(IV), 1976 LGKMCnt ? 0 : getLgkmcntBitMask(IV)); 1977 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) 1978 .addImm(WaitCntImmediate); 1979 Changed = true; 1980 } 1981 1982 if (VSCnt) { 1983 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) 1984 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1985 .addImm(0); 1986 Changed = true; 1987 } 1988 1989 if (Pos == Position::AFTER) 1990 --MI; 1991 1992 return Changed; 1993 } 1994 1995 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1996 SIAtomicScope Scope, 1997 SIAtomicAddrSpace AddrSpace, 1998 Position Pos) const { 1999 if (!InsertCacheInv) 2000 return false; 2001 2002 bool Changed = false; 2003 2004 MachineBasicBlock &MBB = *MI->getParent(); 2005 DebugLoc DL = MI->getDebugLoc(); 2006 2007 if (Pos == Position::AFTER) 2008 ++MI; 2009 2010 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2011 switch (Scope) { 2012 case SIAtomicScope::SYSTEM: 2013 case SIAtomicScope::AGENT: 2014 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 2015 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); 2016 Changed = true; 2017 break; 2018 case SIAtomicScope::WORKGROUP: 2019 // In WGP mode the waves of a work-group can be executing on either CU of 2020 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise 2021 // in CU mode and all waves of a work-group are on the same CU, and so the 2022 // L0 does not need to be invalidated. 2023 if (!ST.isCuModeEnabled()) { 2024 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 2025 Changed = true; 2026 } 2027 break; 2028 case SIAtomicScope::WAVEFRONT: 2029 case SIAtomicScope::SINGLETHREAD: 2030 // No cache to invalidate. 2031 break; 2032 default: 2033 llvm_unreachable("Unsupported synchronization scope"); 2034 } 2035 } 2036 2037 /// The scratch address space does not need the global memory cache 2038 /// to be flushed as all memory operations by the same thread are 2039 /// sequentially consistent, and no other thread can access scratch 2040 /// memory. 2041 2042 /// Other address spaces do not have a cache. 2043 2044 if (Pos == Position::AFTER) 2045 --MI; 2046 2047 return Changed; 2048 } 2049 2050 bool SIGfx11CacheControl::enableLoadCacheBypass( 2051 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 2052 SIAtomicAddrSpace AddrSpace) const { 2053 assert(MI->mayLoad() && !MI->mayStore()); 2054 bool Changed = false; 2055 2056 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2057 switch (Scope) { 2058 case SIAtomicScope::SYSTEM: 2059 case SIAtomicScope::AGENT: 2060 // Set the L0 and L1 cache policies to MISS_EVICT. 2061 // Note: there is no L2 cache coherent bypass control at the ISA level. 2062 Changed |= enableGLCBit(MI); 2063 break; 2064 case SIAtomicScope::WORKGROUP: 2065 // In WGP mode the waves of a work-group can be executing on either CU of 2066 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in 2067 // CU mode all waves of a work-group are on the same CU, and so the L0 2068 // does not need to be bypassed. 2069 if (!ST.isCuModeEnabled()) 2070 Changed |= enableGLCBit(MI); 2071 break; 2072 case SIAtomicScope::WAVEFRONT: 2073 case SIAtomicScope::SINGLETHREAD: 2074 // No cache to bypass. 
2075       break;
2076     default:
2077       llvm_unreachable("Unsupported synchronization scope");
2078     }
2079   }
2080
2081   /// The scratch address space does not need the global memory caches
2082   /// to be bypassed as all memory operations by the same thread are
2083   /// sequentially consistent, and no other thread can access scratch
2084   /// memory.
2085
2086   /// Other address spaces do not have a cache.
2087
2088   return Changed;
2089 }
2090
2091 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2092     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2093     bool IsVolatile, bool IsNonTemporal) const {
2094
2095   // Only handle load and store, not atomic read-modify-write instructions. The
2096   // latter use glc to indicate if the atomic returns a result and so must not
2097   // be used for cache control.
2098   assert(MI->mayLoad() ^ MI->mayStore());
2099
2100   // Only update load and store, not LLVM IR atomic read-modify-write
2101   // instructions. The latter are always marked as volatile, so they cannot
2102   // sensibly be handled here without pessimizing all atomics. They also do not
2103   // support the nontemporal attribute.
2104   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2105
2106   bool Changed = false;
2107
2108   if (IsVolatile) {
2109     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2110     // and MISS_LRU for store instructions.
2111     // Note: there is no L2 cache coherent bypass control at the ISA level.
2112     if (Op == SIMemOp::LOAD)
2113       Changed |= enableGLCBit(MI);
2114
2115     // Set MALL NOALLOC for load and store instructions.
2116     Changed |= enableDLCBit(MI);
2117
2118     // Ensure operation has completed at system scope to cause all volatile
2119     // operations to be visible outside the program in a global order. Do not
2120     // request cross address space as only the global address space can be
2121     // observable outside the program, so no need to cause a waitcnt for LDS
2122     // address space operations.
2123     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2124                           Position::AFTER);
2125     return Changed;
2126   }
2127
2128   if (IsNonTemporal) {
2129     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2130     // and L2 cache policy to STREAM.
2131     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2132     // to MISS_EVICT and the L2 cache policy to STREAM.
2133     if (Op == SIMemOp::STORE)
2134       Changed |= enableGLCBit(MI);
2135     Changed |= enableSLCBit(MI);
2136
2137     // Set MALL NOALLOC for load and store instructions.
2138     Changed |= enableDLCBit(MI);
2139     return Changed;
2140   }
2141
2142   return Changed;
2143 }
2144
2145 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2146                                         SIAtomicScope Scope,
2147                                         SIAtomicAddrSpace AddrSpace,
2148                                         Position Pos) const {
2149   if (!InsertCacheInv)
2150     return false;
2151
2152   MachineBasicBlock &MBB = *MI->getParent();
2153   DebugLoc DL = MI->getDebugLoc();
2154
2155   /// The scratch address space does not need the global memory cache
2156   /// to be flushed as all memory operations by the same thread are
2157   /// sequentially consistent, and no other thread can access scratch
2158   /// memory.
2159
2160   /// Other address spaces do not have a cache.
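  // On GFX12 a single scoped "GLOBAL_INV" is emitted below rather than the
  // separate "BUFFER_GL0_INV"/"BUFFER_GL1_INV" pair used on GFX10; the scope
  // operand selects the coherence level of the invalidate.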
2161 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) 2162 return false; 2163 2164 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2165 switch (Scope) { 2166 case SIAtomicScope::SYSTEM: 2167 ScopeImm = AMDGPU::CPol::SCOPE_SYS; 2168 break; 2169 case SIAtomicScope::AGENT: 2170 ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2171 break; 2172 case SIAtomicScope::WORKGROUP: 2173 // In WGP mode the waves of a work-group can be executing on either CU of 2174 // the WGP. Therefore we need to invalidate the L0 which is per CU. 2175 // Otherwise in CU mode all waves of a work-group are on the same CU, and so 2176 // the L0 does not need to be invalidated. 2177 if (ST.isCuModeEnabled()) 2178 return false; 2179 2180 ScopeImm = AMDGPU::CPol::SCOPE_SE; 2181 break; 2182 case SIAtomicScope::WAVEFRONT: 2183 case SIAtomicScope::SINGLETHREAD: 2184 // No cache to invalidate. 2185 return false; 2186 default: 2187 llvm_unreachable("Unsupported synchronization scope"); 2188 } 2189 2190 if (Pos == Position::AFTER) 2191 ++MI; 2192 2193 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm); 2194 2195 if (Pos == Position::AFTER) 2196 --MI; 2197 2198 return true; 2199 } 2200 2201 bool SIMemoryLegalizer::removeAtomicPseudoMIs() { 2202 if (AtomicPseudoMIs.empty()) 2203 return false; 2204 2205 for (auto &MI : AtomicPseudoMIs) 2206 MI->eraseFromParent(); 2207 2208 AtomicPseudoMIs.clear(); 2209 return true; 2210 } 2211 2212 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, 2213 MachineBasicBlock::iterator &MI) { 2214 assert(MI->mayLoad() && !MI->mayStore()); 2215 2216 bool Changed = false; 2217 2218 if (MOI.isAtomic()) { 2219 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2220 MOI.getOrdering() == AtomicOrdering::Acquire || 2221 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2222 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(), 2223 MOI.getOrderingAddrSpace()); 2224 } 2225 2226 if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2227 Changed |= CC->insertWait(MI, MOI.getScope(), 2228 MOI.getOrderingAddrSpace(), 2229 SIMemOp::LOAD | SIMemOp::STORE, 2230 MOI.getIsCrossAddressSpaceOrdering(), 2231 Position::BEFORE); 2232 2233 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2234 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2235 Changed |= CC->insertWait(MI, MOI.getScope(), 2236 MOI.getInstrAddrSpace(), 2237 SIMemOp::LOAD, 2238 MOI.getIsCrossAddressSpaceOrdering(), 2239 Position::AFTER); 2240 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2241 MOI.getOrderingAddrSpace(), 2242 Position::AFTER); 2243 } 2244 2245 return Changed; 2246 } 2247 2248 // Atomic instructions already bypass caches to the scope specified by the 2249 // SyncScope operand. Only non-atomic volatile and nontemporal instructions 2250 // need additional treatment. 
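  // For example, a non-atomic volatile global load is given system-scope cache
  // bypass bits and a trailing wait by the target cache control (see the
  // enableVolatileAndOrNonTemporal implementations above); nontemporal loads
  // only get their cache policy bits adjusted.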
2251   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2252                                                 SIMemOp::LOAD, MOI.isVolatile(),
2253                                                 MOI.isNonTemporal());
2254   return Changed;
2255 }
2256
2257 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2258                                     MachineBasicBlock::iterator &MI) {
2259   assert(!MI->mayLoad() && MI->mayStore());
2260
2261   bool Changed = false;
2262
2263   if (MOI.isAtomic()) {
2264     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2265         MOI.getOrdering() == AtomicOrdering::Release ||
2266         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2267       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2268                                             MOI.getOrderingAddrSpace());
2269     }
2270
2271     if (MOI.getOrdering() == AtomicOrdering::Release ||
2272         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2273       Changed |= CC->insertRelease(MI, MOI.getScope(),
2274                                    MOI.getOrderingAddrSpace(),
2275                                    MOI.getIsCrossAddressSpaceOrdering(),
2276                                    Position::BEFORE);
2277
2278     return Changed;
2279   }
2280
2281   // Atomic instructions already bypass caches to the scope specified by the
2282   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2283   // need additional treatment.
2284   Changed |= CC->enableVolatileAndOrNonTemporal(
2285       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2286       MOI.isNonTemporal());
2287   return Changed;
2288 }
2289
2290 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2291                                           MachineBasicBlock::iterator &MI) {
2292   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2293
2294   AtomicPseudoMIs.push_back(MI);
2295   bool Changed = false;
2296
2297   if (MOI.isAtomic()) {
2298     if (MOI.getOrdering() == AtomicOrdering::Acquire)
2299       Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2300                                 SIMemOp::LOAD | SIMemOp::STORE,
2301                                 MOI.getIsCrossAddressSpaceOrdering(),
2302                                 Position::BEFORE);
2303
2304     if (MOI.getOrdering() == AtomicOrdering::Release ||
2305         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2306         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2307       /// TODO: This relies on a barrier always generating a waitcnt
2308       /// for LDS to ensure it is not reordered with the completion of
2309       /// the preceding LDS operations. If the barrier had a memory
2310       /// ordering and memory scope, then the library would not need to
2311       /// generate a fence. Could add support in this file for
2312       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2313       /// adding S_WAITCNT before a S_BARRIER.
2314       Changed |= CC->insertRelease(MI, MOI.getScope(),
2315                                    MOI.getOrderingAddrSpace(),
2316                                    MOI.getIsCrossAddressSpaceOrdering(),
2317                                    Position::BEFORE);
2318
2319     // TODO: If both release and invalidate are happening they could be combined
2320     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2321     // reorganizing this code or as part of optimizing the SIInsertWaitcnt pass
2322     // to track cache invalidate and write back instructions.
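    // For example, on gfx940 an agent-scope acquire fence is expected to expand
    // to an "S_WAITCNT" (from the insertWait above) followed by a "BUFFER_INV"
    // with SC1 set (from the insertAcquire below); the exact counters and cache
    // operations are target dependent.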
2323 2324 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2325 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2326 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2327 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2328 MOI.getOrderingAddrSpace(), 2329 Position::BEFORE); 2330 2331 return Changed; 2332 } 2333 2334 return Changed; 2335 } 2336 2337 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 2338 MachineBasicBlock::iterator &MI) { 2339 assert(MI->mayLoad() && MI->mayStore()); 2340 2341 bool Changed = false; 2342 2343 if (MOI.isAtomic()) { 2344 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2345 MOI.getOrdering() == AtomicOrdering::Acquire || 2346 MOI.getOrdering() == AtomicOrdering::Release || 2347 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2348 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2349 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(), 2350 MOI.getInstrAddrSpace()); 2351 } 2352 2353 if (MOI.getOrdering() == AtomicOrdering::Release || 2354 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2355 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 2356 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) 2357 Changed |= CC->insertRelease(MI, MOI.getScope(), 2358 MOI.getOrderingAddrSpace(), 2359 MOI.getIsCrossAddressSpaceOrdering(), 2360 Position::BEFORE); 2361 2362 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2363 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2364 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 2365 MOI.getFailureOrdering() == AtomicOrdering::Acquire || 2366 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { 2367 Changed |= CC->insertWait(MI, MOI.getScope(), 2368 MOI.getInstrAddrSpace(), 2369 isAtomicRet(*MI) ? SIMemOp::LOAD : 2370 SIMemOp::STORE, 2371 MOI.getIsCrossAddressSpaceOrdering(), 2372 Position::AFTER); 2373 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2374 MOI.getOrderingAddrSpace(), 2375 Position::AFTER); 2376 } 2377 2378 return Changed; 2379 } 2380 2381 return Changed; 2382 } 2383 2384 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { 2385 bool Changed = false; 2386 2387 SIMemOpAccess MOA(MF); 2388 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>()); 2389 2390 for (auto &MBB : MF) { 2391 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { 2392 2393 // Unbundle instructions after the post-RA scheduler. 
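      // Unbundling lets each memory operation be examined individually so that
      // cache policy bits and waits can be attached to it below.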
2394 if (MI->isBundle() && MI->mayLoadOrStore()) { 2395 MachineBasicBlock::instr_iterator II(MI->getIterator()); 2396 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); 2397 I != E && I->isBundledWithPred(); ++I) { 2398 I->unbundleFromPred(); 2399 for (MachineOperand &MO : I->operands()) 2400 if (MO.isReg()) 2401 MO.setIsInternalRead(false); 2402 } 2403 2404 MI->eraseFromParent(); 2405 MI = II->getIterator(); 2406 } 2407 2408 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) 2409 continue; 2410 2411 if (const auto &MOI = MOA.getLoadInfo(MI)) 2412 Changed |= expandLoad(*MOI, MI); 2413 else if (const auto &MOI = MOA.getStoreInfo(MI)) { 2414 Changed |= expandStore(*MOI, MI); 2415 Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); 2416 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) 2417 Changed |= expandAtomicFence(*MOI, MI); 2418 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) 2419 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); 2420 } 2421 } 2422 2423 Changed |= removeAtomicPseudoMIs(); 2424 return Changed; 2425 } 2426 2427 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false) 2428 2429 char SIMemoryLegalizer::ID = 0; 2430 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID; 2431 2432 FunctionPass *llvm::createSIMemoryLegalizerPass() { 2433 return new SIMemoryLegalizer(); 2434 } 2435