1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Memory legalizer - implements memory model. More information can be 11 /// found here: 12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPU.h" 17 #include "AMDGPUMachineModuleInfo.h" 18 #include "GCNSubtarget.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "llvm/ADT/BitmaskEnum.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/CodeGen/MachineFunctionPass.h" 23 #include "llvm/IR/DiagnosticInfo.h" 24 #include "llvm/Support/AtomicOrdering.h" 25 #include "llvm/TargetParser/TargetParser.h" 26 27 using namespace llvm; 28 using namespace llvm::AMDGPU; 29 30 #define DEBUG_TYPE "si-memory-legalizer" 31 #define PASS_NAME "SI Memory Legalizer" 32 33 static cl::opt<bool> AmdgcnSkipCacheInvalidations( 34 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, 35 cl::desc("Use this to skip inserting cache invalidating instructions.")); 36 37 namespace { 38 39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); 40 41 /// Memory operation flags. Can be ORed together. 42 enum class SIMemOp { 43 NONE = 0u, 44 LOAD = 1u << 0, 45 STORE = 1u << 1, 46 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) 47 }; 48 49 /// Position to insert a new instruction relative to an existing 50 /// instruction. 51 enum class Position { 52 BEFORE, 53 AFTER 54 }; 55 56 /// The atomic synchronization scopes supported by the AMDGPU target. 57 enum class SIAtomicScope { 58 NONE, 59 SINGLETHREAD, 60 WAVEFRONT, 61 WORKGROUP, 62 AGENT, 63 SYSTEM 64 }; 65 66 /// The distinct address spaces supported by the AMDGPU target for 67 /// atomic memory operation. Can be ORed together. 68 enum class SIAtomicAddrSpace { 69 NONE = 0u, 70 GLOBAL = 1u << 0, 71 LDS = 1u << 1, 72 SCRATCH = 1u << 2, 73 GDS = 1u << 3, 74 OTHER = 1u << 4, 75 76 /// The address spaces that can be accessed by a FLAT instruction. 77 FLAT = GLOBAL | LDS | SCRATCH, 78 79 /// The address spaces that support atomic instructions. 80 ATOMIC = GLOBAL | LDS | SCRATCH | GDS, 81 82 /// All address spaces. 
83 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER, 84 85 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) 86 }; 87 88 class SIMemOpInfo final { 89 private: 90 91 friend class SIMemOpAccess; 92 93 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 94 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 95 SIAtomicScope Scope = SIAtomicScope::SYSTEM; 96 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 97 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 98 bool IsCrossAddressSpaceOrdering = false; 99 bool IsVolatile = false; 100 bool IsNonTemporal = false; 101 102 SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, 103 SIAtomicScope Scope = SIAtomicScope::SYSTEM, 104 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, 105 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, 106 bool IsCrossAddressSpaceOrdering = true, 107 AtomicOrdering FailureOrdering = 108 AtomicOrdering::SequentiallyConsistent, 109 bool IsVolatile = false, 110 bool IsNonTemporal = false) 111 : Ordering(Ordering), FailureOrdering(FailureOrdering), 112 Scope(Scope), OrderingAddrSpace(OrderingAddrSpace), 113 InstrAddrSpace(InstrAddrSpace), 114 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), 115 IsVolatile(IsVolatile), 116 IsNonTemporal(IsNonTemporal) { 117 118 if (Ordering == AtomicOrdering::NotAtomic) { 119 assert(Scope == SIAtomicScope::NONE && 120 OrderingAddrSpace == SIAtomicAddrSpace::NONE && 121 !IsCrossAddressSpaceOrdering && 122 FailureOrdering == AtomicOrdering::NotAtomic); 123 return; 124 } 125 126 assert(Scope != SIAtomicScope::NONE && 127 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != 128 SIAtomicAddrSpace::NONE && 129 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) != 130 SIAtomicAddrSpace::NONE); 131 132 // There is also no cross address space ordering if the ordering 133 // address space is the same as the instruction address space and 134 // only contains a single address space. 135 if ((OrderingAddrSpace == InstrAddrSpace) && 136 isPowerOf2_32(uint32_t(InstrAddrSpace))) 137 this->IsCrossAddressSpaceOrdering = false; 138 139 // Limit the scope to the maximum supported by the instruction's address 140 // spaces. 141 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) == 142 SIAtomicAddrSpace::NONE) { 143 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD); 144 } else if ((InstrAddrSpace & 145 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) == 146 SIAtomicAddrSpace::NONE) { 147 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP); 148 } else if ((InstrAddrSpace & 149 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS | 150 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) { 151 this->Scope = std::min(Scope, SIAtomicScope::AGENT); 152 } 153 } 154 155 public: 156 /// \returns Atomic synchronization scope of the machine instruction used to 157 /// create this SIMemOpInfo. 158 SIAtomicScope getScope() const { 159 return Scope; 160 } 161 162 /// \returns Ordering constraint of the machine instruction used to 163 /// create this SIMemOpInfo. 164 AtomicOrdering getOrdering() const { 165 return Ordering; 166 } 167 168 /// \returns Failure ordering constraint of the machine instruction used to 169 /// create this SIMemOpInfo. 170 AtomicOrdering getFailureOrdering() const { 171 return FailureOrdering; 172 } 173 174 /// \returns The address spaces be accessed by the machine 175 /// instruction used to create this SIMemOpInfo. 
176 SIAtomicAddrSpace getInstrAddrSpace() const { 177 return InstrAddrSpace; 178 } 179 180 /// \returns The address spaces that must be ordered by the machine 181 /// instruction used to create this SIMemOpInfo. 182 SIAtomicAddrSpace getOrderingAddrSpace() const { 183 return OrderingAddrSpace; 184 } 185 186 /// \returns Return true iff memory ordering of operations on 187 /// different address spaces is required. 188 bool getIsCrossAddressSpaceOrdering() const { 189 return IsCrossAddressSpaceOrdering; 190 } 191 192 /// \returns True if memory access of the machine instruction used to 193 /// create this SIMemOpInfo is volatile, false otherwise. 194 bool isVolatile() const { 195 return IsVolatile; 196 } 197 198 /// \returns True if memory access of the machine instruction used to 199 /// create this SIMemOpInfo is nontemporal, false otherwise. 200 bool isNonTemporal() const { 201 return IsNonTemporal; 202 } 203 204 /// \returns True if ordering constraint of the machine instruction used to 205 /// create this SIMemOpInfo is unordered or higher, false otherwise. 206 bool isAtomic() const { 207 return Ordering != AtomicOrdering::NotAtomic; 208 } 209 210 }; 211 212 class SIMemOpAccess final { 213 private: 214 AMDGPUMachineModuleInfo *MMI = nullptr; 215 216 /// Reports unsupported message \p Msg for \p MI to LLVM context. 217 void reportUnsupported(const MachineBasicBlock::iterator &MI, 218 const char *Msg) const; 219 220 /// Inspects the target synchronization scope \p SSID and determines 221 /// the SI atomic scope it corresponds to, the address spaces it 222 /// covers, and whether the memory ordering applies between address 223 /// spaces. 224 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 225 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const; 226 227 /// \return Return a bit set of the address spaces accessed by \p AS. 228 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; 229 230 /// \returns Info constructed from \p MI, which has at least machine memory 231 /// operand. 232 std::optional<SIMemOpInfo> 233 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const; 234 235 public: 236 /// Construct class to support accessing the machine memory operands 237 /// of instructions in the machine function \p MF. 238 SIMemOpAccess(MachineFunction &MF); 239 240 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise. 241 std::optional<SIMemOpInfo> 242 getLoadInfo(const MachineBasicBlock::iterator &MI) const; 243 244 /// \returns Store info if \p MI is a store operation, "std::nullopt" 245 /// otherwise. 246 std::optional<SIMemOpInfo> 247 getStoreInfo(const MachineBasicBlock::iterator &MI) const; 248 249 /// \returns Atomic fence info if \p MI is an atomic fence operation, 250 /// "std::nullopt" otherwise. 251 std::optional<SIMemOpInfo> 252 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const; 253 254 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or 255 /// rmw operation, "std::nullopt" otherwise. 256 std::optional<SIMemOpInfo> 257 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const; 258 }; 259 260 class SICacheControl { 261 protected: 262 263 /// AMDGPU subtarget info. 264 const GCNSubtarget &ST; 265 266 /// Instruction info. 267 const SIInstrInfo *TII = nullptr; 268 269 IsaVersion IV; 270 271 /// Whether to insert cache invalidating instructions. 
272 bool InsertCacheInv; 273 274 SICacheControl(const GCNSubtarget &ST); 275 276 /// Sets named bit \p BitName to "true" if present in instruction \p MI. 277 /// \returns Returns true if \p MI is modified, false otherwise. 278 bool enableNamedBit(const MachineBasicBlock::iterator MI, 279 AMDGPU::CPol::CPol Bit) const; 280 281 public: 282 283 /// Create a cache control for the subtarget \p ST. 284 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); 285 286 /// Update \p MI memory load instruction to bypass any caches up to 287 /// the \p Scope memory scope for address spaces \p 288 /// AddrSpace. Return true iff the instruction was modified. 289 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 290 SIAtomicScope Scope, 291 SIAtomicAddrSpace AddrSpace) const = 0; 292 293 /// Update \p MI memory store instruction to bypass any caches up to 294 /// the \p Scope memory scope for address spaces \p 295 /// AddrSpace. Return true iff the instruction was modified. 296 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 297 SIAtomicScope Scope, 298 SIAtomicAddrSpace AddrSpace) const = 0; 299 300 /// Update \p MI memory read-modify-write instruction to bypass any caches up 301 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true 302 /// iff the instruction was modified. 303 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 304 SIAtomicScope Scope, 305 SIAtomicAddrSpace AddrSpace) const = 0; 306 307 /// Update \p MI memory instruction of kind \p Op associated with address 308 /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return 309 /// true iff the instruction was modified. 310 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 311 SIAtomicAddrSpace AddrSpace, 312 SIMemOp Op, bool IsVolatile, 313 bool IsNonTemporal) const = 0; 314 315 /// Inserts any necessary instructions at position \p Pos relative 316 /// to instruction \p MI to ensure memory instructions before \p Pos of kind 317 /// \p Op associated with address spaces \p AddrSpace have completed. Used 318 /// between memory instructions to enforce the order they become visible as 319 /// observed by other memory instructions executing in memory scope \p Scope. 320 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between 321 /// address spaces. Returns true iff any instructions inserted. 322 virtual bool insertWait(MachineBasicBlock::iterator &MI, 323 SIAtomicScope Scope, 324 SIAtomicAddrSpace AddrSpace, 325 SIMemOp Op, 326 bool IsCrossAddrSpaceOrdering, 327 Position Pos) const = 0; 328 329 /// Inserts any necessary instructions at position \p Pos relative to 330 /// instruction \p MI to ensure any subsequent memory instructions of this 331 /// thread with address spaces \p AddrSpace will observe the previous memory 332 /// operations by any thread for memory scopes up to memory scope \p Scope . 333 /// Returns true iff any instructions inserted. 334 virtual bool insertAcquire(MachineBasicBlock::iterator &MI, 335 SIAtomicScope Scope, 336 SIAtomicAddrSpace AddrSpace, 337 Position Pos) const = 0; 338 339 /// Inserts any necessary instructions at position \p Pos relative to 340 /// instruction \p MI to ensure previous memory instructions by this thread 341 /// with address spaces \p AddrSpace have completed and can be observed by 342 /// subsequent memory instructions by any thread executing in memory scope \p 343 /// Scope. 
\p IsCrossAddrSpaceOrdering indicates if the memory ordering is 344 /// between address spaces. Returns true iff any instructions inserted. 345 virtual bool insertRelease(MachineBasicBlock::iterator &MI, 346 SIAtomicScope Scope, 347 SIAtomicAddrSpace AddrSpace, 348 bool IsCrossAddrSpaceOrdering, 349 Position Pos) const = 0; 350 351 /// Virtual destructor to allow derivations to be deleted. 352 virtual ~SICacheControl() = default; 353 354 virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, 355 MachineBasicBlock::iterator &MI) const { 356 return false; 357 } 358 }; 359 360 class SIGfx6CacheControl : public SICacheControl { 361 protected: 362 363 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI 364 /// is modified, false otherwise. 365 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { 366 return enableNamedBit(MI, AMDGPU::CPol::GLC); 367 } 368 369 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI 370 /// is modified, false otherwise. 371 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { 372 return enableNamedBit(MI, AMDGPU::CPol::SLC); 373 } 374 375 public: 376 377 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} 378 379 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 380 SIAtomicScope Scope, 381 SIAtomicAddrSpace AddrSpace) const override; 382 383 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 384 SIAtomicScope Scope, 385 SIAtomicAddrSpace AddrSpace) const override; 386 387 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 388 SIAtomicScope Scope, 389 SIAtomicAddrSpace AddrSpace) const override; 390 391 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 392 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 393 bool IsVolatile, 394 bool IsNonTemporal) const override; 395 396 bool insertWait(MachineBasicBlock::iterator &MI, 397 SIAtomicScope Scope, 398 SIAtomicAddrSpace AddrSpace, 399 SIMemOp Op, 400 bool IsCrossAddrSpaceOrdering, 401 Position Pos) const override; 402 403 bool insertAcquire(MachineBasicBlock::iterator &MI, 404 SIAtomicScope Scope, 405 SIAtomicAddrSpace AddrSpace, 406 Position Pos) const override; 407 408 bool insertRelease(MachineBasicBlock::iterator &MI, 409 SIAtomicScope Scope, 410 SIAtomicAddrSpace AddrSpace, 411 bool IsCrossAddrSpaceOrdering, 412 Position Pos) const override; 413 }; 414 415 class SIGfx7CacheControl : public SIGfx6CacheControl { 416 public: 417 418 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} 419 420 bool insertAcquire(MachineBasicBlock::iterator &MI, 421 SIAtomicScope Scope, 422 SIAtomicAddrSpace AddrSpace, 423 Position Pos) const override; 424 425 }; 426 427 class SIGfx90ACacheControl : public SIGfx7CacheControl { 428 public: 429 430 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} 431 432 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 433 SIAtomicScope Scope, 434 SIAtomicAddrSpace AddrSpace) const override; 435 436 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 437 SIAtomicScope Scope, 438 SIAtomicAddrSpace AddrSpace) const override; 439 440 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 441 SIAtomicScope Scope, 442 SIAtomicAddrSpace AddrSpace) const override; 443 444 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 445 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 446 bool IsVolatile, 447 bool IsNonTemporal) const override; 448 449 bool insertWait(MachineBasicBlock::iterator &MI, 
450 SIAtomicScope Scope, 451 SIAtomicAddrSpace AddrSpace, 452 SIMemOp Op, 453 bool IsCrossAddrSpaceOrdering, 454 Position Pos) const override; 455 456 bool insertAcquire(MachineBasicBlock::iterator &MI, 457 SIAtomicScope Scope, 458 SIAtomicAddrSpace AddrSpace, 459 Position Pos) const override; 460 461 bool insertRelease(MachineBasicBlock::iterator &MI, 462 SIAtomicScope Scope, 463 SIAtomicAddrSpace AddrSpace, 464 bool IsCrossAddrSpaceOrdering, 465 Position Pos) const override; 466 }; 467 468 class SIGfx940CacheControl : public SIGfx90ACacheControl { 469 protected: 470 471 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI 472 /// is modified, false otherwise. 473 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { 474 return enableNamedBit(MI, AMDGPU::CPol::SC0); 475 } 476 477 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI 478 /// is modified, false otherwise. 479 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { 480 return enableNamedBit(MI, AMDGPU::CPol::SC1); 481 } 482 483 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI 484 /// is modified, false otherwise. 485 bool enableNTBit(const MachineBasicBlock::iterator &MI) const { 486 return enableNamedBit(MI, AMDGPU::CPol::NT); 487 } 488 489 public: 490 491 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; 492 493 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 494 SIAtomicScope Scope, 495 SIAtomicAddrSpace AddrSpace) const override; 496 497 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 498 SIAtomicScope Scope, 499 SIAtomicAddrSpace AddrSpace) const override; 500 501 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 502 SIAtomicScope Scope, 503 SIAtomicAddrSpace AddrSpace) const override; 504 505 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 506 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 507 bool IsVolatile, 508 bool IsNonTemporal) const override; 509 510 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 511 SIAtomicAddrSpace AddrSpace, Position Pos) const override; 512 513 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 514 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, 515 Position Pos) const override; 516 517 bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, 518 MachineBasicBlock::iterator &MI) const override { 519 bool Changed = false; 520 if (ST.hasForceStoreSC0SC1() && 521 (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH | 522 SIAtomicAddrSpace::GLOBAL | 523 SIAtomicAddrSpace::OTHER)) != 524 SIAtomicAddrSpace::NONE) { 525 Changed |= enableSC0Bit(MI); 526 Changed |= enableSC1Bit(MI); 527 } 528 return Changed; 529 } 530 }; 531 532 class SIGfx10CacheControl : public SIGfx7CacheControl { 533 protected: 534 535 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI 536 /// is modified, false otherwise. 
537 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { 538 return enableNamedBit(MI, AMDGPU::CPol::DLC); 539 } 540 541 public: 542 543 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} 544 545 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 546 SIAtomicScope Scope, 547 SIAtomicAddrSpace AddrSpace) const override; 548 549 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 550 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 551 bool IsVolatile, 552 bool IsNonTemporal) const override; 553 554 bool insertWait(MachineBasicBlock::iterator &MI, 555 SIAtomicScope Scope, 556 SIAtomicAddrSpace AddrSpace, 557 SIMemOp Op, 558 bool IsCrossAddrSpaceOrdering, 559 Position Pos) const override; 560 561 bool insertAcquire(MachineBasicBlock::iterator &MI, 562 SIAtomicScope Scope, 563 SIAtomicAddrSpace AddrSpace, 564 Position Pos) const override; 565 }; 566 567 class SIGfx11CacheControl : public SIGfx10CacheControl { 568 public: 569 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} 570 571 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 572 SIAtomicScope Scope, 573 SIAtomicAddrSpace AddrSpace) const override; 574 575 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 576 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 577 bool IsVolatile, 578 bool IsNonTemporal) const override; 579 }; 580 581 class SIMemoryLegalizer final : public MachineFunctionPass { 582 private: 583 584 /// Cache Control. 585 std::unique_ptr<SICacheControl> CC = nullptr; 586 587 /// List of atomic pseudo instructions. 588 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; 589 590 /// Return true iff instruction \p MI is an atomic instruction that 591 /// returns a result. 592 bool isAtomicRet(const MachineInstr &MI) const { 593 return SIInstrInfo::isAtomicRet(MI); 594 } 595 596 /// Removes all processed atomic pseudo instructions from the current 597 /// function. Returns true if the current function is modified, false otherwise. 598 bool removeAtomicPseudoMIs(); 599 600 /// Expands load operation \p MI. Returns true if instructions are 601 /// added/deleted or \p MI is modified, false otherwise. 602 bool expandLoad(const SIMemOpInfo &MOI, 603 MachineBasicBlock::iterator &MI); 604 /// Expands store operation \p MI. Returns true if instructions are 605 /// added/deleted or \p MI is modified, false otherwise. 606 bool expandStore(const SIMemOpInfo &MOI, 607 MachineBasicBlock::iterator &MI); 608 /// Expands atomic fence operation \p MI. Returns true if 609 /// instructions are added/deleted or \p MI is modified, false otherwise. 610 bool expandAtomicFence(const SIMemOpInfo &MOI, 611 MachineBasicBlock::iterator &MI); 612 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if 613 /// instructions are added/deleted or \p MI is modified, false otherwise.
614 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 615 MachineBasicBlock::iterator &MI); 616 617 public: 618 static char ID; 619 620 SIMemoryLegalizer() : MachineFunctionPass(ID) {} 621 622 void getAnalysisUsage(AnalysisUsage &AU) const override { 623 AU.setPreservesCFG(); 624 MachineFunctionPass::getAnalysisUsage(AU); 625 } 626 627 StringRef getPassName() const override { 628 return PASS_NAME; 629 } 630 631 bool runOnMachineFunction(MachineFunction &MF) override; 632 }; 633 634 } // end namespace anonymous 635 636 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, 637 const char *Msg) const { 638 const Function &Func = MI->getParent()->getParent()->getFunction(); 639 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc()); 640 Func.getContext().diagnose(Diag); 641 } 642 643 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 644 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, 645 SIAtomicAddrSpace InstrAddrSpace) const { 646 if (SSID == SyncScope::System) 647 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true); 648 if (SSID == MMI->getAgentSSID()) 649 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true); 650 if (SSID == MMI->getWorkgroupSSID()) 651 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC, 652 true); 653 if (SSID == MMI->getWavefrontSSID()) 654 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC, 655 true); 656 if (SSID == SyncScope::SingleThread) 657 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC, 658 true); 659 if (SSID == MMI->getSystemOneAddressSpaceSSID()) 660 return std::tuple(SIAtomicScope::SYSTEM, 661 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 662 if (SSID == MMI->getAgentOneAddressSpaceSSID()) 663 return std::tuple(SIAtomicScope::AGENT, 664 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 665 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) 666 return std::tuple(SIAtomicScope::WORKGROUP, 667 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 668 if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) 669 return std::tuple(SIAtomicScope::WAVEFRONT, 670 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 671 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) 672 return std::tuple(SIAtomicScope::SINGLETHREAD, 673 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 674 return std::nullopt; 675 } 676 677 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { 678 if (AS == AMDGPUAS::FLAT_ADDRESS) 679 return SIAtomicAddrSpace::FLAT; 680 if (AS == AMDGPUAS::GLOBAL_ADDRESS) 681 return SIAtomicAddrSpace::GLOBAL; 682 if (AS == AMDGPUAS::LOCAL_ADDRESS) 683 return SIAtomicAddrSpace::LDS; 684 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 685 return SIAtomicAddrSpace::SCRATCH; 686 if (AS == AMDGPUAS::REGION_ADDRESS) 687 return SIAtomicAddrSpace::GDS; 688 689 return SIAtomicAddrSpace::OTHER; 690 } 691 692 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) { 693 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); 694 } 695 696 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( 697 const MachineBasicBlock::iterator &MI) const { 698 assert(MI->getNumMemOperands() > 0); 699 700 SyncScope::ID SSID = SyncScope::SingleThread; 701 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 702 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 703 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 704 bool IsNonTemporal = true; 705 bool IsVolatile = false; 706 707 // Validator 
should check whether or not MMOs cover the entire set of 708 // locations accessed by the memory instruction. 709 for (const auto &MMO : MI->memoperands()) { 710 IsNonTemporal &= MMO->isNonTemporal(); 711 IsVolatile |= MMO->isVolatile(); 712 InstrAddrSpace |= 713 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); 714 AtomicOrdering OpOrdering = MMO->getSuccessOrdering(); 715 if (OpOrdering != AtomicOrdering::NotAtomic) { 716 const auto &IsSyncScopeInclusion = 717 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); 718 if (!IsSyncScopeInclusion) { 719 reportUnsupported(MI, 720 "Unsupported non-inclusive atomic synchronization scope"); 721 return std::nullopt; 722 } 723 724 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID(); 725 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering); 726 assert(MMO->getFailureOrdering() != AtomicOrdering::Release && 727 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); 728 FailureOrdering = 729 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering()); 730 } 731 } 732 733 SIAtomicScope Scope = SIAtomicScope::NONE; 734 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 735 bool IsCrossAddressSpaceOrdering = false; 736 if (Ordering != AtomicOrdering::NotAtomic) { 737 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); 738 if (!ScopeOrNone) { 739 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 740 return std::nullopt; 741 } 742 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 743 *ScopeOrNone; 744 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 745 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || 746 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { 747 reportUnsupported(MI, "Unsupported atomic address space"); 748 return std::nullopt; 749 } 750 } 751 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, 752 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, 753 IsNonTemporal); 754 } 755 756 std::optional<SIMemOpInfo> 757 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const { 758 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 759 760 if (!(MI->mayLoad() && !MI->mayStore())) 761 return std::nullopt; 762 763 // Be conservative if there are no memory operands. 764 if (MI->getNumMemOperands() == 0) 765 return SIMemOpInfo(); 766 767 return constructFromMIWithMMO(MI); 768 } 769 770 std::optional<SIMemOpInfo> 771 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const { 772 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 773 774 if (!(!MI->mayLoad() && MI->mayStore())) 775 return std::nullopt; 776 777 // Be conservative if there are no memory operands. 
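// (Note: a default-constructed SIMemOpInfo is the most conservative assumption: sequentially consistent ordering, system scope, ordering over all atomic address spaces, and cross-address-space ordering; see the SIMemOpInfo constructor defaults above.)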
778 if (MI->getNumMemOperands() == 0) 779 return SIMemOpInfo(); 780 781 return constructFromMIWithMMO(MI); 782 } 783 784 std::optional<SIMemOpInfo> 785 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { 786 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 787 788 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) 789 return std::nullopt; 790 791 AtomicOrdering Ordering = 792 static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); 793 794 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); 795 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC); 796 if (!ScopeOrNone) { 797 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 798 return std::nullopt; 799 } 800 801 SIAtomicScope Scope = SIAtomicScope::NONE; 802 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 803 bool IsCrossAddressSpaceOrdering = false; 804 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 805 *ScopeOrNone; 806 807 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 808 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { 809 reportUnsupported(MI, "Unsupported atomic address space"); 810 return std::nullopt; 811 } 812 813 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, 814 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); 815 } 816 817 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( 818 const MachineBasicBlock::iterator &MI) const { 819 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 820 821 if (!(MI->mayLoad() && MI->mayStore())) 822 return std::nullopt; 823 824 // Be conservative if there are no memory operands. 825 if (MI->getNumMemOperands() == 0) 826 return SIMemOpInfo(); 827 828 return constructFromMIWithMMO(MI); 829 } 830 831 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { 832 TII = ST.getInstrInfo(); 833 IV = getIsaVersion(ST.getCPU()); 834 InsertCacheInv = !AmdgcnSkipCacheInvalidations; 835 } 836 837 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, 838 AMDGPU::CPol::CPol Bit) const { 839 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); 840 if (!CPol) 841 return false; 842 843 CPol->setImm(CPol->getImm() | Bit); 844 return true; 845 } 846 847 /* static */ 848 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { 849 GCNSubtarget::Generation Generation = ST.getGeneration(); 850 if (ST.hasGFX940Insts()) 851 return std::make_unique<SIGfx940CacheControl>(ST); 852 if (ST.hasGFX90AInsts()) 853 return std::make_unique<SIGfx90ACacheControl>(ST); 854 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) 855 return std::make_unique<SIGfx6CacheControl>(ST); 856 if (Generation < AMDGPUSubtarget::GFX10) 857 return std::make_unique<SIGfx7CacheControl>(ST); 858 if (Generation < AMDGPUSubtarget::GFX11) 859 return std::make_unique<SIGfx10CacheControl>(ST); 860 return std::make_unique<SIGfx11CacheControl>(ST); 861 } 862 863 bool SIGfx6CacheControl::enableLoadCacheBypass( 864 const MachineBasicBlock::iterator &MI, 865 SIAtomicScope Scope, 866 SIAtomicAddrSpace AddrSpace) const { 867 assert(MI->mayLoad() && !MI->mayStore()); 868 bool Changed = false; 869 870 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 871 switch (Scope) { 872 case SIAtomicScope::SYSTEM: 873 case SIAtomicScope::AGENT: 874 // Set L1 cache policy to MISS_EVICT. 875 // Note: there is no L2 cache bypass policy at the ISA level. 
876 Changed |= enableGLCBit(MI); 877 break; 878 case SIAtomicScope::WORKGROUP: 879 case SIAtomicScope::WAVEFRONT: 880 case SIAtomicScope::SINGLETHREAD: 881 // No cache to bypass. 882 break; 883 default: 884 llvm_unreachable("Unsupported synchronization scope"); 885 } 886 } 887 888 /// The scratch address space does not need the global memory caches 889 /// to be bypassed as all memory operations by the same thread are 890 /// sequentially consistent, and no other thread can access scratch 891 /// memory. 892 893 /// Other address spaces do not have a cache. 894 895 return Changed; 896 } 897 898 bool SIGfx6CacheControl::enableStoreCacheBypass( 899 const MachineBasicBlock::iterator &MI, 900 SIAtomicScope Scope, 901 SIAtomicAddrSpace AddrSpace) const { 902 assert(!MI->mayLoad() && MI->mayStore()); 903 bool Changed = false; 904 905 /// The L1 cache is write-through, so it does not need to be bypassed. There is 906 /// no bypass control for the L2 cache at the ISA level. 907 908 return Changed; 909 } 910 911 bool SIGfx6CacheControl::enableRMWCacheBypass( 912 const MachineBasicBlock::iterator &MI, 913 SIAtomicScope Scope, 914 SIAtomicAddrSpace AddrSpace) const { 915 assert(MI->mayLoad() && MI->mayStore()); 916 bool Changed = false; 917 918 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically 919 /// bypassed, and the GLC bit is instead used to indicate if they are 920 /// return or no-return. 921 /// Note: there is no L2 cache coherent bypass control at the ISA level. 922 923 return Changed; 924 } 925 926 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( 927 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 928 bool IsVolatile, bool IsNonTemporal) const { 929 // Only handle load and store, not atomic read-modify-write instructions. The 930 // latter use glc to indicate if the atomic returns a result and so it must 931 // not be used for cache control. 932 assert(MI->mayLoad() ^ MI->mayStore()); 933 934 // Only update load and store, not LLVM IR atomic read-modify-write 935 // instructions. The latter are always marked as volatile, so they cannot 936 // sensibly be handled here without pessimizing all atomics. Also they do not support 937 // the nontemporal attribute. 938 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 939 940 bool Changed = false; 941 942 if (IsVolatile) { 943 // Set L1 cache policy to be MISS_EVICT for load instructions 944 // and MISS_LRU for store instructions. 945 // Note: there is no L2 cache bypass policy at the ISA level. 946 if (Op == SIMemOp::LOAD) 947 Changed |= enableGLCBit(MI); 948 949 // Ensure operation has completed at system scope to cause all volatile 950 // operations to be visible outside the program in a global order. Do not 951 // request cross address space as only the global address space can be 952 // observable outside the program, so no need to cause a waitcnt for LDS 953 // address space operations. 954 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 955 Position::AFTER); 956 957 return Changed; 958 } 959 960 if (IsNonTemporal) { 961 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT 962 // for both loads and stores, and the L2 cache policy to STREAM.
963 Changed |= enableGLCBit(MI); 964 Changed |= enableSLCBit(MI); 965 return Changed; 966 } 967 968 return Changed; 969 } 970 971 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, 972 SIAtomicScope Scope, 973 SIAtomicAddrSpace AddrSpace, 974 SIMemOp Op, 975 bool IsCrossAddrSpaceOrdering, 976 Position Pos) const { 977 bool Changed = false; 978 979 MachineBasicBlock &MBB = *MI->getParent(); 980 DebugLoc DL = MI->getDebugLoc(); 981 982 if (Pos == Position::AFTER) 983 ++MI; 984 985 bool VMCnt = false; 986 bool LGKMCnt = false; 987 988 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 989 SIAtomicAddrSpace::NONE) { 990 switch (Scope) { 991 case SIAtomicScope::SYSTEM: 992 case SIAtomicScope::AGENT: 993 VMCnt |= true; 994 break; 995 case SIAtomicScope::WORKGROUP: 996 case SIAtomicScope::WAVEFRONT: 997 case SIAtomicScope::SINGLETHREAD: 998 // The L1 cache keeps all memory operations in order for 999 // wavefronts in the same work-group. 1000 break; 1001 default: 1002 llvm_unreachable("Unsupported synchronization scope"); 1003 } 1004 } 1005 1006 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 1007 switch (Scope) { 1008 case SIAtomicScope::SYSTEM: 1009 case SIAtomicScope::AGENT: 1010 case SIAtomicScope::WORKGROUP: 1011 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 1012 // not needed as LDS operations for all waves are executed in a total 1013 // global ordering as observed by all waves. Required if also 1014 // synchronizing with global/GDS memory as LDS operations could be 1015 // reordered with respect to later global/GDS memory operations of the 1016 // same wave. 1017 LGKMCnt |= IsCrossAddrSpaceOrdering; 1018 break; 1019 case SIAtomicScope::WAVEFRONT: 1020 case SIAtomicScope::SINGLETHREAD: 1021 // The LDS keeps all memory operations in order for 1022 // the same wavefront. 1023 break; 1024 default: 1025 llvm_unreachable("Unsupported synchronization scope"); 1026 } 1027 } 1028 1029 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 1030 switch (Scope) { 1031 case SIAtomicScope::SYSTEM: 1032 case SIAtomicScope::AGENT: 1033 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" 1034 // is not needed as GDS operations for all waves are executed in a total 1035 // global ordering as observed by all waves. Required if also 1036 // synchronizing with global/LDS memory as GDS operations could be 1037 // reordered with respect to later global/LDS memory operations of the 1038 // same wave. 1039 LGKMCnt |= IsCrossAddrSpaceOrdering; 1040 break; 1041 case SIAtomicScope::WORKGROUP: 1042 case SIAtomicScope::WAVEFRONT: 1043 case SIAtomicScope::SINGLETHREAD: 1044 // The GDS keeps all memory operations in order for 1045 // the same work-group. 1046 break; 1047 default: 1048 llvm_unreachable("Unsupported synchronization scope"); 1049 } 1050 } 1051 1052 if (VMCnt || LGKMCnt) { 1053 unsigned WaitCntImmediate = 1054 AMDGPU::encodeWaitcnt(IV, 1055 VMCnt ? 0 : getVmcntBitMask(IV), 1056 getExpcntBitMask(IV), 1057 LGKMCnt ? 
0 : getLgkmcntBitMask(IV)); 1058 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); 1059 Changed = true; 1060 } 1061 1062 if (Pos == Position::AFTER) 1063 --MI; 1064 1065 return Changed; 1066 } 1067 1068 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1069 SIAtomicScope Scope, 1070 SIAtomicAddrSpace AddrSpace, 1071 Position Pos) const { 1072 if (!InsertCacheInv) 1073 return false; 1074 1075 bool Changed = false; 1076 1077 MachineBasicBlock &MBB = *MI->getParent(); 1078 DebugLoc DL = MI->getDebugLoc(); 1079 1080 if (Pos == Position::AFTER) 1081 ++MI; 1082 1083 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1084 switch (Scope) { 1085 case SIAtomicScope::SYSTEM: 1086 case SIAtomicScope::AGENT: 1087 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); 1088 Changed = true; 1089 break; 1090 case SIAtomicScope::WORKGROUP: 1091 case SIAtomicScope::WAVEFRONT: 1092 case SIAtomicScope::SINGLETHREAD: 1093 // No cache to invalidate. 1094 break; 1095 default: 1096 llvm_unreachable("Unsupported synchronization scope"); 1097 } 1098 } 1099 1100 /// The scratch address space does not need the global memory cache 1101 /// to be flushed as all memory operations by the same thread are 1102 /// sequentially consistent, and no other thread can access scratch 1103 /// memory. 1104 1105 /// Other address spaces do not have a cache. 1106 1107 if (Pos == Position::AFTER) 1108 --MI; 1109 1110 return Changed; 1111 } 1112 1113 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1114 SIAtomicScope Scope, 1115 SIAtomicAddrSpace AddrSpace, 1116 bool IsCrossAddrSpaceOrdering, 1117 Position Pos) const { 1118 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1119 IsCrossAddrSpaceOrdering, Pos); 1120 } 1121 1122 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1123 SIAtomicScope Scope, 1124 SIAtomicAddrSpace AddrSpace, 1125 Position Pos) const { 1126 if (!InsertCacheInv) 1127 return false; 1128 1129 bool Changed = false; 1130 1131 MachineBasicBlock &MBB = *MI->getParent(); 1132 DebugLoc DL = MI->getDebugLoc(); 1133 1134 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); 1135 1136 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() 1137 ? AMDGPU::BUFFER_WBINVL1 1138 : AMDGPU::BUFFER_WBINVL1_VOL; 1139 1140 if (Pos == Position::AFTER) 1141 ++MI; 1142 1143 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1144 switch (Scope) { 1145 case SIAtomicScope::SYSTEM: 1146 case SIAtomicScope::AGENT: 1147 BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); 1148 Changed = true; 1149 break; 1150 case SIAtomicScope::WORKGROUP: 1151 case SIAtomicScope::WAVEFRONT: 1152 case SIAtomicScope::SINGLETHREAD: 1153 // No cache to invalidate. 1154 break; 1155 default: 1156 llvm_unreachable("Unsupported synchronization scope"); 1157 } 1158 } 1159 1160 /// The scratch address space does not need the global memory cache 1161 /// to be flushed as all memory operations by the same thread are 1162 /// sequentially consistent, and no other thread can access scratch 1163 /// memory. 1164 1165 /// Other address spaces do not have a cache. 
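// (The iterator was advanced past MI above when Pos == Position::AFTER so the invalidate is emitted after the instruction; the decrement below restores the iterator for the caller.)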
1166 1167 if (Pos == Position::AFTER) 1168 --MI; 1169 1170 return Changed; 1171 } 1172 1173 bool SIGfx90ACacheControl::enableLoadCacheBypass( 1174 const MachineBasicBlock::iterator &MI, 1175 SIAtomicScope Scope, 1176 SIAtomicAddrSpace AddrSpace) const { 1177 assert(MI->mayLoad() && !MI->mayStore()); 1178 bool Changed = false; 1179 1180 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1181 switch (Scope) { 1182 case SIAtomicScope::SYSTEM: 1183 case SIAtomicScope::AGENT: 1184 // Set the L1 cache policy to MISS_LRU. 1185 // Note: there is no L2 cache bypass policy at the ISA level. 1186 Changed |= enableGLCBit(MI); 1187 break; 1188 case SIAtomicScope::WORKGROUP: 1189 // In threadgroup split mode the waves of a work-group can be executing on 1190 // different CUs. Therefore need to bypass the L1 which is per CU. 1191 // Otherwise in non-threadgroup split mode all waves of a work-group are 1192 // on the same CU, and so the L1 does not need to be bypassed. 1193 if (ST.isTgSplitEnabled()) 1194 Changed |= enableGLCBit(MI); 1195 break; 1196 case SIAtomicScope::WAVEFRONT: 1197 case SIAtomicScope::SINGLETHREAD: 1198 // No cache to bypass. 1199 break; 1200 default: 1201 llvm_unreachable("Unsupported synchronization scope"); 1202 } 1203 } 1204 1205 /// The scratch address space does not need the global memory caches 1206 /// to be bypassed as all memory operations by the same thread are 1207 /// sequentially consistent, and no other thread can access scratch 1208 /// memory. 1209 1210 /// Other address spaces do not have a cache. 1211 1212 return Changed; 1213 } 1214 1215 bool SIGfx90ACacheControl::enableStoreCacheBypass( 1216 const MachineBasicBlock::iterator &MI, 1217 SIAtomicScope Scope, 1218 SIAtomicAddrSpace AddrSpace) const { 1219 assert(!MI->mayLoad() && MI->mayStore()); 1220 bool Changed = false; 1221 1222 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1223 switch (Scope) { 1224 case SIAtomicScope::SYSTEM: 1225 case SIAtomicScope::AGENT: 1226 /// Do not set glc for store atomic operations as they implicitly write 1227 /// through the L1 cache. 1228 break; 1229 case SIAtomicScope::WORKGROUP: 1230 case SIAtomicScope::WAVEFRONT: 1231 case SIAtomicScope::SINGLETHREAD: 1232 // No cache to bypass. Store atomics implicitly write through the L1 1233 // cache. 1234 break; 1235 default: 1236 llvm_unreachable("Unsupported synchronization scope"); 1237 } 1238 } 1239 1240 /// The scratch address space does not need the global memory caches 1241 /// to be bypassed as all memory operations by the same thread are 1242 /// sequentially consistent, and no other thread can access scratch 1243 /// memory. 1244 1245 /// Other address spaces do not have a cache. 1246 1247 return Changed; 1248 } 1249 1250 bool SIGfx90ACacheControl::enableRMWCacheBypass( 1251 const MachineBasicBlock::iterator &MI, 1252 SIAtomicScope Scope, 1253 SIAtomicAddrSpace AddrSpace) const { 1254 assert(MI->mayLoad() && MI->mayStore()); 1255 bool Changed = false; 1256 1257 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1258 switch (Scope) { 1259 case SIAtomicScope::SYSTEM: 1260 case SIAtomicScope::AGENT: 1261 /// Do not set glc for RMW atomic operations as they implicitly bypass 1262 /// the L1 cache, and the glc bit is instead used to indicate if they are 1263 /// return or no-return. 1264 break; 1265 case SIAtomicScope::WORKGROUP: 1266 case SIAtomicScope::WAVEFRONT: 1267 case SIAtomicScope::SINGLETHREAD: 1268 // No cache to bypass. 
RMW atomics implicitly bypass the L1 cache. 1269 break; 1270 default: 1271 llvm_unreachable("Unsupported synchronization scope"); 1272 } 1273 } 1274 1275 return Changed; 1276 } 1277 1278 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( 1279 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 1280 bool IsVolatile, bool IsNonTemporal) const { 1281 // Only handle load and store, not atomic read-modify-write instructions. The 1282 // latter use glc to indicate if the atomic returns a result and so it must 1283 // not be used for cache control. 1284 assert(MI->mayLoad() ^ MI->mayStore()); 1285 1286 // Only update load and store, not LLVM IR atomic read-modify-write 1287 // instructions. The latter are always marked as volatile, so they cannot 1288 // sensibly be handled here without pessimizing all atomics. Also they do not support 1289 // the nontemporal attribute. 1290 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 1291 1292 bool Changed = false; 1293 1294 if (IsVolatile) { 1295 // Set L1 cache policy to be MISS_EVICT for load instructions 1296 // and MISS_LRU for store instructions. 1297 // Note: there is no L2 cache bypass policy at the ISA level. 1298 if (Op == SIMemOp::LOAD) 1299 Changed |= enableGLCBit(MI); 1300 1301 // Ensure operation has completed at system scope to cause all volatile 1302 // operations to be visible outside the program in a global order. Do not 1303 // request cross address space as only the global address space can be 1304 // observable outside the program, so no need to cause a waitcnt for LDS 1305 // address space operations. 1306 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 1307 Position::AFTER); 1308 1309 return Changed; 1310 } 1311 1312 if (IsNonTemporal) { 1313 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT 1314 // for both loads and stores, and the L2 cache policy to STREAM. 1315 Changed |= enableGLCBit(MI); 1316 Changed |= enableSLCBit(MI); 1317 return Changed; 1318 } 1319 1320 return Changed; 1321 } 1322 1323 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, 1324 SIAtomicScope Scope, 1325 SIAtomicAddrSpace AddrSpace, 1326 SIMemOp Op, 1327 bool IsCrossAddrSpaceOrdering, 1328 Position Pos) const { 1329 if (ST.isTgSplitEnabled()) { 1330 // In threadgroup split mode the waves of a work-group can be executing on 1331 // different CUs. Therefore need to wait for global or GDS memory operations 1332 // to complete to ensure they are visible to waves in the other CUs. 1333 // Otherwise in non-threadgroup split mode all waves of a work-group are on 1334 // the same CU, so no need to wait for global memory as all waves in the 1335 // work-group access the same L1, nor wait for GDS as accesses are ordered 1336 // on a CU. 1337 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | 1338 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && 1339 (Scope == SIAtomicScope::WORKGROUP)) { 1340 // Same as GFX7 using agent scope. 1341 Scope = SIAtomicScope::AGENT; 1342 } 1343 // In threadgroup split mode LDS cannot be allocated so no need to wait for 1344 // LDS memory operations.
1345 AddrSpace &= ~SIAtomicAddrSpace::LDS; 1346 } 1347 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, 1348 IsCrossAddrSpaceOrdering, Pos); 1349 } 1350 1351 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1352 SIAtomicScope Scope, 1353 SIAtomicAddrSpace AddrSpace, 1354 Position Pos) const { 1355 if (!InsertCacheInv) 1356 return false; 1357 1358 bool Changed = false; 1359 1360 MachineBasicBlock &MBB = *MI->getParent(); 1361 DebugLoc DL = MI->getDebugLoc(); 1362 1363 if (Pos == Position::AFTER) 1364 ++MI; 1365 1366 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1367 switch (Scope) { 1368 case SIAtomicScope::SYSTEM: 1369 // Ensures that following loads will not see stale remote VMEM data or 1370 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and 1371 // CC will never be stale due to the local memory probes. 1372 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); 1373 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the 1374 // hardware does not reorder memory operations by the same wave with 1375 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to 1376 // remove any cache lines of earlier writes by the same wave and ensures 1377 // later reads by the same wave will refetch the cache lines. 1378 Changed = true; 1379 break; 1380 case SIAtomicScope::AGENT: 1381 // Same as GFX7. 1382 break; 1383 case SIAtomicScope::WORKGROUP: 1384 // In threadgroup split mode the waves of a work-group can be executing on 1385 // different CUs. Therefore need to invalidate the L1 which is per CU. 1386 // Otherwise in non-threadgroup split mode all waves of a work-group are 1387 // on the same CU, and so the L1 does not need to be invalidated. 1388 if (ST.isTgSplitEnabled()) { 1389 // Same as GFX7 using agent scope. 1390 Scope = SIAtomicScope::AGENT; 1391 } 1392 break; 1393 case SIAtomicScope::WAVEFRONT: 1394 case SIAtomicScope::SINGLETHREAD: 1395 // Same as GFX7. 1396 break; 1397 default: 1398 llvm_unreachable("Unsupported synchronization scope"); 1399 } 1400 } 1401 1402 /// The scratch address space does not need the global memory cache 1403 /// to be flushed as all memory operations by the same thread are 1404 /// sequentially consistent, and no other thread can access scratch 1405 /// memory. 1406 1407 /// Other address spaces do not have a cache. 1408 1409 if (Pos == Position::AFTER) 1410 --MI; 1411 1412 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); 1413 1414 return Changed; 1415 } 1416 1417 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1418 SIAtomicScope Scope, 1419 SIAtomicAddrSpace AddrSpace, 1420 bool IsCrossAddrSpaceOrdering, 1421 Position Pos) const { 1422 bool Changed = false; 1423 1424 MachineBasicBlock &MBB = *MI->getParent(); 1425 DebugLoc DL = MI->getDebugLoc(); 1426 1427 if (Pos == Position::AFTER) 1428 ++MI; 1429 1430 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1431 switch (Scope) { 1432 case SIAtomicScope::SYSTEM: 1433 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1434 // hardware does not reorder memory operations by the same wave with 1435 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1436 // to initiate writeback of any dirty cache lines of earlier writes by the 1437 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1438 // writeback has completed. 
1439 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1440 // Set SC bits to indicate system scope. 1441 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1442 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT 1443 // vmcnt(0)" needed by the "BUFFER_WBL2". 1444 Changed = true; 1445 break; 1446 case SIAtomicScope::AGENT: 1447 case SIAtomicScope::WORKGROUP: 1448 case SIAtomicScope::WAVEFRONT: 1449 case SIAtomicScope::SINGLETHREAD: 1450 // Same as GFX7. 1451 break; 1452 default: 1453 llvm_unreachable("Unsupported synchronization scope"); 1454 } 1455 } 1456 1457 if (Pos == Position::AFTER) 1458 --MI; 1459 1460 Changed |= 1461 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, 1462 IsCrossAddrSpaceOrdering, Pos); 1463 1464 return Changed; 1465 } 1466 1467 bool SIGfx940CacheControl::enableLoadCacheBypass( 1468 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 1469 SIAtomicAddrSpace AddrSpace) const { 1470 assert(MI->mayLoad() && !MI->mayStore()); 1471 bool Changed = false; 1472 1473 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1474 switch (Scope) { 1475 case SIAtomicScope::SYSTEM: 1476 // Set SC bits to indicate system scope. 1477 Changed |= enableSC0Bit(MI); 1478 Changed |= enableSC1Bit(MI); 1479 break; 1480 case SIAtomicScope::AGENT: 1481 // Set SC bits to indicate agent scope. 1482 Changed |= enableSC1Bit(MI); 1483 break; 1484 case SIAtomicScope::WORKGROUP: 1485 // In threadgroup split mode the waves of a work-group can be executing on 1486 // different CUs. Therefore need to bypass the L1 which is per CU. 1487 // Otherwise in non-threadgroup split mode all waves of a work-group are 1488 // on the same CU, and so the L1 does not need to be bypassed. Setting SC 1489 // bits to indicate work-group scope will do this automatically. 1490 Changed |= enableSC0Bit(MI); 1491 break; 1492 case SIAtomicScope::WAVEFRONT: 1493 case SIAtomicScope::SINGLETHREAD: 1494 // Leave SC bits unset to indicate wavefront scope. 1495 break; 1496 default: 1497 llvm_unreachable("Unsupported synchronization scope"); 1498 } 1499 } 1500 1501 /// The scratch address space does not need the global memory caches 1502 /// to be bypassed as all memory operations by the same thread are 1503 /// sequentially consistent, and no other thread can access scratch 1504 /// memory. 1505 1506 /// Other address spaces do not have a cache. 1507 1508 return Changed; 1509 } 1510 1511 bool SIGfx940CacheControl::enableStoreCacheBypass( 1512 const MachineBasicBlock::iterator &MI, 1513 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { 1514 assert(!MI->mayLoad() && MI->mayStore()); 1515 bool Changed = false; 1516 1517 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1518 switch (Scope) { 1519 case SIAtomicScope::SYSTEM: 1520 // Set SC bits to indicate system scope. 1521 Changed |= enableSC0Bit(MI); 1522 Changed |= enableSC1Bit(MI); 1523 break; 1524 case SIAtomicScope::AGENT: 1525 // Set SC bits to indicate agent scope. 1526 Changed |= enableSC1Bit(MI); 1527 break; 1528 case SIAtomicScope::WORKGROUP: 1529 // Set SC bits to indicate workgroup scope. 1530 Changed |= enableSC0Bit(MI); 1531 break; 1532 case SIAtomicScope::WAVEFRONT: 1533 case SIAtomicScope::SINGLETHREAD: 1534 // Leave SC bits unset to indicate wavefront scope. 
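// (In the GFX940 load and store handlers the SC bits encode the coherence scope: SC0 and SC1 set means system, SC1 alone means agent, SC0 alone means work-group, and neither means wavefront/CU scope. RMW atomics use a different encoding; see enableRMWCacheBypass below.)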
1535 break; 1536 default: 1537 llvm_unreachable("Unsupported synchronization scope"); 1538 } 1539 } 1540 1541 /// The scratch address space does not need the global memory caches 1542 /// to be bypassed as all memory operations by the same thread are 1543 /// sequentially consistent, and no other thread can access scratch 1544 /// memory. 1545 1546 /// Other address spaces do not have a cache. 1547 1548 return Changed; 1549 } 1550 1551 bool SIGfx940CacheControl::enableRMWCacheBypass( 1552 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 1553 SIAtomicAddrSpace AddrSpace) const { 1554 assert(MI->mayLoad() && MI->mayStore()); 1555 bool Changed = false; 1556 1557 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1558 switch (Scope) { 1559 case SIAtomicScope::SYSTEM: 1560 // Set SC1 bit to indicate system scope. 1561 Changed |= enableSC1Bit(MI); 1562 break; 1563 case SIAtomicScope::AGENT: 1564 case SIAtomicScope::WORKGROUP: 1565 case SIAtomicScope::WAVEFRONT: 1566 case SIAtomicScope::SINGLETHREAD: 1567 // RMW atomic operations implicitly bypass the L1 cache and only use SC1 1568 // to indicate system or agent scope. The SC0 bit is used to indicate if 1569 // they are return or no-return. Leave SC1 bit unset to indicate agent 1570 // scope. 1571 break; 1572 default: 1573 llvm_unreachable("Unsupported synchronization scope"); 1574 } 1575 } 1576 1577 return Changed; 1578 } 1579 1580 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( 1581 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 1582 bool IsVolatile, bool IsNonTemporal) const { 1583 // Only handle load and store, not atomic read-modify-write instructions. The 1584 // latter use glc to indicate if the atomic returns a result and so it must 1585 // not be used for cache control. 1586 assert(MI->mayLoad() ^ MI->mayStore()); 1587 1588 // Only update load and store, not LLVM IR atomic read-modify-write 1589 // instructions. The latter are always marked as volatile, so they cannot 1590 // sensibly be handled here without pessimizing all atomics. Also they do not support 1591 // the nontemporal attribute. 1592 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 1593 1594 bool Changed = false; 1595 1596 if (IsVolatile) { 1597 // Set SC bits to indicate system scope. 1598 Changed |= enableSC0Bit(MI); 1599 Changed |= enableSC1Bit(MI); 1600 1601 // Ensure operation has completed at system scope to cause all volatile 1602 // operations to be visible outside the program in a global order. Do not 1603 // request cross address space as only the global address space can be 1604 // observable outside the program, so no need to cause a waitcnt for LDS 1605 // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    Changed |= enableNTBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Ensures that following loads will not see stale remote data or local
      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
      // due to the memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);
      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
      // does not reorder memory operations with respect to a preceding buffer
      // invalidate. The invalidate is guaranteed to remove any cache lines of
      // earlier writes and ensures later reads will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Ensures L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, but there is no point
        // generating it if we know we are not in that mode.
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
            // Set SC bits to indicate work-group scope.
            .addImm(AMDGPU::CPol::SC0);
        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
        // does not reorder memory operations with respect to a preceding
        // buffer invalidate. The invalidate is guaranteed to remove any cache
        // lines of earlier writes and ensures later reads will refetch the
        // cache lines.
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Could generate "BUFFER_INV" but it would do nothing as there are no
      // caches to invalidate.
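      // (Summary of the BUFFER_INV scope encoding used above, inferred from
      // this switch: sc0+sc1 = system, sc1 = agent, sc0 = work-group in
      // threadgroup split mode, and no invalidate at narrower scopes.)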
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by
      // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::SYSTEM, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);

      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::AGENT, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)".
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Do not generate "BUFFER_WBL2" as there are no caches it would write
      // back, and it would require an otherwise unnecessary
      // "S_WAITCNT vmcnt(0)".
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  // Insert the "S_WAITCNT vmcnt(0)" required by any "BUFFER_WBL2" emitted
  // above, together with any other waits this release requires.
  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
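      // (On gfx10 the glc bit makes the load bypass the per-CU L0 and the dlc
      // bit makes it bypass the L1; this mapping is assumed from the AMDGPU
      // memory-model documentation rather than restated from the ISA manual.)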
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so we cannot
  // sensibly handle them here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
    if (Op == SIMemOp::STORE)
      Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);

    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
      // and so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx11CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so we cannot
  // sensibly handle them here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Set MALL NOALLOC for load and store instructions.
    Changed |= enableDLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
    if (Op == SIMemOp::STORE)
      Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);

    // Set MALL NOALLOC for load and store instructions.
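    // (On gfx11 the dlc bit is repurposed to request MALL NOALLOC rather than
    // an L1 bypass, which is why it is also set on the nontemporal path; this
    // reading follows the comments above rather than the ISA manual.)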
    Changed |= enableDLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
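  // Illustrative sketch (hypothetical IR, not from a real test): a store such
  // as
  //   store volatile i32 %v, ptr addrspace(1) %p
  // reaches this point as a non-atomic store and is handed to the per-target
  // CacheControl below, which may set cache-policy bits on the store and
  // insert a trailing wait.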
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire)
      Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBINV*" instruction. This could be
    // done by reorganizing this code or as part of optimizing the
    // SIInsertWaitcnt pass to track cache invalidate and write back
    // instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(*MOI, MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI)) {
        Changed |= expandStore(*MOI, MI);
        Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
      } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(*MOI, MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}