1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Memory legalizer - implements memory model. More information can be 11 /// found here: 12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPU.h" 17 #include "AMDGPUMachineModuleInfo.h" 18 #include "GCNSubtarget.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "llvm/ADT/BitmaskEnum.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/CodeGen/MachineFunctionPass.h" 23 #include "llvm/IR/DiagnosticInfo.h" 24 #include "llvm/Support/AtomicOrdering.h" 25 #include "llvm/Support/TargetParser.h" 26 27 using namespace llvm; 28 using namespace llvm::AMDGPU; 29 30 #define DEBUG_TYPE "si-memory-legalizer" 31 #define PASS_NAME "SI Memory Legalizer" 32 33 static cl::opt<bool> AmdgcnSkipCacheInvalidations( 34 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, 35 cl::desc("Use this to skip inserting cache invalidating instructions.")); 36 37 namespace { 38 39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); 40 41 /// Memory operation flags. Can be ORed together. 42 enum class SIMemOp { 43 NONE = 0u, 44 LOAD = 1u << 0, 45 STORE = 1u << 1, 46 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) 47 }; 48 49 /// Position to insert a new instruction relative to an existing 50 /// instruction. 51 enum class Position { 52 BEFORE, 53 AFTER 54 }; 55 56 /// The atomic synchronization scopes supported by the AMDGPU target. 57 enum class SIAtomicScope { 58 NONE, 59 SINGLETHREAD, 60 WAVEFRONT, 61 WORKGROUP, 62 AGENT, 63 SYSTEM 64 }; 65 66 /// The distinct address spaces supported by the AMDGPU target for 67 /// atomic memory operation. Can be ORed together. 68 enum class SIAtomicAddrSpace { 69 NONE = 0u, 70 GLOBAL = 1u << 0, 71 LDS = 1u << 1, 72 SCRATCH = 1u << 2, 73 GDS = 1u << 3, 74 OTHER = 1u << 4, 75 76 /// The address spaces that can be accessed by a FLAT instruction. 77 FLAT = GLOBAL | LDS | SCRATCH, 78 79 /// The address spaces that support atomic instructions. 80 ATOMIC = GLOBAL | LDS | SCRATCH | GDS, 81 82 /// All address spaces. 
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the SI atomic address spaces corresponding to the
  /// target address space \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
272 bool InsertCacheInv; 273 274 SICacheControl(const GCNSubtarget &ST); 275 276 /// Sets named bit \p BitName to "true" if present in instruction \p MI. 277 /// \returns Returns true if \p MI is modified, false otherwise. 278 bool enableNamedBit(const MachineBasicBlock::iterator MI, 279 AMDGPU::CPol::CPol Bit) const; 280 281 public: 282 283 /// Create a cache control for the subtarget \p ST. 284 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); 285 286 /// Update \p MI memory load instruction to bypass any caches up to 287 /// the \p Scope memory scope for address spaces \p 288 /// AddrSpace. Return true iff the instruction was modified. 289 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 290 SIAtomicScope Scope, 291 SIAtomicAddrSpace AddrSpace) const = 0; 292 293 /// Update \p MI memory store instruction to bypass any caches up to 294 /// the \p Scope memory scope for address spaces \p 295 /// AddrSpace. Return true iff the instruction was modified. 296 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 297 SIAtomicScope Scope, 298 SIAtomicAddrSpace AddrSpace) const = 0; 299 300 /// Update \p MI memory read-modify-write instruction to bypass any caches up 301 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true 302 /// iff the instruction was modified. 303 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 304 SIAtomicScope Scope, 305 SIAtomicAddrSpace AddrSpace) const = 0; 306 307 /// Update \p MI memory instruction of kind \p Op associated with address 308 /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return 309 /// true iff the instruction was modified. 310 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 311 SIAtomicAddrSpace AddrSpace, 312 SIMemOp Op, bool IsVolatile, 313 bool IsNonTemporal) const = 0; 314 315 /// Inserts any necessary instructions at position \p Pos relative 316 /// to instruction \p MI to ensure memory instructions before \p Pos of kind 317 /// \p Op associated with address spaces \p AddrSpace have completed. Used 318 /// between memory instructions to enforce the order they become visible as 319 /// observed by other memory instructions executing in memory scope \p Scope. 320 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between 321 /// address spaces. Returns true iff any instructions inserted. 322 virtual bool insertWait(MachineBasicBlock::iterator &MI, 323 SIAtomicScope Scope, 324 SIAtomicAddrSpace AddrSpace, 325 SIMemOp Op, 326 bool IsCrossAddrSpaceOrdering, 327 Position Pos) const = 0; 328 329 /// Inserts any necessary instructions at position \p Pos relative to 330 /// instruction \p MI to ensure any subsequent memory instructions of this 331 /// thread with address spaces \p AddrSpace will observe the previous memory 332 /// operations by any thread for memory scopes up to memory scope \p Scope . 333 /// Returns true iff any instructions inserted. 334 virtual bool insertAcquire(MachineBasicBlock::iterator &MI, 335 SIAtomicScope Scope, 336 SIAtomicAddrSpace AddrSpace, 337 Position Pos) const = 0; 338 339 /// Inserts any necessary instructions at position \p Pos relative to 340 /// instruction \p MI to ensure previous memory instructions by this thread 341 /// with address spaces \p AddrSpace have completed and can be observed by 342 /// subsequent memory instructions by any thread executing in memory scope \p 343 /// Scope. 
\p IsCrossAddrSpaceOrdering indicates if the memory ordering is 344 /// between address spaces. Returns true iff any instructions inserted. 345 virtual bool insertRelease(MachineBasicBlock::iterator &MI, 346 SIAtomicScope Scope, 347 SIAtomicAddrSpace AddrSpace, 348 bool IsCrossAddrSpaceOrdering, 349 Position Pos) const = 0; 350 351 /// Virtual destructor to allow derivations to be deleted. 352 virtual ~SICacheControl() = default; 353 354 }; 355 356 class SIGfx6CacheControl : public SICacheControl { 357 protected: 358 359 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI 360 /// is modified, false otherwise. 361 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { 362 return enableNamedBit(MI, AMDGPU::CPol::GLC); 363 } 364 365 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI 366 /// is modified, false otherwise. 367 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { 368 return enableNamedBit(MI, AMDGPU::CPol::SLC); 369 } 370 371 public: 372 373 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} 374 375 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 376 SIAtomicScope Scope, 377 SIAtomicAddrSpace AddrSpace) const override; 378 379 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 380 SIAtomicScope Scope, 381 SIAtomicAddrSpace AddrSpace) const override; 382 383 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 384 SIAtomicScope Scope, 385 SIAtomicAddrSpace AddrSpace) const override; 386 387 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 388 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 389 bool IsVolatile, 390 bool IsNonTemporal) const override; 391 392 bool insertWait(MachineBasicBlock::iterator &MI, 393 SIAtomicScope Scope, 394 SIAtomicAddrSpace AddrSpace, 395 SIMemOp Op, 396 bool IsCrossAddrSpaceOrdering, 397 Position Pos) const override; 398 399 bool insertAcquire(MachineBasicBlock::iterator &MI, 400 SIAtomicScope Scope, 401 SIAtomicAddrSpace AddrSpace, 402 Position Pos) const override; 403 404 bool insertRelease(MachineBasicBlock::iterator &MI, 405 SIAtomicScope Scope, 406 SIAtomicAddrSpace AddrSpace, 407 bool IsCrossAddrSpaceOrdering, 408 Position Pos) const override; 409 }; 410 411 class SIGfx7CacheControl : public SIGfx6CacheControl { 412 public: 413 414 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} 415 416 bool insertAcquire(MachineBasicBlock::iterator &MI, 417 SIAtomicScope Scope, 418 SIAtomicAddrSpace AddrSpace, 419 Position Pos) const override; 420 421 }; 422 423 class SIGfx90ACacheControl : public SIGfx7CacheControl { 424 public: 425 426 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} 427 428 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 429 SIAtomicScope Scope, 430 SIAtomicAddrSpace AddrSpace) const override; 431 432 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 433 SIAtomicScope Scope, 434 SIAtomicAddrSpace AddrSpace) const override; 435 436 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 437 SIAtomicScope Scope, 438 SIAtomicAddrSpace AddrSpace) const override; 439 440 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 441 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 442 bool IsVolatile, 443 bool IsNonTemporal) const override; 444 445 bool insertWait(MachineBasicBlock::iterator &MI, 446 SIAtomicScope Scope, 447 SIAtomicAddrSpace AddrSpace, 448 SIMemOp Op, 449 bool IsCrossAddrSpaceOrdering, 450 Position Pos) 
const override; 451 452 bool insertAcquire(MachineBasicBlock::iterator &MI, 453 SIAtomicScope Scope, 454 SIAtomicAddrSpace AddrSpace, 455 Position Pos) const override; 456 457 bool insertRelease(MachineBasicBlock::iterator &MI, 458 SIAtomicScope Scope, 459 SIAtomicAddrSpace AddrSpace, 460 bool IsCrossAddrSpaceOrdering, 461 Position Pos) const override; 462 }; 463 464 class SIGfx940CacheControl : public SIGfx90ACacheControl { 465 protected: 466 467 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI 468 /// is modified, false otherwise. 469 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { 470 return enableNamedBit(MI, AMDGPU::CPol::SC0); 471 } 472 473 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI 474 /// is modified, false otherwise. 475 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { 476 return enableNamedBit(MI, AMDGPU::CPol::SC1); 477 } 478 479 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI 480 /// is modified, false otherwise. 481 bool enableNTBit(const MachineBasicBlock::iterator &MI) const { 482 return enableNamedBit(MI, AMDGPU::CPol::NT); 483 } 484 485 public: 486 487 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; 488 489 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 490 SIAtomicScope Scope, 491 SIAtomicAddrSpace AddrSpace) const override; 492 493 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 494 SIAtomicScope Scope, 495 SIAtomicAddrSpace AddrSpace) const override; 496 497 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 498 SIAtomicScope Scope, 499 SIAtomicAddrSpace AddrSpace) const override; 500 501 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 502 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 503 bool IsVolatile, 504 bool IsNonTemporal) const override; 505 506 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 507 SIAtomicAddrSpace AddrSpace, Position Pos) const override; 508 509 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 510 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, 511 Position Pos) const override; 512 }; 513 514 class SIGfx10CacheControl : public SIGfx7CacheControl { 515 protected: 516 517 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI 518 /// is modified, false otherwise. 
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
596 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 597 MachineBasicBlock::iterator &MI); 598 599 public: 600 static char ID; 601 602 SIMemoryLegalizer() : MachineFunctionPass(ID) {} 603 604 void getAnalysisUsage(AnalysisUsage &AU) const override { 605 AU.setPreservesCFG(); 606 MachineFunctionPass::getAnalysisUsage(AU); 607 } 608 609 StringRef getPassName() const override { 610 return PASS_NAME; 611 } 612 613 bool runOnMachineFunction(MachineFunction &MF) override; 614 }; 615 616 } // end namespace anonymous 617 618 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, 619 const char *Msg) const { 620 const Function &Func = MI->getParent()->getParent()->getFunction(); 621 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc()); 622 Func.getContext().diagnose(Diag); 623 } 624 625 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 626 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, 627 SIAtomicAddrSpace InstrAddrSpace) const { 628 if (SSID == SyncScope::System) 629 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true); 630 if (SSID == MMI->getAgentSSID()) 631 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true); 632 if (SSID == MMI->getWorkgroupSSID()) 633 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC, 634 true); 635 if (SSID == MMI->getWavefrontSSID()) 636 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC, 637 true); 638 if (SSID == SyncScope::SingleThread) 639 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC, 640 true); 641 if (SSID == MMI->getSystemOneAddressSpaceSSID()) 642 return std::tuple(SIAtomicScope::SYSTEM, 643 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 644 if (SSID == MMI->getAgentOneAddressSpaceSSID()) 645 return std::tuple(SIAtomicScope::AGENT, 646 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 647 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) 648 return std::tuple(SIAtomicScope::WORKGROUP, 649 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 650 if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) 651 return std::tuple(SIAtomicScope::WAVEFRONT, 652 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 653 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) 654 return std::tuple(SIAtomicScope::SINGLETHREAD, 655 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); 656 return std::nullopt; 657 } 658 659 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { 660 if (AS == AMDGPUAS::FLAT_ADDRESS) 661 return SIAtomicAddrSpace::FLAT; 662 if (AS == AMDGPUAS::GLOBAL_ADDRESS) 663 return SIAtomicAddrSpace::GLOBAL; 664 if (AS == AMDGPUAS::LOCAL_ADDRESS) 665 return SIAtomicAddrSpace::LDS; 666 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 667 return SIAtomicAddrSpace::SCRATCH; 668 if (AS == AMDGPUAS::REGION_ADDRESS) 669 return SIAtomicAddrSpace::GDS; 670 671 return SIAtomicAddrSpace::OTHER; 672 } 673 674 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) { 675 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); 676 } 677 678 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( 679 const MachineBasicBlock::iterator &MI) const { 680 assert(MI->getNumMemOperands() > 0); 681 682 SyncScope::ID SSID = SyncScope::SingleThread; 683 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 684 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 685 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 686 bool IsNonTemporal = true; 687 bool IsVolatile = false; 688 689 // Validator 
should check whether or not MMOs cover the entire set of 690 // locations accessed by the memory instruction. 691 for (const auto &MMO : MI->memoperands()) { 692 IsNonTemporal &= MMO->isNonTemporal(); 693 IsVolatile |= MMO->isVolatile(); 694 InstrAddrSpace |= 695 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); 696 AtomicOrdering OpOrdering = MMO->getSuccessOrdering(); 697 if (OpOrdering != AtomicOrdering::NotAtomic) { 698 const auto &IsSyncScopeInclusion = 699 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); 700 if (!IsSyncScopeInclusion) { 701 reportUnsupported(MI, 702 "Unsupported non-inclusive atomic synchronization scope"); 703 return std::nullopt; 704 } 705 706 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID(); 707 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering); 708 assert(MMO->getFailureOrdering() != AtomicOrdering::Release && 709 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); 710 FailureOrdering = 711 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering()); 712 } 713 } 714 715 SIAtomicScope Scope = SIAtomicScope::NONE; 716 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 717 bool IsCrossAddressSpaceOrdering = false; 718 if (Ordering != AtomicOrdering::NotAtomic) { 719 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); 720 if (!ScopeOrNone) { 721 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 722 return std::nullopt; 723 } 724 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 725 *ScopeOrNone; 726 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 727 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || 728 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { 729 reportUnsupported(MI, "Unsupported atomic address space"); 730 return std::nullopt; 731 } 732 } 733 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, 734 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, 735 IsNonTemporal); 736 } 737 738 std::optional<SIMemOpInfo> 739 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const { 740 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 741 742 if (!(MI->mayLoad() && !MI->mayStore())) 743 return std::nullopt; 744 745 // Be conservative if there are no memory operands. 746 if (MI->getNumMemOperands() == 0) 747 return SIMemOpInfo(); 748 749 return constructFromMIWithMMO(MI); 750 } 751 752 std::optional<SIMemOpInfo> 753 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const { 754 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 755 756 if (!(!MI->mayLoad() && MI->mayStore())) 757 return std::nullopt; 758 759 // Be conservative if there are no memory operands. 
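  // Informative note: a default-constructed SIMemOpInfo is deliberately the
  // most conservative description possible. Per the defaults of the private
  // constructor above it is roughly equivalent to
  //   SIMemOpInfo(AtomicOrdering::SequentiallyConsistent, SIAtomicScope::SYSTEM,
  //               SIAtomicAddrSpace::ATOMIC, SIAtomicAddrSpace::ALL,
  //               /*IsCrossAddressSpaceOrdering=*/true)
  // so an operation without memory operands is legalized as if it were a
  // system-scope sequentially consistent atomic.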
760 if (MI->getNumMemOperands() == 0) 761 return SIMemOpInfo(); 762 763 return constructFromMIWithMMO(MI); 764 } 765 766 std::optional<SIMemOpInfo> 767 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { 768 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 769 770 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) 771 return std::nullopt; 772 773 AtomicOrdering Ordering = 774 static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); 775 776 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); 777 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC); 778 if (!ScopeOrNone) { 779 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 780 return std::nullopt; 781 } 782 783 SIAtomicScope Scope = SIAtomicScope::NONE; 784 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 785 bool IsCrossAddressSpaceOrdering = false; 786 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 787 *ScopeOrNone; 788 789 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 790 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { 791 reportUnsupported(MI, "Unsupported atomic address space"); 792 return std::nullopt; 793 } 794 795 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, 796 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); 797 } 798 799 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( 800 const MachineBasicBlock::iterator &MI) const { 801 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 802 803 if (!(MI->mayLoad() && MI->mayStore())) 804 return std::nullopt; 805 806 // Be conservative if there are no memory operands. 807 if (MI->getNumMemOperands() == 0) 808 return SIMemOpInfo(); 809 810 return constructFromMIWithMMO(MI); 811 } 812 813 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { 814 TII = ST.getInstrInfo(); 815 IV = getIsaVersion(ST.getCPU()); 816 InsertCacheInv = !AmdgcnSkipCacheInvalidations; 817 } 818 819 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, 820 AMDGPU::CPol::CPol Bit) const { 821 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); 822 if (!CPol) 823 return false; 824 825 CPol->setImm(CPol->getImm() | Bit); 826 return true; 827 } 828 829 /* static */ 830 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { 831 GCNSubtarget::Generation Generation = ST.getGeneration(); 832 if (ST.hasGFX940Insts()) 833 return std::make_unique<SIGfx940CacheControl>(ST); 834 if (ST.hasGFX90AInsts()) 835 return std::make_unique<SIGfx90ACacheControl>(ST); 836 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) 837 return std::make_unique<SIGfx6CacheControl>(ST); 838 if (Generation < AMDGPUSubtarget::GFX10) 839 return std::make_unique<SIGfx7CacheControl>(ST); 840 if (Generation < AMDGPUSubtarget::GFX11) 841 return std::make_unique<SIGfx10CacheControl>(ST); 842 return std::make_unique<SIGfx11CacheControl>(ST); 843 } 844 845 bool SIGfx6CacheControl::enableLoadCacheBypass( 846 const MachineBasicBlock::iterator &MI, 847 SIAtomicScope Scope, 848 SIAtomicAddrSpace AddrSpace) const { 849 assert(MI->mayLoad() && !MI->mayStore()); 850 bool Changed = false; 851 852 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 853 switch (Scope) { 854 case SIAtomicScope::SYSTEM: 855 case SIAtomicScope::AGENT: 856 // Set L1 cache policy to MISS_EVICT. 857 // Note: there is no L2 cache bypass policy at the ISA level. 
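      // Illustrative effect only (the register operands below are made up):
      // enableGLCBit() rewrites e.g.
      //   buffer_load_dword v0, v[2:3], s[0:3], 0 addr64
      // into
      //   buffer_load_dword v0, v[2:3], s[0:3], 0 addr64 glc
      // so the load bypasses the per-CU L1 and is satisfied from L2.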
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so it does not need to be bypassed. There is
  /// no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
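    // Both helpers simply OR their bit into the single cpol operand (see
    // enableNamedBit()), so the combined effect is roughly:
    //   CPol->setImm(CPol->getImm() | AMDGPU::CPol::GLC | AMDGPU::CPol::SLC);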
945 Changed |= enableGLCBit(MI); 946 Changed |= enableSLCBit(MI); 947 return Changed; 948 } 949 950 return Changed; 951 } 952 953 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, 954 SIAtomicScope Scope, 955 SIAtomicAddrSpace AddrSpace, 956 SIMemOp Op, 957 bool IsCrossAddrSpaceOrdering, 958 Position Pos) const { 959 bool Changed = false; 960 961 MachineBasicBlock &MBB = *MI->getParent(); 962 DebugLoc DL = MI->getDebugLoc(); 963 964 if (Pos == Position::AFTER) 965 ++MI; 966 967 bool VMCnt = false; 968 bool LGKMCnt = false; 969 970 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 971 SIAtomicAddrSpace::NONE) { 972 switch (Scope) { 973 case SIAtomicScope::SYSTEM: 974 case SIAtomicScope::AGENT: 975 VMCnt |= true; 976 break; 977 case SIAtomicScope::WORKGROUP: 978 case SIAtomicScope::WAVEFRONT: 979 case SIAtomicScope::SINGLETHREAD: 980 // The L1 cache keeps all memory operations in order for 981 // wavefronts in the same work-group. 982 break; 983 default: 984 llvm_unreachable("Unsupported synchronization scope"); 985 } 986 } 987 988 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 989 switch (Scope) { 990 case SIAtomicScope::SYSTEM: 991 case SIAtomicScope::AGENT: 992 case SIAtomicScope::WORKGROUP: 993 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 994 // not needed as LDS operations for all waves are executed in a total 995 // global ordering as observed by all waves. Required if also 996 // synchronizing with global/GDS memory as LDS operations could be 997 // reordered with respect to later global/GDS memory operations of the 998 // same wave. 999 LGKMCnt |= IsCrossAddrSpaceOrdering; 1000 break; 1001 case SIAtomicScope::WAVEFRONT: 1002 case SIAtomicScope::SINGLETHREAD: 1003 // The LDS keeps all memory operations in order for 1004 // the same wavefront. 1005 break; 1006 default: 1007 llvm_unreachable("Unsupported synchronization scope"); 1008 } 1009 } 1010 1011 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 1012 switch (Scope) { 1013 case SIAtomicScope::SYSTEM: 1014 case SIAtomicScope::AGENT: 1015 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" 1016 // is not needed as GDS operations for all waves are executed in a total 1017 // global ordering as observed by all waves. Required if also 1018 // synchronizing with global/LDS memory as GDS operations could be 1019 // reordered with respect to later global/LDS memory operations of the 1020 // same wave. 1021 LGKMCnt |= IsCrossAddrSpaceOrdering; 1022 break; 1023 case SIAtomicScope::WORKGROUP: 1024 case SIAtomicScope::WAVEFRONT: 1025 case SIAtomicScope::SINGLETHREAD: 1026 // The GDS keeps all memory operations in order for 1027 // the same work-group. 1028 break; 1029 default: 1030 llvm_unreachable("Unsupported synchronization scope"); 1031 } 1032 } 1033 1034 if (VMCnt || LGKMCnt) { 1035 unsigned WaitCntImmediate = 1036 AMDGPU::encodeWaitcnt(IV, 1037 VMCnt ? 0 : getVmcntBitMask(IV), 1038 getExpcntBitMask(IV), 1039 LGKMCnt ? 
0 : getLgkmcntBitMask(IV)); 1040 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); 1041 Changed = true; 1042 } 1043 1044 if (Pos == Position::AFTER) 1045 --MI; 1046 1047 return Changed; 1048 } 1049 1050 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1051 SIAtomicScope Scope, 1052 SIAtomicAddrSpace AddrSpace, 1053 Position Pos) const { 1054 if (!InsertCacheInv) 1055 return false; 1056 1057 bool Changed = false; 1058 1059 MachineBasicBlock &MBB = *MI->getParent(); 1060 DebugLoc DL = MI->getDebugLoc(); 1061 1062 if (Pos == Position::AFTER) 1063 ++MI; 1064 1065 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1066 switch (Scope) { 1067 case SIAtomicScope::SYSTEM: 1068 case SIAtomicScope::AGENT: 1069 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); 1070 Changed = true; 1071 break; 1072 case SIAtomicScope::WORKGROUP: 1073 case SIAtomicScope::WAVEFRONT: 1074 case SIAtomicScope::SINGLETHREAD: 1075 // No cache to invalidate. 1076 break; 1077 default: 1078 llvm_unreachable("Unsupported synchronization scope"); 1079 } 1080 } 1081 1082 /// The scratch address space does not need the global memory cache 1083 /// to be flushed as all memory operations by the same thread are 1084 /// sequentially consistent, and no other thread can access scratch 1085 /// memory. 1086 1087 /// Other address spaces do not have a cache. 1088 1089 if (Pos == Position::AFTER) 1090 --MI; 1091 1092 return Changed; 1093 } 1094 1095 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1096 SIAtomicScope Scope, 1097 SIAtomicAddrSpace AddrSpace, 1098 bool IsCrossAddrSpaceOrdering, 1099 Position Pos) const { 1100 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1101 IsCrossAddrSpaceOrdering, Pos); 1102 } 1103 1104 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1105 SIAtomicScope Scope, 1106 SIAtomicAddrSpace AddrSpace, 1107 Position Pos) const { 1108 if (!InsertCacheInv) 1109 return false; 1110 1111 bool Changed = false; 1112 1113 MachineBasicBlock &MBB = *MI->getParent(); 1114 DebugLoc DL = MI->getDebugLoc(); 1115 1116 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); 1117 1118 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() 1119 ? AMDGPU::BUFFER_WBINVL1 1120 : AMDGPU::BUFFER_WBINVL1_VOL; 1121 1122 if (Pos == Position::AFTER) 1123 ++MI; 1124 1125 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1126 switch (Scope) { 1127 case SIAtomicScope::SYSTEM: 1128 case SIAtomicScope::AGENT: 1129 BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); 1130 Changed = true; 1131 break; 1132 case SIAtomicScope::WORKGROUP: 1133 case SIAtomicScope::WAVEFRONT: 1134 case SIAtomicScope::SINGLETHREAD: 1135 // No cache to invalidate. 1136 break; 1137 default: 1138 llvm_unreachable("Unsupported synchronization scope"); 1139 } 1140 } 1141 1142 /// The scratch address space does not need the global memory cache 1143 /// to be flushed as all memory operations by the same thread are 1144 /// sequentially consistent, and no other thread can access scratch 1145 /// memory. 1146 1147 /// Other address spaces do not have a cache. 
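  // Taken together with insertWait() above, an agent-scope acquire load on
  // GFX7 typically expands to a sequence of the following shape (sketch only;
  // the actual operands are chosen elsewhere):
  //   buffer_load_dword ... glc
  //   s_waitcnt vmcnt(0)
  //   buffer_wbinvl1_vol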
1148 1149 if (Pos == Position::AFTER) 1150 --MI; 1151 1152 return Changed; 1153 } 1154 1155 bool SIGfx90ACacheControl::enableLoadCacheBypass( 1156 const MachineBasicBlock::iterator &MI, 1157 SIAtomicScope Scope, 1158 SIAtomicAddrSpace AddrSpace) const { 1159 assert(MI->mayLoad() && !MI->mayStore()); 1160 bool Changed = false; 1161 1162 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1163 switch (Scope) { 1164 case SIAtomicScope::SYSTEM: 1165 case SIAtomicScope::AGENT: 1166 // Set the L1 cache policy to MISS_LRU. 1167 // Note: there is no L2 cache bypass policy at the ISA level. 1168 Changed |= enableGLCBit(MI); 1169 break; 1170 case SIAtomicScope::WORKGROUP: 1171 // In threadgroup split mode the waves of a work-group can be executing on 1172 // different CUs. Therefore need to bypass the L1 which is per CU. 1173 // Otherwise in non-threadgroup split mode all waves of a work-group are 1174 // on the same CU, and so the L1 does not need to be bypassed. 1175 if (ST.isTgSplitEnabled()) 1176 Changed |= enableGLCBit(MI); 1177 break; 1178 case SIAtomicScope::WAVEFRONT: 1179 case SIAtomicScope::SINGLETHREAD: 1180 // No cache to bypass. 1181 break; 1182 default: 1183 llvm_unreachable("Unsupported synchronization scope"); 1184 } 1185 } 1186 1187 /// The scratch address space does not need the global memory caches 1188 /// to be bypassed as all memory operations by the same thread are 1189 /// sequentially consistent, and no other thread can access scratch 1190 /// memory. 1191 1192 /// Other address spaces do not have a cache. 1193 1194 return Changed; 1195 } 1196 1197 bool SIGfx90ACacheControl::enableStoreCacheBypass( 1198 const MachineBasicBlock::iterator &MI, 1199 SIAtomicScope Scope, 1200 SIAtomicAddrSpace AddrSpace) const { 1201 assert(!MI->mayLoad() && MI->mayStore()); 1202 bool Changed = false; 1203 1204 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1205 switch (Scope) { 1206 case SIAtomicScope::SYSTEM: 1207 case SIAtomicScope::AGENT: 1208 /// Do not set glc for store atomic operations as they implicitly write 1209 /// through the L1 cache. 1210 break; 1211 case SIAtomicScope::WORKGROUP: 1212 case SIAtomicScope::WAVEFRONT: 1213 case SIAtomicScope::SINGLETHREAD: 1214 // No cache to bypass. Store atomics implicitly write through the L1 1215 // cache. 1216 break; 1217 default: 1218 llvm_unreachable("Unsupported synchronization scope"); 1219 } 1220 } 1221 1222 /// The scratch address space does not need the global memory caches 1223 /// to be bypassed as all memory operations by the same thread are 1224 /// sequentially consistent, and no other thread can access scratch 1225 /// memory. 1226 1227 /// Other address spaces do not have a cache. 1228 1229 return Changed; 1230 } 1231 1232 bool SIGfx90ACacheControl::enableRMWCacheBypass( 1233 const MachineBasicBlock::iterator &MI, 1234 SIAtomicScope Scope, 1235 SIAtomicAddrSpace AddrSpace) const { 1236 assert(MI->mayLoad() && MI->mayStore()); 1237 bool Changed = false; 1238 1239 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1240 switch (Scope) { 1241 case SIAtomicScope::SYSTEM: 1242 case SIAtomicScope::AGENT: 1243 /// Do not set glc for RMW atomic operations as they implicitly bypass 1244 /// the L1 cache, and the glc bit is instead used to indicate if they are 1245 /// return or no-return. 1246 break; 1247 case SIAtomicScope::WORKGROUP: 1248 case SIAtomicScope::WAVEFRONT: 1249 case SIAtomicScope::SINGLETHREAD: 1250 // No cache to bypass. 
      // RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory operations
    // to complete to ensure they are visible to waves in the other CUs.
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
    // on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
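    // For example (informative), with -mattr=+tgsplit a workgroup-scope
    // acquire on a global object is handled here like an agent-scope one, so
    // it still ends up producing an "s_waitcnt vmcnt(0)" via the GFX7 path
    // below.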
1327 AddrSpace &= ~SIAtomicAddrSpace::LDS; 1328 } 1329 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, 1330 IsCrossAddrSpaceOrdering, Pos); 1331 } 1332 1333 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1334 SIAtomicScope Scope, 1335 SIAtomicAddrSpace AddrSpace, 1336 Position Pos) const { 1337 if (!InsertCacheInv) 1338 return false; 1339 1340 bool Changed = false; 1341 1342 MachineBasicBlock &MBB = *MI->getParent(); 1343 DebugLoc DL = MI->getDebugLoc(); 1344 1345 if (Pos == Position::AFTER) 1346 ++MI; 1347 1348 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1349 switch (Scope) { 1350 case SIAtomicScope::SYSTEM: 1351 // Ensures that following loads will not see stale remote VMEM data or 1352 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and 1353 // CC will never be stale due to the local memory probes. 1354 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); 1355 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the 1356 // hardware does not reorder memory operations by the same wave with 1357 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to 1358 // remove any cache lines of earlier writes by the same wave and ensures 1359 // later reads by the same wave will refetch the cache lines. 1360 Changed = true; 1361 break; 1362 case SIAtomicScope::AGENT: 1363 // Same as GFX7. 1364 break; 1365 case SIAtomicScope::WORKGROUP: 1366 // In threadgroup split mode the waves of a work-group can be executing on 1367 // different CUs. Therefore need to invalidate the L1 which is per CU. 1368 // Otherwise in non-threadgroup split mode all waves of a work-group are 1369 // on the same CU, and so the L1 does not need to be invalidated. 1370 if (ST.isTgSplitEnabled()) { 1371 // Same as GFX7 using agent scope. 1372 Scope = SIAtomicScope::AGENT; 1373 } 1374 break; 1375 case SIAtomicScope::WAVEFRONT: 1376 case SIAtomicScope::SINGLETHREAD: 1377 // Same as GFX7. 1378 break; 1379 default: 1380 llvm_unreachable("Unsupported synchronization scope"); 1381 } 1382 } 1383 1384 /// The scratch address space does not need the global memory cache 1385 /// to be flushed as all memory operations by the same thread are 1386 /// sequentially consistent, and no other thread can access scratch 1387 /// memory. 1388 1389 /// Other address spaces do not have a cache. 1390 1391 if (Pos == Position::AFTER) 1392 --MI; 1393 1394 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); 1395 1396 return Changed; 1397 } 1398 1399 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1400 SIAtomicScope Scope, 1401 SIAtomicAddrSpace AddrSpace, 1402 bool IsCrossAddrSpaceOrdering, 1403 Position Pos) const { 1404 bool Changed = false; 1405 1406 MachineBasicBlock &MBB = *MI->getParent(); 1407 DebugLoc DL = MI->getDebugLoc(); 1408 1409 if (Pos == Position::AFTER) 1410 ++MI; 1411 1412 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1413 switch (Scope) { 1414 case SIAtomicScope::SYSTEM: 1415 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1416 // hardware does not reorder memory operations by the same wave with 1417 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1418 // to initiate writeback of any dirty cache lines of earlier writes by the 1419 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1420 // writeback has completed. 
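      // Sketch of the resulting system-scope release sequence (informative
      // only; the wait itself comes from the insertRelease/insertWait call at
      // the end of this function):
      //   buffer_wbl2
      //   s_waitcnt vmcnt(0)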
1421 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1422 // Set SC bits to indicate system scope. 1423 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1424 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT 1425 // vmcnt(0)" needed by the "BUFFER_WBL2". 1426 Changed = true; 1427 break; 1428 case SIAtomicScope::AGENT: 1429 case SIAtomicScope::WORKGROUP: 1430 case SIAtomicScope::WAVEFRONT: 1431 case SIAtomicScope::SINGLETHREAD: 1432 // Same as GFX7. 1433 break; 1434 default: 1435 llvm_unreachable("Unsupported synchronization scope"); 1436 } 1437 } 1438 1439 if (Pos == Position::AFTER) 1440 --MI; 1441 1442 Changed |= 1443 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, 1444 IsCrossAddrSpaceOrdering, Pos); 1445 1446 return Changed; 1447 } 1448 1449 bool SIGfx940CacheControl::enableLoadCacheBypass( 1450 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 1451 SIAtomicAddrSpace AddrSpace) const { 1452 assert(MI->mayLoad() && !MI->mayStore()); 1453 bool Changed = false; 1454 1455 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1456 switch (Scope) { 1457 case SIAtomicScope::SYSTEM: 1458 // Set SC bits to indicate system scope. 1459 Changed |= enableSC0Bit(MI); 1460 Changed |= enableSC1Bit(MI); 1461 break; 1462 case SIAtomicScope::AGENT: 1463 // Set SC bits to indicate agent scope. 1464 Changed |= enableSC1Bit(MI); 1465 break; 1466 case SIAtomicScope::WORKGROUP: 1467 // In threadgroup split mode the waves of a work-group can be executing on 1468 // different CUs. Therefore need to bypass the L1 which is per CU. 1469 // Otherwise in non-threadgroup split mode all waves of a work-group are 1470 // on the same CU, and so the L1 does not need to be bypassed. Setting SC 1471 // bits to indicate work-group scope will do this automatically. 1472 Changed |= enableSC0Bit(MI); 1473 break; 1474 case SIAtomicScope::WAVEFRONT: 1475 case SIAtomicScope::SINGLETHREAD: 1476 // Leave SC bits unset to indicate wavefront scope. 1477 break; 1478 default: 1479 llvm_unreachable("Unsupported synchronization scope"); 1480 } 1481 } 1482 1483 /// The scratch address space does not need the global memory caches 1484 /// to be bypassed as all memory operations by the same thread are 1485 /// sequentially consistent, and no other thread can access scratch 1486 /// memory. 1487 1488 /// Other address spaces do not have a cache. 1489 1490 return Changed; 1491 } 1492 1493 bool SIGfx940CacheControl::enableStoreCacheBypass( 1494 const MachineBasicBlock::iterator &MI, 1495 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { 1496 assert(!MI->mayLoad() && MI->mayStore()); 1497 bool Changed = false; 1498 1499 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1500 switch (Scope) { 1501 case SIAtomicScope::SYSTEM: 1502 // Set SC bits to indicate system scope. 1503 Changed |= enableSC0Bit(MI); 1504 Changed |= enableSC1Bit(MI); 1505 break; 1506 case SIAtomicScope::AGENT: 1507 // Set SC bits to indicate agent scope. 1508 Changed |= enableSC1Bit(MI); 1509 break; 1510 case SIAtomicScope::WORKGROUP: 1511 // Set SC bits to indicate workgroup scope. 1512 Changed |= enableSC0Bit(MI); 1513 break; 1514 case SIAtomicScope::WAVEFRONT: 1515 case SIAtomicScope::SINGLETHREAD: 1516 // Leave SC bits unset to indicate wavefront scope. 
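      // Informative summary of the SC0/SC1 encoding used by this switch:
      //   (no bits) -> wavefront / single-thread scope
      //   sc0       -> work-group scope
      //   sc1       -> agent scope
      //   sc0 sc1   -> system scope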
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx940CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC1 bit to indicate system scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // RMW atomic operations implicitly bypass the L1 cache and only use SC1
      // to indicate system or agent scope. The SC0 bit is used to indicate if
      // they are return or no-return. Leave SC1 bit unset to indicate agent
      // scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set SC bits to indicate system scope.
    Changed |= enableSC0Bit(MI);
    Changed |= enableSC1Bit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
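    // The wait requested below is what makes the volatile access globally
    // visible; for a global access at system scope it boils down to an
    // "s_waitcnt vmcnt(0)" inserted after the access (sketch, see
    // insertWait()).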
1588 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 1589 Position::AFTER); 1590 1591 return Changed; 1592 } 1593 1594 if (IsNonTemporal) { 1595 Changed |= enableNTBit(MI); 1596 return Changed; 1597 } 1598 1599 return Changed; 1600 } 1601 1602 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1603 SIAtomicScope Scope, 1604 SIAtomicAddrSpace AddrSpace, 1605 Position Pos) const { 1606 if (!InsertCacheInv) 1607 return false; 1608 1609 bool Changed = false; 1610 1611 MachineBasicBlock &MBB = *MI->getParent(); 1612 DebugLoc DL = MI->getDebugLoc(); 1613 1614 if (Pos == Position::AFTER) 1615 ++MI; 1616 1617 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1618 switch (Scope) { 1619 case SIAtomicScope::SYSTEM: 1620 // Ensures that following loads will not see stale remote VMEM data or 1621 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and 1622 // CC will never be stale due to the local memory probes. 1623 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) 1624 // Set SC bits to indicate system scope. 1625 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1626 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the 1627 // hardware does not reorder memory operations by the same wave with 1628 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to 1629 // remove any cache lines of earlier writes by the same wave and ensures 1630 // later reads by the same wave will refetch the cache lines. 1631 Changed = true; 1632 break; 1633 case SIAtomicScope::AGENT: 1634 // Ensures that following loads will not see stale remote data or local 1635 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale 1636 // due to the memory probes. 1637 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) 1638 // Set SC bits to indicate agent scope. 1639 .addImm(AMDGPU::CPol::SC1); 1640 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware 1641 // does not reorder memory operations with respect to a preceding buffer 1642 // invalidate. The invalidate is guaranteed to remove any cache lines of 1643 // earlier writes and ensures later reads will refetch the cache lines. 1644 Changed = true; 1645 break; 1646 case SIAtomicScope::WORKGROUP: 1647 // In threadgroup split mode the waves of a work-group can be executing on 1648 // different CUs. Therefore need to invalidate the L1 which is per CU. 1649 // Otherwise in non-threadgroup split mode all waves of a work-group are 1650 // on the same CU, and so the L1 does not need to be invalidated. 1651 if (ST.isTgSplitEnabled()) { 1652 // Ensures L1 is invalidated if in threadgroup split mode. In 1653 // non-threadgroup split mode it is a NOP, but there is no point 1654 // generating it when we know we are not in that mode. 1655 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) 1656 // Set SC bits to indicate work-group scope. 1657 .addImm(AMDGPU::CPol::SC0); 1658 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware 1659 // does not reorder memory operations with respect to a preceding buffer 1660 // invalidate. The invalidate is guaranteed to remove any cache lines of 1661 // earlier writes and ensures later reads will refetch the cache lines. 1662 Changed = true; 1663 } 1664 break; 1665 case SIAtomicScope::WAVEFRONT: 1666 case SIAtomicScope::SINGLETHREAD: 1667 // Could generate "BUFFER_INV" but it would do nothing as there are no 1668 // caches to invalidate.
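// At wavefront or single-thread scope a wave only needs to observe its own
// earlier accesses, which it already does, so no invalidate is required.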
1669 break; 1670 default: 1671 llvm_unreachable("Unsupported synchronization scope"); 1672 } 1673 } 1674 1675 /// The scratch address space does not need the global memory cache 1676 /// to be flushed as all memory operations by the same thread are 1677 /// sequentially consistent, and no other thread can access scratch 1678 /// memory. 1679 1680 /// Other address spaces do not have a cache. 1681 1682 if (Pos == Position::AFTER) 1683 --MI; 1684 1685 return Changed; 1686 } 1687 1688 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1689 SIAtomicScope Scope, 1690 SIAtomicAddrSpace AddrSpace, 1691 bool IsCrossAddrSpaceOrdering, 1692 Position Pos) const { 1693 bool Changed = false; 1694 1695 MachineBasicBlock &MBB = *MI->getParent(); 1696 DebugLoc DL = MI->getDebugLoc(); 1697 1698 if (Pos == Position::AFTER) 1699 ++MI; 1700 1701 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1702 switch (Scope) { 1703 case SIAtomicScope::SYSTEM: 1704 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1705 // hardware does not reorder memory operations by the same wave with 1706 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1707 // to initiate writeback of any dirty cache lines of earlier writes by the 1708 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1709 // writeback has completed. 1710 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1711 // Set SC bits to indicate system scope. 1712 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1713 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 1714 // SIAtomicScope::SYSTEM, the following insertWait will generate the 1715 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". 1716 Changed = true; 1717 break; 1718 case SIAtomicScope::AGENT: 1719 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 1720 // Set SC bits to indicate agent scope. 1721 .addImm(AMDGPU::CPol::SC1); 1722 1723 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 1724 // SIAtomicScope::AGENT, the following insertWait will generate the 1725 // required "S_WAITCNT vmcnt(0)". 1726 Changed = true; 1727 break; 1728 case SIAtomicScope::WORKGROUP: 1729 case SIAtomicScope::WAVEFRONT: 1730 case SIAtomicScope::SINGLETHREAD: 1731 // Do not generate "BUFFER_WBL2" as there are no caches it would 1732 // writeback, and would require an otherwise unnecessary 1733 // "S_WAITCNT vmcnt(0)". 1734 break; 1735 default: 1736 llvm_unreachable("Unsupported synchronization scope"); 1737 } 1738 } 1739 1740 if (Pos == Position::AFTER) 1741 --MI; 1742 1743 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other 1744 // S_WAITCNT needed. 1745 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1746 IsCrossAddrSpaceOrdering, Pos); 1747 1748 return Changed; 1749 } 1750 1751 bool SIGfx10CacheControl::enableLoadCacheBypass( 1752 const MachineBasicBlock::iterator &MI, 1753 SIAtomicScope Scope, 1754 SIAtomicAddrSpace AddrSpace) const { 1755 assert(MI->mayLoad() && !MI->mayStore()); 1756 bool Changed = false; 1757 1758 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1759 switch (Scope) { 1760 case SIAtomicScope::SYSTEM: 1761 case SIAtomicScope::AGENT: 1762 // Set the L0 and L1 cache policies to MISS_EVICT. 1763 // Note: there is no L2 cache coherent bypass control at the ISA level. 
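// Informal note: on GFX10 the GLC bit selects the L0 policy and the DLC bit
// selects the L1 policy, so both are set below to get MISS_EVICT in both
// caches.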
1764 Changed |= enableGLCBit(MI); 1765 Changed |= enableDLCBit(MI); 1766 break; 1767 case SIAtomicScope::WORKGROUP: 1768 // In WGP mode the waves of a work-group can be executing on either CU of 1769 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in 1770 // CU mode all waves of a work-group are on the same CU, and so the L0 1771 // does not need to be bypassed. 1772 if (!ST.isCuModeEnabled()) 1773 Changed |= enableGLCBit(MI); 1774 break; 1775 case SIAtomicScope::WAVEFRONT: 1776 case SIAtomicScope::SINGLETHREAD: 1777 // No cache to bypass. 1778 break; 1779 default: 1780 llvm_unreachable("Unsupported synchronization scope"); 1781 } 1782 } 1783 1784 /// The scratch address space does not need the global memory caches 1785 /// to be bypassed as all memory operations by the same thread are 1786 /// sequentially consistent, and no other thread can access scratch 1787 /// memory. 1788 1789 /// Other address spaces do not have a cache. 1790 1791 return Changed; 1792 } 1793 1794 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( 1795 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 1796 bool IsVolatile, bool IsNonTemporal) const { 1797 1798 // Only handle load and store, not atomic read-modify-write instructions. The 1799 // latter use glc to indicate if the atomic returns a result and so must not 1800 // be used for cache control. 1801 assert(MI->mayLoad() ^ MI->mayStore()); 1802 1803 // Only update load and store, not LLVM IR atomic read-modify-write 1804 // instructions. The latter are always marked as volatile, so they cannot be 1805 // handled sensibly here without pessimizing all atomics. They also do not 1806 // support the nontemporal attribute. 1807 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 1808 1809 bool Changed = false; 1810 1811 if (IsVolatile) { 1812 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions 1813 // and MISS_LRU for store instructions. 1814 // Note: there is no L2 cache coherent bypass control at the ISA level. 1815 if (Op == SIMemOp::LOAD) { 1816 Changed |= enableGLCBit(MI); 1817 Changed |= enableDLCBit(MI); 1818 } 1819 1820 // Ensure operation has completed at system scope to cause all volatile 1821 // operations to be visible outside the program in a global order. Do not 1822 // request cross address space as only the global address space can be 1823 // observable outside the program, so no need to cause a waitcnt for LDS 1824 // address space operations. 1825 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 1826 Position::AFTER); 1827 return Changed; 1828 } 1829 1830 if (IsNonTemporal) { 1831 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT 1832 // and L2 cache policy to STREAM. 1833 // For stores setting both GLC and SLC configures L0 and L1 cache policy 1834 // to MISS_EVICT and the L2 cache policy to STREAM.
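// For example (illustrative only): a nontemporal global store is expected to
// be emitted roughly as
//   global_store_dword v[0:1], v2, off glc slc
// per the bits set below.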
1835 if (Op == SIMemOp::STORE) 1836 Changed |= enableGLCBit(MI); 1837 Changed |= enableSLCBit(MI); 1838 1839 return Changed; 1840 } 1841 1842 return Changed; 1843 } 1844 1845 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, 1846 SIAtomicScope Scope, 1847 SIAtomicAddrSpace AddrSpace, 1848 SIMemOp Op, 1849 bool IsCrossAddrSpaceOrdering, 1850 Position Pos) const { 1851 bool Changed = false; 1852 1853 MachineBasicBlock &MBB = *MI->getParent(); 1854 DebugLoc DL = MI->getDebugLoc(); 1855 1856 if (Pos == Position::AFTER) 1857 ++MI; 1858 1859 bool VMCnt = false; 1860 bool VSCnt = false; 1861 bool LGKMCnt = false; 1862 1863 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 1864 SIAtomicAddrSpace::NONE) { 1865 switch (Scope) { 1866 case SIAtomicScope::SYSTEM: 1867 case SIAtomicScope::AGENT: 1868 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 1869 VMCnt |= true; 1870 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 1871 VSCnt |= true; 1872 break; 1873 case SIAtomicScope::WORKGROUP: 1874 // In WGP mode the waves of a work-group can be executing on either CU of 1875 // the WGP. Therefore need to wait for operations to complete to ensure 1876 // they are visible to waves in the other CU as the L0 is per CU. 1877 // Otherwise in CU mode and all waves of a work-group are on the same CU 1878 // which shares the same L0. 1879 if (!ST.isCuModeEnabled()) { 1880 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 1881 VMCnt |= true; 1882 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 1883 VSCnt |= true; 1884 } 1885 break; 1886 case SIAtomicScope::WAVEFRONT: 1887 case SIAtomicScope::SINGLETHREAD: 1888 // The L0 cache keeps all memory operations in order for 1889 // work-items in the same wavefront. 1890 break; 1891 default: 1892 llvm_unreachable("Unsupported synchronization scope"); 1893 } 1894 } 1895 1896 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 1897 switch (Scope) { 1898 case SIAtomicScope::SYSTEM: 1899 case SIAtomicScope::AGENT: 1900 case SIAtomicScope::WORKGROUP: 1901 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 1902 // not needed as LDS operations for all waves are executed in a total 1903 // global ordering as observed by all waves. Required if also 1904 // synchronizing with global/GDS memory as LDS operations could be 1905 // reordered with respect to later global/GDS memory operations of the 1906 // same wave. 1907 LGKMCnt |= IsCrossAddrSpaceOrdering; 1908 break; 1909 case SIAtomicScope::WAVEFRONT: 1910 case SIAtomicScope::SINGLETHREAD: 1911 // The LDS keeps all memory operations in order for 1912 // the same wavefront. 1913 break; 1914 default: 1915 llvm_unreachable("Unsupported synchronization scope"); 1916 } 1917 } 1918 1919 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 1920 switch (Scope) { 1921 case SIAtomicScope::SYSTEM: 1922 case SIAtomicScope::AGENT: 1923 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" 1924 // is not needed as GDS operations for all waves are executed in a total 1925 // global ordering as observed by all waves. Required if also 1926 // synchronizing with global/LDS memory as GDS operations could be 1927 // reordered with respect to later global/LDS memory operations of the 1928 // same wave. 1929 LGKMCnt |= IsCrossAddrSpaceOrdering; 1930 break; 1931 case SIAtomicScope::WORKGROUP: 1932 case SIAtomicScope::WAVEFRONT: 1933 case SIAtomicScope::SINGLETHREAD: 1934 // The GDS keeps all memory operations in order for 1935 // the same work-group. 
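// No LGKM wait is therefore required at these scopes.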
1936 break; 1937 default: 1938 llvm_unreachable("Unsupported synchronization scope"); 1939 } 1940 } 1941 1942 if (VMCnt || LGKMCnt) { 1943 unsigned WaitCntImmediate = 1944 AMDGPU::encodeWaitcnt(IV, 1945 VMCnt ? 0 : getVmcntBitMask(IV), 1946 getExpcntBitMask(IV), 1947 LGKMCnt ? 0 : getLgkmcntBitMask(IV)); 1948 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); 1949 Changed = true; 1950 } 1951 1952 if (VSCnt) { 1953 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) 1954 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1955 .addImm(0); 1956 Changed = true; 1957 } 1958 1959 if (Pos == Position::AFTER) 1960 --MI; 1961 1962 return Changed; 1963 } 1964 1965 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1966 SIAtomicScope Scope, 1967 SIAtomicAddrSpace AddrSpace, 1968 Position Pos) const { 1969 if (!InsertCacheInv) 1970 return false; 1971 1972 bool Changed = false; 1973 1974 MachineBasicBlock &MBB = *MI->getParent(); 1975 DebugLoc DL = MI->getDebugLoc(); 1976 1977 if (Pos == Position::AFTER) 1978 ++MI; 1979 1980 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1981 switch (Scope) { 1982 case SIAtomicScope::SYSTEM: 1983 case SIAtomicScope::AGENT: 1984 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 1985 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); 1986 Changed = true; 1987 break; 1988 case SIAtomicScope::WORKGROUP: 1989 // In WGP mode the waves of a work-group can be executing on either CU of 1990 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise 1991 // in CU mode and all waves of a work-group are on the same CU, and so the 1992 // L0 does not need to be invalidated. 1993 if (!ST.isCuModeEnabled()) { 1994 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 1995 Changed = true; 1996 } 1997 break; 1998 case SIAtomicScope::WAVEFRONT: 1999 case SIAtomicScope::SINGLETHREAD: 2000 // No cache to invalidate. 2001 break; 2002 default: 2003 llvm_unreachable("Unsupported synchronization scope"); 2004 } 2005 } 2006 2007 /// The scratch address space does not need the global memory cache 2008 /// to be flushed as all memory operations by the same thread are 2009 /// sequentially consistent, and no other thread can access scratch 2010 /// memory. 2011 2012 /// Other address spaces do not have a cache. 2013 2014 if (Pos == Position::AFTER) 2015 --MI; 2016 2017 return Changed; 2018 } 2019 2020 bool SIGfx11CacheControl::enableLoadCacheBypass( 2021 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 2022 SIAtomicAddrSpace AddrSpace) const { 2023 assert(MI->mayLoad() && !MI->mayStore()); 2024 bool Changed = false; 2025 2026 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2027 switch (Scope) { 2028 case SIAtomicScope::SYSTEM: 2029 case SIAtomicScope::AGENT: 2030 // Set the L0 and L1 cache policies to MISS_EVICT. 2031 // Note: there is no L2 cache coherent bypass control at the ISA level. 2032 Changed |= enableGLCBit(MI); 2033 break; 2034 case SIAtomicScope::WORKGROUP: 2035 // In WGP mode the waves of a work-group can be executing on either CU of 2036 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in 2037 // CU mode all waves of a work-group are on the same CU, and so the L0 2038 // does not need to be bypassed. 2039 if (!ST.isCuModeEnabled()) 2040 Changed |= enableGLCBit(MI); 2041 break; 2042 case SIAtomicScope::WAVEFRONT: 2043 case SIAtomicScope::SINGLETHREAD: 2044 // No cache to bypass. 
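// As on GFX10, the L0 already keeps a wave's own memory operations in order,
// so no cache policy change is needed at these scopes.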
2045 break; 2046 default: 2047 llvm_unreachable("Unsupported synchronization scope"); 2048 } 2049 } 2050 2051 /// The scratch address space does not need the global memory caches 2052 /// to be bypassed as all memory operations by the same thread are 2053 /// sequentially consistent, and no other thread can access scratch 2054 /// memory. 2055 2056 /// Other address spaces do not have a cache. 2057 2058 return Changed; 2059 } 2060 2061 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( 2062 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 2063 bool IsVolatile, bool IsNonTemporal) const { 2064 2065 // Only handle load and store, not atomic read-modify-write instructions. The 2066 // latter use glc to indicate if the atomic returns a result and so must not 2067 // be used for cache control. 2068 assert(MI->mayLoad() ^ MI->mayStore()); 2069 2070 // Only update load and store, not LLVM IR atomic read-modify-write 2071 // instructions. The latter are always marked as volatile, so they cannot be 2072 // handled sensibly here without pessimizing all atomics. They also do not 2073 // support the nontemporal attribute. 2074 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 2075 2076 bool Changed = false; 2077 2078 if (IsVolatile) { 2079 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions 2080 // and MISS_LRU for store instructions. 2081 // Note: there is no L2 cache coherent bypass control at the ISA level. 2082 if (Op == SIMemOp::LOAD) 2083 Changed |= enableGLCBit(MI); 2084 2085 // Set MALL NOALLOC for load and store instructions. 2086 Changed |= enableDLCBit(MI); 2087 2088 // Ensure operation has completed at system scope to cause all volatile 2089 // operations to be visible outside the program in a global order. Do not 2090 // request cross address space as only the global address space can be 2091 // observable outside the program, so no need to cause a waitcnt for LDS 2092 // address space operations. 2093 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 2094 Position::AFTER); 2095 return Changed; 2096 } 2097 2098 if (IsNonTemporal) { 2099 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT 2100 // and L2 cache policy to STREAM. 2101 // For stores setting both GLC and SLC configures L0 and L1 cache policy 2102 // to MISS_EVICT and the L2 cache policy to STREAM. 2103 if (Op == SIMemOp::STORE) 2104 Changed |= enableGLCBit(MI); 2105 Changed |= enableSLCBit(MI); 2106 2107 // Set MALL NOALLOC for load and store instructions.
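// Note that on GFX11 the DLC bit set below selects MALL NOALLOC rather than
// the L1 bypass it selects on GFX10. For example (illustrative only), a
// nontemporal global load is expected to come out roughly as
//   global_load_b32 v0, v[0:1], off slc dlc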
2108 Changed |= enableDLCBit(MI); 2109 return Changed; 2110 } 2111 2112 return Changed; 2113 } 2114 2115 bool SIMemoryLegalizer::removeAtomicPseudoMIs() { 2116 if (AtomicPseudoMIs.empty()) 2117 return false; 2118 2119 for (auto &MI : AtomicPseudoMIs) 2120 MI->eraseFromParent(); 2121 2122 AtomicPseudoMIs.clear(); 2123 return true; 2124 } 2125 2126 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, 2127 MachineBasicBlock::iterator &MI) { 2128 assert(MI->mayLoad() && !MI->mayStore()); 2129 2130 bool Changed = false; 2131 2132 if (MOI.isAtomic()) { 2133 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2134 MOI.getOrdering() == AtomicOrdering::Acquire || 2135 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2136 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(), 2137 MOI.getOrderingAddrSpace()); 2138 } 2139 2140 if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2141 Changed |= CC->insertWait(MI, MOI.getScope(), 2142 MOI.getOrderingAddrSpace(), 2143 SIMemOp::LOAD | SIMemOp::STORE, 2144 MOI.getIsCrossAddressSpaceOrdering(), 2145 Position::BEFORE); 2146 2147 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2148 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2149 Changed |= CC->insertWait(MI, MOI.getScope(), 2150 MOI.getInstrAddrSpace(), 2151 SIMemOp::LOAD, 2152 MOI.getIsCrossAddressSpaceOrdering(), 2153 Position::AFTER); 2154 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2155 MOI.getOrderingAddrSpace(), 2156 Position::AFTER); 2157 } 2158 2159 return Changed; 2160 } 2161 2162 // Atomic instructions already bypass caches to the scope specified by the 2163 // SyncScope operand. Only non-atomic volatile and nontemporal instructions 2164 // need additional treatment. 2165 Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(), 2166 SIMemOp::LOAD, MOI.isVolatile(), 2167 MOI.isNonTemporal()); 2168 return Changed; 2169 } 2170 2171 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, 2172 MachineBasicBlock::iterator &MI) { 2173 assert(!MI->mayLoad() && MI->mayStore()); 2174 2175 bool Changed = false; 2176 2177 if (MOI.isAtomic()) { 2178 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2179 MOI.getOrdering() == AtomicOrdering::Release || 2180 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2181 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(), 2182 MOI.getOrderingAddrSpace()); 2183 } 2184 2185 if (MOI.getOrdering() == AtomicOrdering::Release || 2186 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2187 Changed |= CC->insertRelease(MI, MOI.getScope(), 2188 MOI.getOrderingAddrSpace(), 2189 MOI.getIsCrossAddressSpaceOrdering(), 2190 Position::BEFORE); 2191 2192 return Changed; 2193 } 2194 2195 // Atomic instructions already bypass caches to the scope specified by the 2196 // SyncScope operand. Only non-atomic volatile and nontemporal instructions 2197 // need additional treatment. 
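// For example (illustrative IR only): a non-atomic
//   store volatile i32 %v, ptr addrspace(1) %p
// reaches this point and receives the volatile handling implemented by the
// target's cache control above.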
2198 Changed |= CC->enableVolatileAndOrNonTemporal( 2199 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(), 2200 MOI.isNonTemporal()); 2201 return Changed; 2202 } 2203 2204 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, 2205 MachineBasicBlock::iterator &MI) { 2206 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE); 2207 2208 AtomicPseudoMIs.push_back(MI); 2209 bool Changed = false; 2210 2211 if (MOI.isAtomic()) { 2212 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2213 MOI.getOrdering() == AtomicOrdering::Release || 2214 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2215 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2216 /// TODO: This relies on a barrier always generating a waitcnt 2217 /// for LDS to ensure it is not reordered with the completion of 2218 /// the preceding LDS operations. If the barrier had a memory 2219 /// ordering and memory scope, then the library would not need to 2220 /// generate a fence. Could add support in this file for 2221 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally 2222 /// adding S_WAITCNT before a S_BARRIER. 2223 Changed |= CC->insertRelease(MI, MOI.getScope(), 2224 MOI.getOrderingAddrSpace(), 2225 MOI.getIsCrossAddressSpaceOrdering(), 2226 Position::BEFORE); 2227 2228 // TODO: If both release and invalidate are happening they could be combined 2229 // to use the single "BUFFER_WBINV*" instruction. This could be done by 2230 // reorganizing this code or as part of optimizing the SIInsertWaitcnt pass to 2231 // track cache invalidate and write back instructions. 2232 2233 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2234 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2235 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2236 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2237 MOI.getOrderingAddrSpace(), 2238 Position::BEFORE); 2239 2240 return Changed; 2241 } 2242 2243 return Changed; 2244 } 2245 2246 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 2247 MachineBasicBlock::iterator &MI) { 2248 assert(MI->mayLoad() && MI->mayStore()); 2249 2250 bool Changed = false; 2251 2252 if (MOI.isAtomic()) { 2253 if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2254 MOI.getOrdering() == AtomicOrdering::Acquire || 2255 MOI.getOrdering() == AtomicOrdering::Release || 2256 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2257 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2258 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(), 2259 MOI.getInstrAddrSpace()); 2260 } 2261 2262 if (MOI.getOrdering() == AtomicOrdering::Release || 2263 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2264 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 2265 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) 2266 Changed |= CC->insertRelease(MI, MOI.getScope(), 2267 MOI.getOrderingAddrSpace(), 2268 MOI.getIsCrossAddressSpaceOrdering(), 2269 Position::BEFORE); 2270 2271 if (MOI.getOrdering() == AtomicOrdering::Acquire || 2272 MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2273 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 2274 MOI.getFailureOrdering() == AtomicOrdering::Acquire || 2275 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { 2276 Changed |= CC->insertWait(MI, MOI.getScope(), 2277 MOI.getInstrAddrSpace(), 2278 isAtomicRet(*MI) ?
SIMemOp::LOAD : 2279 SIMemOp::STORE, 2280 MOI.getIsCrossAddressSpaceOrdering(), 2281 Position::AFTER); 2282 Changed |= CC->insertAcquire(MI, MOI.getScope(), 2283 MOI.getOrderingAddrSpace(), 2284 Position::AFTER); 2285 } 2286 2287 return Changed; 2288 } 2289 2290 return Changed; 2291 } 2292 2293 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { 2294 bool Changed = false; 2295 2296 SIMemOpAccess MOA(MF); 2297 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>()); 2298 2299 for (auto &MBB : MF) { 2300 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { 2301 2302 // Unbundle instructions after the post-RA scheduler. 2303 if (MI->isBundle() && MI->mayLoadOrStore()) { 2304 MachineBasicBlock::instr_iterator II(MI->getIterator()); 2305 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); 2306 I != E && I->isBundledWithPred(); ++I) { 2307 I->unbundleFromPred(); 2308 for (MachineOperand &MO : I->operands()) 2309 if (MO.isReg()) 2310 MO.setIsInternalRead(false); 2311 } 2312 2313 MI->eraseFromParent(); 2314 MI = II->getIterator(); 2315 } 2316 2317 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) 2318 continue; 2319 2320 if (const auto &MOI = MOA.getLoadInfo(MI)) 2321 Changed |= expandLoad(*MOI, MI); 2322 else if (const auto &MOI = MOA.getStoreInfo(MI)) 2323 Changed |= expandStore(*MOI, MI); 2324 else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) 2325 Changed |= expandAtomicFence(*MOI, MI); 2326 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) 2327 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); 2328 } 2329 } 2330 2331 Changed |= removeAtomicPseudoMIs(); 2332 return Changed; 2333 } 2334 2335 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false) 2336 2337 char SIMemoryLegalizer::ID = 0; 2338 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID; 2339 2340 FunctionPass *llvm::createSIMemoryLegalizerPass() { 2341 return new SIMemoryLegalizer(); 2342 } 2343
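// Illustrative end-to-end example (a sketch, not taken from a specific lit
// test): on a GFX940-class subtarget, an IR load such as
//
//   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire, align 4
//
// is expected to be legalized by expandLoad above roughly into
//
//   global_load_dword v0, v[0:1], off sc1   ; agent-scope cache bypass
//   s_waitcnt vmcnt(0)                      ; wait for the load to complete
//   buffer_inv sc1                          ; invalidate stale agent-scope data
//
// with the exact cache-policy bits chosen by the SIGfx940CacheControl
// callbacks defined in this file.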