1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Memory legalizer - implements memory model. More information can be 11 /// found here: 12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPU.h" 17 #include "AMDGPUMachineModuleInfo.h" 18 #include "GCNSubtarget.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "llvm/ADT/BitmaskEnum.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/IR/DiagnosticInfo.h" 23 #include "llvm/Support/AtomicOrdering.h" 24 #include "llvm/Support/TargetParser.h" 25 26 using namespace llvm; 27 using namespace llvm::AMDGPU; 28 29 #define DEBUG_TYPE "si-memory-legalizer" 30 #define PASS_NAME "SI Memory Legalizer" 31 32 static cl::opt<bool> AmdgcnSkipCacheInvalidations( 33 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, 34 cl::desc("Use this to skip inserting cache invalidating instructions.")); 35 36 namespace { 37 38 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); 39 40 /// Memory operation flags. Can be ORed together. 41 enum class SIMemOp { 42 NONE = 0u, 43 LOAD = 1u << 0, 44 STORE = 1u << 1, 45 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) 46 }; 47 48 /// Position to insert a new instruction relative to an existing 49 /// instruction. 50 enum class Position { 51 BEFORE, 52 AFTER 53 }; 54 55 /// The atomic synchronization scopes supported by the AMDGPU target. 56 enum class SIAtomicScope { 57 NONE, 58 SINGLETHREAD, 59 WAVEFRONT, 60 WORKGROUP, 61 AGENT, 62 SYSTEM 63 }; 64 65 /// The distinct address spaces supported by the AMDGPU target for 66 /// atomic memory operations. Can be ORed together. 67 enum class SIAtomicAddrSpace { 68 NONE = 0u, 69 GLOBAL = 1u << 0, 70 LDS = 1u << 1, 71 SCRATCH = 1u << 2, 72 GDS = 1u << 3, 73 OTHER = 1u << 4, 74 75 /// The address spaces that can be accessed by a FLAT instruction. 76 FLAT = GLOBAL | LDS | SCRATCH, 77 78 /// The address spaces that support atomic instructions. 79 ATOMIC = GLOBAL | LDS | SCRATCH | GDS, 80 81 /// All address spaces.
82 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER, 83 84 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) 85 }; 86 87 class SIMemOpInfo final { 88 private: 89 90 friend class SIMemOpAccess; 91 92 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 93 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 94 SIAtomicScope Scope = SIAtomicScope::SYSTEM; 95 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 96 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 97 bool IsCrossAddressSpaceOrdering = false; 98 bool IsVolatile = false; 99 bool IsNonTemporal = false; 100 101 SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, 102 SIAtomicScope Scope = SIAtomicScope::SYSTEM, 103 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, 104 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, 105 bool IsCrossAddressSpaceOrdering = true, 106 AtomicOrdering FailureOrdering = 107 AtomicOrdering::SequentiallyConsistent, 108 bool IsVolatile = false, 109 bool IsNonTemporal = false) 110 : Ordering(Ordering), FailureOrdering(FailureOrdering), 111 Scope(Scope), OrderingAddrSpace(OrderingAddrSpace), 112 InstrAddrSpace(InstrAddrSpace), 113 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), 114 IsVolatile(IsVolatile), 115 IsNonTemporal(IsNonTemporal) { 116 117 if (Ordering == AtomicOrdering::NotAtomic) { 118 assert(Scope == SIAtomicScope::NONE && 119 OrderingAddrSpace == SIAtomicAddrSpace::NONE && 120 !IsCrossAddressSpaceOrdering && 121 FailureOrdering == AtomicOrdering::NotAtomic); 122 return; 123 } 124 125 assert(Scope != SIAtomicScope::NONE && 126 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != 127 SIAtomicAddrSpace::NONE && 128 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) != 129 SIAtomicAddrSpace::NONE); 130 131 // There is also no cross address space ordering if the ordering 132 // address space is the same as the instruction address space and 133 // only contains a single address space. 134 if ((OrderingAddrSpace == InstrAddrSpace) && 135 isPowerOf2_32(uint32_t(InstrAddrSpace))) 136 this->IsCrossAddressSpaceOrdering = false; 137 138 // Limit the scope to the maximum supported by the instruction's address 139 // spaces. 140 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) == 141 SIAtomicAddrSpace::NONE) { 142 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD); 143 } else if ((InstrAddrSpace & 144 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) == 145 SIAtomicAddrSpace::NONE) { 146 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP); 147 } else if ((InstrAddrSpace & 148 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS | 149 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) { 150 this->Scope = std::min(Scope, SIAtomicScope::AGENT); 151 } 152 } 153 154 public: 155 /// \returns Atomic synchronization scope of the machine instruction used to 156 /// create this SIMemOpInfo. 157 SIAtomicScope getScope() const { 158 return Scope; 159 } 160 161 /// \returns Ordering constraint of the machine instruction used to 162 /// create this SIMemOpInfo. 163 AtomicOrdering getOrdering() const { 164 return Ordering; 165 } 166 167 /// \returns Failure ordering constraint of the machine instruction used to 168 /// create this SIMemOpInfo. 169 AtomicOrdering getFailureOrdering() const { 170 return FailureOrdering; 171 } 172 173 /// \returns The address spaces accessed by the machine 174 /// instruction used to create this SIMemOpInfo.
175 SIAtomicAddrSpace getInstrAddrSpace() const { 176 return InstrAddrSpace; 177 } 178 179 /// \returns The address spaces that must be ordered by the machine 180 /// instruction used to create this SIMemOpInfo. 181 SIAtomicAddrSpace getOrderingAddrSpace() const { 182 return OrderingAddrSpace; 183 } 184 185 /// \returns True iff memory ordering of operations on 186 /// different address spaces is required. 187 bool getIsCrossAddressSpaceOrdering() const { 188 return IsCrossAddressSpaceOrdering; 189 } 190 191 /// \returns True if memory access of the machine instruction used to 192 /// create this SIMemOpInfo is volatile, false otherwise. 193 bool isVolatile() const { 194 return IsVolatile; 195 } 196 197 /// \returns True if memory access of the machine instruction used to 198 /// create this SIMemOpInfo is nontemporal, false otherwise. 199 bool isNonTemporal() const { 200 return IsNonTemporal; 201 } 202 203 /// \returns True if ordering constraint of the machine instruction used to 204 /// create this SIMemOpInfo is unordered or higher, false otherwise. 205 bool isAtomic() const { 206 return Ordering != AtomicOrdering::NotAtomic; 207 } 208 209 }; 210 211 class SIMemOpAccess final { 212 private: 213 AMDGPUMachineModuleInfo *MMI = nullptr; 214 215 /// Reports unsupported message \p Msg for \p MI to LLVM context. 216 void reportUnsupported(const MachineBasicBlock::iterator &MI, 217 const char *Msg) const; 218 219 /// Inspects the target synchronization scope \p SSID and determines 220 /// the SI atomic scope it corresponds to, the address spaces it 221 /// covers, and whether the memory ordering applies between address 222 /// spaces. 223 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 224 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const; 225 226 /// \returns A bit set of the address spaces accessed by \p AS. 227 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; 228 229 /// \returns Info constructed from \p MI, which has at least one machine memory 230 /// operand. 231 Optional<SIMemOpInfo> constructFromMIWithMMO( 232 const MachineBasicBlock::iterator &MI) const; 233 234 public: 235 /// Construct class to support accessing the machine memory operands 236 /// of instructions in the machine function \p MF. 237 SIMemOpAccess(MachineFunction &MF); 238 239 /// \returns Load info if \p MI is a load operation, "None" otherwise. 240 Optional<SIMemOpInfo> getLoadInfo( 241 const MachineBasicBlock::iterator &MI) const; 242 243 /// \returns Store info if \p MI is a store operation, "None" otherwise. 244 Optional<SIMemOpInfo> getStoreInfo( 245 const MachineBasicBlock::iterator &MI) const; 246 247 /// \returns Atomic fence info if \p MI is an atomic fence operation, 248 /// "None" otherwise. 249 Optional<SIMemOpInfo> getAtomicFenceInfo( 250 const MachineBasicBlock::iterator &MI) const; 251 252 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or 253 /// rmw operation, "None" otherwise. 254 Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo( 255 const MachineBasicBlock::iterator &MI) const; 256 }; 257 258 class SICacheControl { 259 protected: 260 261 /// AMDGPU subtarget info. 262 const GCNSubtarget &ST; 263 264 /// Instruction info. 265 const SIInstrInfo *TII = nullptr; 266 267 IsaVersion IV; 268 269 /// Whether to insert cache invalidating instructions. 270 bool InsertCacheInv; 271 272 SICacheControl(const GCNSubtarget &ST); 273 274 /// Sets named bit \p Bit to "true" if present in instruction \p MI.
275 /// \returns Returns true if \p MI is modified, false otherwise. 276 bool enableNamedBit(const MachineBasicBlock::iterator MI, 277 AMDGPU::CPol::CPol Bit) const; 278 279 public: 280 281 /// Create a cache control for the subtarget \p ST. 282 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); 283 284 /// Update \p MI memory load instruction to bypass any caches up to 285 /// the \p Scope memory scope for address spaces \p 286 /// AddrSpace. Return true iff the instruction was modified. 287 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 288 SIAtomicScope Scope, 289 SIAtomicAddrSpace AddrSpace) const = 0; 290 291 /// Update \p MI memory store instruction to bypass any caches up to 292 /// the \p Scope memory scope for address spaces \p 293 /// AddrSpace. Return true iff the instruction was modified. 294 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 295 SIAtomicScope Scope, 296 SIAtomicAddrSpace AddrSpace) const = 0; 297 298 /// Update \p MI memory read-modify-write instruction to bypass any caches up 299 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true 300 /// iff the instruction was modified. 301 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 302 SIAtomicScope Scope, 303 SIAtomicAddrSpace AddrSpace) const = 0; 304 305 /// Update \p MI memory instruction of kind \p Op associated with address 306 /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return 307 /// true iff the instruction was modified. 308 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 309 SIAtomicAddrSpace AddrSpace, 310 SIMemOp Op, bool IsVolatile, 311 bool IsNonTemporal) const = 0; 312 313 /// Inserts any necessary instructions at position \p Pos relative 314 /// to instruction \p MI to ensure memory instructions before \p Pos of kind 315 /// \p Op associated with address spaces \p AddrSpace have completed. Used 316 /// between memory instructions to enforce the order they become visible as 317 /// observed by other memory instructions executing in memory scope \p Scope. 318 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between 319 /// address spaces. Returns true iff any instructions inserted. 320 virtual bool insertWait(MachineBasicBlock::iterator &MI, 321 SIAtomicScope Scope, 322 SIAtomicAddrSpace AddrSpace, 323 SIMemOp Op, 324 bool IsCrossAddrSpaceOrdering, 325 Position Pos) const = 0; 326 327 /// Inserts any necessary instructions at position \p Pos relative to 328 /// instruction \p MI to ensure any subsequent memory instructions of this 329 /// thread with address spaces \p AddrSpace will observe the previous memory 330 /// operations by any thread for memory scopes up to memory scope \p Scope . 331 /// Returns true iff any instructions inserted. 332 virtual bool insertAcquire(MachineBasicBlock::iterator &MI, 333 SIAtomicScope Scope, 334 SIAtomicAddrSpace AddrSpace, 335 Position Pos) const = 0; 336 337 /// Inserts any necessary instructions at position \p Pos relative to 338 /// instruction \p MI to ensure previous memory instructions by this thread 339 /// with address spaces \p AddrSpace have completed and can be observed by 340 /// subsequent memory instructions by any thread executing in memory scope \p 341 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is 342 /// between address spaces. Returns true iff any instructions inserted. 
343 virtual bool insertRelease(MachineBasicBlock::iterator &MI, 344 SIAtomicScope Scope, 345 SIAtomicAddrSpace AddrSpace, 346 bool IsCrossAddrSpaceOrdering, 347 Position Pos) const = 0; 348 349 /// Virtual destructor to allow derivations to be deleted. 350 virtual ~SICacheControl() = default; 351 352 }; 353 354 class SIGfx6CacheControl : public SICacheControl { 355 protected: 356 357 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI 358 /// is modified, false otherwise. 359 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { 360 return enableNamedBit(MI, AMDGPU::CPol::GLC); 361 } 362 363 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI 364 /// is modified, false otherwise. 365 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { 366 return enableNamedBit(MI, AMDGPU::CPol::SLC); 367 } 368 369 public: 370 371 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} 372 373 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 374 SIAtomicScope Scope, 375 SIAtomicAddrSpace AddrSpace) const override; 376 377 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 378 SIAtomicScope Scope, 379 SIAtomicAddrSpace AddrSpace) const override; 380 381 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 382 SIAtomicScope Scope, 383 SIAtomicAddrSpace AddrSpace) const override; 384 385 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 386 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 387 bool IsVolatile, 388 bool IsNonTemporal) const override; 389 390 bool insertWait(MachineBasicBlock::iterator &MI, 391 SIAtomicScope Scope, 392 SIAtomicAddrSpace AddrSpace, 393 SIMemOp Op, 394 bool IsCrossAddrSpaceOrdering, 395 Position Pos) const override; 396 397 bool insertAcquire(MachineBasicBlock::iterator &MI, 398 SIAtomicScope Scope, 399 SIAtomicAddrSpace AddrSpace, 400 Position Pos) const override; 401 402 bool insertRelease(MachineBasicBlock::iterator &MI, 403 SIAtomicScope Scope, 404 SIAtomicAddrSpace AddrSpace, 405 bool IsCrossAddrSpaceOrdering, 406 Position Pos) const override; 407 }; 408 409 class SIGfx7CacheControl : public SIGfx6CacheControl { 410 public: 411 412 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} 413 414 bool insertAcquire(MachineBasicBlock::iterator &MI, 415 SIAtomicScope Scope, 416 SIAtomicAddrSpace AddrSpace, 417 Position Pos) const override; 418 419 }; 420 421 class SIGfx90ACacheControl : public SIGfx7CacheControl { 422 public: 423 424 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} 425 426 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 427 SIAtomicScope Scope, 428 SIAtomicAddrSpace AddrSpace) const override; 429 430 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 431 SIAtomicScope Scope, 432 SIAtomicAddrSpace AddrSpace) const override; 433 434 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 435 SIAtomicScope Scope, 436 SIAtomicAddrSpace AddrSpace) const override; 437 438 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 439 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 440 bool IsVolatile, 441 bool IsNonTemporal) const override; 442 443 bool insertWait(MachineBasicBlock::iterator &MI, 444 SIAtomicScope Scope, 445 SIAtomicAddrSpace AddrSpace, 446 SIMemOp Op, 447 bool IsCrossAddrSpaceOrdering, 448 Position Pos) const override; 449 450 bool insertAcquire(MachineBasicBlock::iterator &MI, 451 SIAtomicScope Scope, 452 SIAtomicAddrSpace AddrSpace, 453 
Position Pos) const override; 454 455 bool insertRelease(MachineBasicBlock::iterator &MI, 456 SIAtomicScope Scope, 457 SIAtomicAddrSpace AddrSpace, 458 bool IsCrossAddrSpaceOrdering, 459 Position Pos) const override; 460 }; 461 462 class SIGfx10CacheControl : public SIGfx7CacheControl { 463 protected: 464 465 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI 466 /// is modified, false otherwise. 467 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { 468 return enableNamedBit(MI, AMDGPU::CPol::DLC); 469 } 470 471 public: 472 473 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} 474 475 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 476 SIAtomicScope Scope, 477 SIAtomicAddrSpace AddrSpace) const override; 478 479 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 480 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 481 bool IsVolatile, 482 bool IsNonTemporal) const override; 483 484 bool insertWait(MachineBasicBlock::iterator &MI, 485 SIAtomicScope Scope, 486 SIAtomicAddrSpace AddrSpace, 487 SIMemOp Op, 488 bool IsCrossAddrSpaceOrdering, 489 Position Pos) const override; 490 491 bool insertAcquire(MachineBasicBlock::iterator &MI, 492 SIAtomicScope Scope, 493 SIAtomicAddrSpace AddrSpace, 494 Position Pos) const override; 495 }; 496 497 class SIMemoryLegalizer final : public MachineFunctionPass { 498 private: 499 500 /// Cache Control. 501 std::unique_ptr<SICacheControl> CC = nullptr; 502 503 /// List of atomic pseudo instructions. 504 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; 505 506 /// Return true iff instruction \p MI is a atomic instruction that 507 /// returns a result. 508 bool isAtomicRet(const MachineInstr &MI) const { 509 return SIInstrInfo::isAtomicRet(MI); 510 } 511 512 /// Removes all processed atomic pseudo instructions from the current 513 /// function. Returns true if current function is modified, false otherwise. 514 bool removeAtomicPseudoMIs(); 515 516 /// Expands load operation \p MI. Returns true if instructions are 517 /// added/deleted or \p MI is modified, false otherwise. 518 bool expandLoad(const SIMemOpInfo &MOI, 519 MachineBasicBlock::iterator &MI); 520 /// Expands store operation \p MI. Returns true if instructions are 521 /// added/deleted or \p MI is modified, false otherwise. 522 bool expandStore(const SIMemOpInfo &MOI, 523 MachineBasicBlock::iterator &MI); 524 /// Expands atomic fence operation \p MI. Returns true if 525 /// instructions are added/deleted or \p MI is modified, false otherwise. 526 bool expandAtomicFence(const SIMemOpInfo &MOI, 527 MachineBasicBlock::iterator &MI); 528 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if 529 /// instructions are added/deleted or \p MI is modified, false otherwise. 
530 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 531 MachineBasicBlock::iterator &MI); 532 533 public: 534 static char ID; 535 536 SIMemoryLegalizer() : MachineFunctionPass(ID) {} 537 538 void getAnalysisUsage(AnalysisUsage &AU) const override { 539 AU.setPreservesCFG(); 540 MachineFunctionPass::getAnalysisUsage(AU); 541 } 542 543 StringRef getPassName() const override { 544 return PASS_NAME; 545 } 546 547 bool runOnMachineFunction(MachineFunction &MF) override; 548 }; 549 550 } // end namespace anonymous 551 552 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, 553 const char *Msg) const { 554 const Function &Func = MI->getParent()->getParent()->getFunction(); 555 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc()); 556 Func.getContext().diagnose(Diag); 557 } 558 559 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 560 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, 561 SIAtomicAddrSpace InstrAddrSpace) const { 562 if (SSID == SyncScope::System) 563 return std::make_tuple(SIAtomicScope::SYSTEM, 564 SIAtomicAddrSpace::ATOMIC, 565 true); 566 if (SSID == MMI->getAgentSSID()) 567 return std::make_tuple(SIAtomicScope::AGENT, 568 SIAtomicAddrSpace::ATOMIC, 569 true); 570 if (SSID == MMI->getWorkgroupSSID()) 571 return std::make_tuple(SIAtomicScope::WORKGROUP, 572 SIAtomicAddrSpace::ATOMIC, 573 true); 574 if (SSID == MMI->getWavefrontSSID()) 575 return std::make_tuple(SIAtomicScope::WAVEFRONT, 576 SIAtomicAddrSpace::ATOMIC, 577 true); 578 if (SSID == SyncScope::SingleThread) 579 return std::make_tuple(SIAtomicScope::SINGLETHREAD, 580 SIAtomicAddrSpace::ATOMIC, 581 true); 582 if (SSID == MMI->getSystemOneAddressSpaceSSID()) 583 return std::make_tuple(SIAtomicScope::SYSTEM, 584 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, 585 false); 586 if (SSID == MMI->getAgentOneAddressSpaceSSID()) 587 return std::make_tuple(SIAtomicScope::AGENT, 588 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, 589 false); 590 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) 591 return std::make_tuple(SIAtomicScope::WORKGROUP, 592 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, 593 false); 594 if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) 595 return std::make_tuple(SIAtomicScope::WAVEFRONT, 596 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, 597 false); 598 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) 599 return std::make_tuple(SIAtomicScope::SINGLETHREAD, 600 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, 601 false); 602 return None; 603 } 604 605 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { 606 if (AS == AMDGPUAS::FLAT_ADDRESS) 607 return SIAtomicAddrSpace::FLAT; 608 if (AS == AMDGPUAS::GLOBAL_ADDRESS) 609 return SIAtomicAddrSpace::GLOBAL; 610 if (AS == AMDGPUAS::LOCAL_ADDRESS) 611 return SIAtomicAddrSpace::LDS; 612 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 613 return SIAtomicAddrSpace::SCRATCH; 614 if (AS == AMDGPUAS::REGION_ADDRESS) 615 return SIAtomicAddrSpace::GDS; 616 617 return SIAtomicAddrSpace::OTHER; 618 } 619 620 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) { 621 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); 622 } 623 624 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( 625 const MachineBasicBlock::iterator &MI) const { 626 assert(MI->getNumMemOperands() > 0); 627 628 SyncScope::ID SSID = SyncScope::SingleThread; 629 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 630 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 631 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 
632 bool IsNonTemporal = true; 633 bool IsVolatile = false; 634 635 // Validator should check whether or not MMOs cover the entire set of 636 // locations accessed by the memory instruction. 637 for (const auto &MMO : MI->memoperands()) { 638 IsNonTemporal &= MMO->isNonTemporal(); 639 IsVolatile |= MMO->isVolatile(); 640 InstrAddrSpace |= 641 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); 642 AtomicOrdering OpOrdering = MMO->getSuccessOrdering(); 643 if (OpOrdering != AtomicOrdering::NotAtomic) { 644 const auto &IsSyncScopeInclusion = 645 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); 646 if (!IsSyncScopeInclusion) { 647 reportUnsupported(MI, 648 "Unsupported non-inclusive atomic synchronization scope"); 649 return None; 650 } 651 652 SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID(); 653 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering); 654 assert(MMO->getFailureOrdering() != AtomicOrdering::Release && 655 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); 656 FailureOrdering = 657 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering()); 658 } 659 } 660 661 SIAtomicScope Scope = SIAtomicScope::NONE; 662 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 663 bool IsCrossAddressSpaceOrdering = false; 664 if (Ordering != AtomicOrdering::NotAtomic) { 665 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); 666 if (!ScopeOrNone) { 667 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 668 return None; 669 } 670 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 671 ScopeOrNone.getValue(); 672 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 673 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || 674 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { 675 reportUnsupported(MI, "Unsupported atomic address space"); 676 return None; 677 } 678 } 679 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, 680 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, 681 IsNonTemporal); 682 } 683 684 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo( 685 const MachineBasicBlock::iterator &MI) const { 686 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 687 688 if (!(MI->mayLoad() && !MI->mayStore())) 689 return None; 690 691 // Be conservative if there are no memory operands. 692 if (MI->getNumMemOperands() == 0) 693 return SIMemOpInfo(); 694 695 return constructFromMIWithMMO(MI); 696 } 697 698 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo( 699 const MachineBasicBlock::iterator &MI) const { 700 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 701 702 if (!(!MI->mayLoad() && MI->mayStore())) 703 return None; 704 705 // Be conservative if there are no memory operands. 
706 if (MI->getNumMemOperands() == 0) 707 return SIMemOpInfo(); 708 709 return constructFromMIWithMMO(MI); 710 } 711 712 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo( 713 const MachineBasicBlock::iterator &MI) const { 714 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 715 716 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) 717 return None; 718 719 AtomicOrdering Ordering = 720 static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); 721 722 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); 723 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC); 724 if (!ScopeOrNone) { 725 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 726 return None; 727 } 728 729 SIAtomicScope Scope = SIAtomicScope::NONE; 730 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 731 bool IsCrossAddressSpaceOrdering = false; 732 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 733 ScopeOrNone.getValue(); 734 735 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 736 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { 737 reportUnsupported(MI, "Unsupported atomic address space"); 738 return None; 739 } 740 741 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, 742 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); 743 } 744 745 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( 746 const MachineBasicBlock::iterator &MI) const { 747 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 748 749 if (!(MI->mayLoad() && MI->mayStore())) 750 return None; 751 752 // Be conservative if there are no memory operands. 753 if (MI->getNumMemOperands() == 0) 754 return SIMemOpInfo(); 755 756 return constructFromMIWithMMO(MI); 757 } 758 759 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { 760 TII = ST.getInstrInfo(); 761 IV = getIsaVersion(ST.getCPU()); 762 InsertCacheInv = !AmdgcnSkipCacheInvalidations; 763 } 764 765 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, 766 AMDGPU::CPol::CPol Bit) const { 767 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); 768 if (!CPol) 769 return false; 770 771 CPol->setImm(CPol->getImm() | Bit); 772 return true; 773 } 774 775 /* static */ 776 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { 777 GCNSubtarget::Generation Generation = ST.getGeneration(); 778 if (ST.hasGFX90AInsts()) 779 return std::make_unique<SIGfx90ACacheControl>(ST); 780 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) 781 return std::make_unique<SIGfx6CacheControl>(ST); 782 if (Generation < AMDGPUSubtarget::GFX10) 783 return std::make_unique<SIGfx7CacheControl>(ST); 784 return std::make_unique<SIGfx10CacheControl>(ST); 785 } 786 787 bool SIGfx6CacheControl::enableLoadCacheBypass( 788 const MachineBasicBlock::iterator &MI, 789 SIAtomicScope Scope, 790 SIAtomicAddrSpace AddrSpace) const { 791 assert(MI->mayLoad() && !MI->mayStore()); 792 bool Changed = false; 793 794 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 795 switch (Scope) { 796 case SIAtomicScope::SYSTEM: 797 case SIAtomicScope::AGENT: 798 // Set L1 cache policy to MISS_EVICT. 799 // Note: there is no L2 cache bypass policy at the ISA level. 800 Changed |= enableGLCBit(MI); 801 break; 802 case SIAtomicScope::WORKGROUP: 803 case SIAtomicScope::WAVEFRONT: 804 case SIAtomicScope::SINGLETHREAD: 805 // No cache to bypass. 
806 break; 807 default: 808 llvm_unreachable("Unsupported synchronization scope"); 809 } 810 } 811 812 /// The scratch address space does not need the global memory caches 813 /// to be bypassed as all memory operations by the same thread are 814 /// sequentially consistent, and no other thread can access scratch 815 /// memory. 816 817 /// Other address spaces do not have a cache. 818 819 return Changed; 820 } 821 822 bool SIGfx6CacheControl::enableStoreCacheBypass( 823 const MachineBasicBlock::iterator &MI, 824 SIAtomicScope Scope, 825 SIAtomicAddrSpace AddrSpace) const { 826 assert(!MI->mayLoad() && MI->mayStore()); 827 bool Changed = false; 828 829 /// The L1 cache is write through, so it does not need to be bypassed. There is no 830 /// bypass control for the L2 cache at the ISA level. 831 832 return Changed; 833 } 834 835 bool SIGfx6CacheControl::enableRMWCacheBypass( 836 const MachineBasicBlock::iterator &MI, 837 SIAtomicScope Scope, 838 SIAtomicAddrSpace AddrSpace) const { 839 assert(MI->mayLoad() && MI->mayStore()); 840 bool Changed = false; 841 842 /// Do not set GLC for RMW atomic operations as the L0/L1 cache is automatically 843 /// bypassed, and the GLC bit is instead used to indicate if they are 844 /// return or no-return. 845 /// Note: there is no L2 cache coherent bypass control at the ISA level. 846 847 return Changed; 848 } 849 850 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( 851 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 852 bool IsVolatile, bool IsNonTemporal) const { 853 // Only handle load and store, not atomic read-modify-write instructions. The 854 // latter use glc to indicate if the atomic returns a result and so must not 855 // be used for cache control. 856 assert(MI->mayLoad() ^ MI->mayStore()); 857 858 // Only update load and store, not LLVM IR atomic read-modify-write 859 // instructions. The latter are always marked as volatile, so they cannot sensibly 860 // be handled here without pessimizing all atomics. Also they do not support 861 // the nontemporal attribute. 862 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 863 864 bool Changed = false; 865 866 if (IsVolatile) { 867 // Set L1 cache policy to be MISS_EVICT for load instructions 868 // and MISS_LRU for store instructions. 869 // Note: there is no L2 cache bypass policy at the ISA level. 870 if (Op == SIMemOp::LOAD) 871 Changed |= enableGLCBit(MI); 872 873 // Ensure operation has completed at system scope to cause all volatile 874 // operations to be visible outside the program in a global order. Do not 875 // request cross address space as only the global address space can be 876 // observable outside the program, so no need to cause a waitcnt for LDS 877 // address space operations. 878 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 879 Position::AFTER); 880 881 return Changed; 882 } 883 884 if (IsNonTemporal) { 885 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT 886 // for both loads and stores, and the L2 cache policy to STREAM.
887 Changed |= enableGLCBit(MI); 888 Changed |= enableSLCBit(MI); 889 return Changed; 890 } 891 892 return Changed; 893 } 894 895 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, 896 SIAtomicScope Scope, 897 SIAtomicAddrSpace AddrSpace, 898 SIMemOp Op, 899 bool IsCrossAddrSpaceOrdering, 900 Position Pos) const { 901 bool Changed = false; 902 903 MachineBasicBlock &MBB = *MI->getParent(); 904 DebugLoc DL = MI->getDebugLoc(); 905 906 if (Pos == Position::AFTER) 907 ++MI; 908 909 bool VMCnt = false; 910 bool LGKMCnt = false; 911 912 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 913 SIAtomicAddrSpace::NONE) { 914 switch (Scope) { 915 case SIAtomicScope::SYSTEM: 916 case SIAtomicScope::AGENT: 917 VMCnt |= true; 918 break; 919 case SIAtomicScope::WORKGROUP: 920 case SIAtomicScope::WAVEFRONT: 921 case SIAtomicScope::SINGLETHREAD: 922 // The L1 cache keeps all memory operations in order for 923 // wavefronts in the same work-group. 924 break; 925 default: 926 llvm_unreachable("Unsupported synchronization scope"); 927 } 928 } 929 930 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 931 switch (Scope) { 932 case SIAtomicScope::SYSTEM: 933 case SIAtomicScope::AGENT: 934 case SIAtomicScope::WORKGROUP: 935 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 936 // not needed as LDS operations for all waves are executed in a total 937 // global ordering as observed by all waves. Required if also 938 // synchronizing with global/GDS memory as LDS operations could be 939 // reordered with respect to later global/GDS memory operations of the 940 // same wave. 941 LGKMCnt |= IsCrossAddrSpaceOrdering; 942 break; 943 case SIAtomicScope::WAVEFRONT: 944 case SIAtomicScope::SINGLETHREAD: 945 // The LDS keeps all memory operations in order for 946 // the same wavefront. 947 break; 948 default: 949 llvm_unreachable("Unsupported synchronization scope"); 950 } 951 } 952 953 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 954 switch (Scope) { 955 case SIAtomicScope::SYSTEM: 956 case SIAtomicScope::AGENT: 957 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)" 958 // is not needed as GDS operations for all waves are executed in a total 959 // global ordering as observed by all waves. Required if also 960 // synchronizing with global/LDS memory as GDS operations could be 961 // reordered with respect to later global/LDS memory operations of the 962 // same wave. 963 LGKMCnt |= IsCrossAddrSpaceOrdering; 964 break; 965 case SIAtomicScope::WORKGROUP: 966 case SIAtomicScope::WAVEFRONT: 967 case SIAtomicScope::SINGLETHREAD: 968 // The GDS keeps all memory operations in order for 969 // the same work-group. 970 break; 971 default: 972 llvm_unreachable("Unsupported synchronization scope"); 973 } 974 } 975 976 if (VMCnt || LGKMCnt) { 977 unsigned WaitCntImmediate = 978 AMDGPU::encodeWaitcnt(IV, 979 VMCnt ? 0 : getVmcntBitMask(IV), 980 getExpcntBitMask(IV), 981 LGKMCnt ?
0 : getLgkmcntBitMask(IV)); 982 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); 983 Changed = true; 984 } 985 986 if (Pos == Position::AFTER) 987 --MI; 988 989 return Changed; 990 } 991 992 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 993 SIAtomicScope Scope, 994 SIAtomicAddrSpace AddrSpace, 995 Position Pos) const { 996 if (!InsertCacheInv) 997 return false; 998 999 bool Changed = false; 1000 1001 MachineBasicBlock &MBB = *MI->getParent(); 1002 DebugLoc DL = MI->getDebugLoc(); 1003 1004 if (Pos == Position::AFTER) 1005 ++MI; 1006 1007 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1008 switch (Scope) { 1009 case SIAtomicScope::SYSTEM: 1010 case SIAtomicScope::AGENT: 1011 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); 1012 Changed = true; 1013 break; 1014 case SIAtomicScope::WORKGROUP: 1015 case SIAtomicScope::WAVEFRONT: 1016 case SIAtomicScope::SINGLETHREAD: 1017 // No cache to invalidate. 1018 break; 1019 default: 1020 llvm_unreachable("Unsupported synchronization scope"); 1021 } 1022 } 1023 1024 /// The scratch address space does not need the global memory cache 1025 /// to be flushed as all memory operations by the same thread are 1026 /// sequentially consistent, and no other thread can access scratch 1027 /// memory. 1028 1029 /// Other address spaces do not have a cache. 1030 1031 if (Pos == Position::AFTER) 1032 --MI; 1033 1034 return Changed; 1035 } 1036 1037 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1038 SIAtomicScope Scope, 1039 SIAtomicAddrSpace AddrSpace, 1040 bool IsCrossAddrSpaceOrdering, 1041 Position Pos) const { 1042 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1043 IsCrossAddrSpaceOrdering, Pos); 1044 } 1045 1046 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1047 SIAtomicScope Scope, 1048 SIAtomicAddrSpace AddrSpace, 1049 Position Pos) const { 1050 if (!InsertCacheInv) 1051 return false; 1052 1053 bool Changed = false; 1054 1055 MachineBasicBlock &MBB = *MI->getParent(); 1056 DebugLoc DL = MI->getDebugLoc(); 1057 1058 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); 1059 1060 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() 1061 ? AMDGPU::BUFFER_WBINVL1 1062 : AMDGPU::BUFFER_WBINVL1_VOL; 1063 1064 if (Pos == Position::AFTER) 1065 ++MI; 1066 1067 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1068 switch (Scope) { 1069 case SIAtomicScope::SYSTEM: 1070 case SIAtomicScope::AGENT: 1071 BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); 1072 Changed = true; 1073 break; 1074 case SIAtomicScope::WORKGROUP: 1075 case SIAtomicScope::WAVEFRONT: 1076 case SIAtomicScope::SINGLETHREAD: 1077 // No cache to invalidate. 1078 break; 1079 default: 1080 llvm_unreachable("Unsupported synchronization scope"); 1081 } 1082 } 1083 1084 /// The scratch address space does not need the global memory cache 1085 /// to be flushed as all memory operations by the same thread are 1086 /// sequentially consistent, and no other thread can access scratch 1087 /// memory. 1088 1089 /// Other address spaces do not have a cache. 
1090 1091 if (Pos == Position::AFTER) 1092 --MI; 1093 1094 return Changed; 1095 } 1096 1097 bool SIGfx90ACacheControl::enableLoadCacheBypass( 1098 const MachineBasicBlock::iterator &MI, 1099 SIAtomicScope Scope, 1100 SIAtomicAddrSpace AddrSpace) const { 1101 assert(MI->mayLoad() && !MI->mayStore()); 1102 bool Changed = false; 1103 1104 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1105 switch (Scope) { 1106 case SIAtomicScope::SYSTEM: 1107 case SIAtomicScope::AGENT: 1108 // Set the L1 cache policy to MISS_LRU. 1109 // Note: there is no L2 cache bypass policy at the ISA level. 1110 Changed |= enableGLCBit(MI); 1111 break; 1112 case SIAtomicScope::WORKGROUP: 1113 // In threadgroup split mode the waves of a work-group can be executing on 1114 // different CUs. Therefore need to bypass the L1 which is per CU. 1115 // Otherwise in non-threadgroup split mode all waves of a work-group are 1116 // on the same CU, and so the L1 does not need to be bypassed. 1117 if (ST.isTgSplitEnabled()) 1118 Changed |= enableGLCBit(MI); 1119 break; 1120 case SIAtomicScope::WAVEFRONT: 1121 case SIAtomicScope::SINGLETHREAD: 1122 // No cache to bypass. 1123 break; 1124 default: 1125 llvm_unreachable("Unsupported synchronization scope"); 1126 } 1127 } 1128 1129 /// The scratch address space does not need the global memory caches 1130 /// to be bypassed as all memory operations by the same thread are 1131 /// sequentially consistent, and no other thread can access scratch 1132 /// memory. 1133 1134 /// Other address spaces do not have a cache. 1135 1136 return Changed; 1137 } 1138 1139 bool SIGfx90ACacheControl::enableStoreCacheBypass( 1140 const MachineBasicBlock::iterator &MI, 1141 SIAtomicScope Scope, 1142 SIAtomicAddrSpace AddrSpace) const { 1143 assert(!MI->mayLoad() && MI->mayStore()); 1144 bool Changed = false; 1145 1146 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1147 switch (Scope) { 1148 case SIAtomicScope::SYSTEM: 1149 case SIAtomicScope::AGENT: 1150 /// Do not set glc for store atomic operations as they implicitly write 1151 /// through the L1 cache. 1152 break; 1153 case SIAtomicScope::WORKGROUP: 1154 case SIAtomicScope::WAVEFRONT: 1155 case SIAtomicScope::SINGLETHREAD: 1156 // No cache to bypass. Store atomics implicitly write through the L1 1157 // cache. 1158 break; 1159 default: 1160 llvm_unreachable("Unsupported synchronization scope"); 1161 } 1162 } 1163 1164 /// The scratch address space does not need the global memory caches 1165 /// to be bypassed as all memory operations by the same thread are 1166 /// sequentially consistent, and no other thread can access scratch 1167 /// memory. 1168 1169 /// Other address spaces do not have a cache. 1170 1171 return Changed; 1172 } 1173 1174 bool SIGfx90ACacheControl::enableRMWCacheBypass( 1175 const MachineBasicBlock::iterator &MI, 1176 SIAtomicScope Scope, 1177 SIAtomicAddrSpace AddrSpace) const { 1178 assert(MI->mayLoad() && MI->mayStore()); 1179 bool Changed = false; 1180 1181 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1182 switch (Scope) { 1183 case SIAtomicScope::SYSTEM: 1184 case SIAtomicScope::AGENT: 1185 /// Do not set glc for RMW atomic operations as they implicitly bypass 1186 /// the L1 cache, and the glc bit is instead used to indicate if they are 1187 /// return or no-return. 1188 break; 1189 case SIAtomicScope::WORKGROUP: 1190 case SIAtomicScope::WAVEFRONT: 1191 case SIAtomicScope::SINGLETHREAD: 1192 // No cache to bypass. 
RMW atomics implicitly bypass the L1 cache. 1193 break; 1194 default: 1195 llvm_unreachable("Unsupported synchronization scope"); 1196 } 1197 } 1198 1199 return Changed; 1200 } 1201 1202 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( 1203 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 1204 bool IsVolatile, bool IsNonTemporal) const { 1205 // Only handle load and store, not atomic read-modify-write instructions. The 1206 // latter use glc to indicate if the atomic returns a result and so must not 1207 // be used for cache control. 1208 assert(MI->mayLoad() ^ MI->mayStore()); 1209 1210 // Only update load and store, not LLVM IR atomic read-modify-write 1211 // instructions. The latter are always marked as volatile, so they cannot sensibly 1212 // be handled here without pessimizing all atomics. Also they do not support 1213 // the nontemporal attribute. 1214 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 1215 1216 bool Changed = false; 1217 1218 if (IsVolatile) { 1219 // Set L1 cache policy to be MISS_EVICT for load instructions 1220 // and MISS_LRU for store instructions. 1221 // Note: there is no L2 cache bypass policy at the ISA level. 1222 if (Op == SIMemOp::LOAD) 1223 Changed |= enableGLCBit(MI); 1224 1225 // Ensure operation has completed at system scope to cause all volatile 1226 // operations to be visible outside the program in a global order. Do not 1227 // request cross address space as only the global address space can be 1228 // observable outside the program, so no need to cause a waitcnt for LDS 1229 // address space operations. 1230 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 1231 Position::AFTER); 1232 1233 return Changed; 1234 } 1235 1236 if (IsNonTemporal) { 1237 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT 1238 // for both loads and stores, and the L2 cache policy to STREAM. 1239 Changed |= enableGLCBit(MI); 1240 Changed |= enableSLCBit(MI); 1241 return Changed; 1242 } 1243 1244 return Changed; 1245 } 1246 1247 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, 1248 SIAtomicScope Scope, 1249 SIAtomicAddrSpace AddrSpace, 1250 SIMemOp Op, 1251 bool IsCrossAddrSpaceOrdering, 1252 Position Pos) const { 1253 if (ST.isTgSplitEnabled()) { 1254 // In threadgroup split mode the waves of a work-group can be executing on 1255 // different CUs. Therefore need to wait for global or GDS memory operations 1256 // to complete to ensure they are visible to waves in the other CUs. 1257 // Otherwise in non-threadgroup split mode all waves of a work-group are on 1258 // the same CU, so no need to wait for global memory as all waves in the 1259 // work-group access the same L1, nor wait for GDS as accesses are ordered 1260 // on a CU. 1261 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | 1262 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && 1263 (Scope == SIAtomicScope::WORKGROUP)) { 1264 // Same as GFX7 using agent scope. 1265 Scope = SIAtomicScope::AGENT; 1266 } 1267 // In threadgroup split mode LDS cannot be allocated so no need to wait for 1268 // LDS memory operations.
1269 AddrSpace &= ~SIAtomicAddrSpace::LDS; 1270 } 1271 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, 1272 IsCrossAddrSpaceOrdering, Pos); 1273 } 1274 1275 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1276 SIAtomicScope Scope, 1277 SIAtomicAddrSpace AddrSpace, 1278 Position Pos) const { 1279 if (!InsertCacheInv) 1280 return false; 1281 1282 bool Changed = false; 1283 1284 MachineBasicBlock &MBB = *MI->getParent(); 1285 DebugLoc DL = MI->getDebugLoc(); 1286 1287 if (Pos == Position::AFTER) 1288 ++MI; 1289 1290 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1291 switch (Scope) { 1292 case SIAtomicScope::SYSTEM: 1293 // Ensures that following loads will not see stale remote VMEM data or 1294 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and 1295 // CC will never be stale due to the local memory probes. 1296 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); 1297 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the 1298 // hardware does not reorder memory operations by the same wave with 1299 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to 1300 // remove any cache lines of earlier writes by the same wave and ensures 1301 // later reads by the same wave will refetch the cache lines. 1302 Changed = true; 1303 break; 1304 case SIAtomicScope::AGENT: 1305 // Same as GFX7. 1306 break; 1307 case SIAtomicScope::WORKGROUP: 1308 // In threadgroup split mode the waves of a work-group can be executing on 1309 // different CUs. Therefore need to invalidate the L1 which is per CU. 1310 // Otherwise in non-threadgroup split mode all waves of a work-group are 1311 // on the same CU, and so the L1 does not need to be invalidated. 1312 if (ST.isTgSplitEnabled()) { 1313 // Same as GFX7 using agent scope. 1314 Scope = SIAtomicScope::AGENT; 1315 } 1316 break; 1317 case SIAtomicScope::WAVEFRONT: 1318 case SIAtomicScope::SINGLETHREAD: 1319 // Same as GFX7. 1320 break; 1321 default: 1322 llvm_unreachable("Unsupported synchronization scope"); 1323 } 1324 } 1325 1326 /// The scratch address space does not need the global memory cache 1327 /// to be flushed as all memory operations by the same thread are 1328 /// sequentially consistent, and no other thread can access scratch 1329 /// memory. 1330 1331 /// Other address spaces do not have a cache. 1332 1333 if (Pos == Position::AFTER) 1334 --MI; 1335 1336 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); 1337 1338 return Changed; 1339 } 1340 1341 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1342 SIAtomicScope Scope, 1343 SIAtomicAddrSpace AddrSpace, 1344 bool IsCrossAddrSpaceOrdering, 1345 Position Pos) const { 1346 bool Changed = false; 1347 1348 MachineBasicBlock &MBB = *MI->getParent(); 1349 DebugLoc DL = MI->getDebugLoc(); 1350 1351 if (Pos == Position::AFTER) 1352 ++MI; 1353 1354 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1355 switch (Scope) { 1356 case SIAtomicScope::SYSTEM: 1357 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1358 // hardware does not reorder memory operations by the same wave with 1359 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1360 // to initiate writeback of any dirty cache lines of earlier writes by the 1361 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1362 // writeback has completed. 
1363 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)); 1364 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT 1365 // vmcnt(0)" needed by the "BUFFER_WBL2". 1366 Changed = true; 1367 break; 1368 case SIAtomicScope::AGENT: 1369 case SIAtomicScope::WORKGROUP: 1370 case SIAtomicScope::WAVEFRONT: 1371 case SIAtomicScope::SINGLETHREAD: 1372 // Same as GFX7. 1373 break; 1374 default: 1375 llvm_unreachable("Unsupported synchronization scope"); 1376 } 1377 } 1378 1379 if (Pos == Position::AFTER) 1380 --MI; 1381 1382 Changed |= 1383 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, 1384 IsCrossAddrSpaceOrdering, Pos); 1385 1386 return Changed; 1387 } 1388 1389 bool SIGfx10CacheControl::enableLoadCacheBypass( 1390 const MachineBasicBlock::iterator &MI, 1391 SIAtomicScope Scope, 1392 SIAtomicAddrSpace AddrSpace) const { 1393 assert(MI->mayLoad() && !MI->mayStore()); 1394 bool Changed = false; 1395 1396 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1397 switch (Scope) { 1398 case SIAtomicScope::SYSTEM: 1399 case SIAtomicScope::AGENT: 1400 // Set the L0 and L1 cache policies to MISS_EVICT. 1401 // Note: there is no L2 cache coherent bypass control at the ISA level. 1402 Changed |= enableGLCBit(MI); 1403 Changed |= enableDLCBit(MI); 1404 break; 1405 case SIAtomicScope::WORKGROUP: 1406 // In WGP mode the waves of a work-group can be executing on either CU of 1407 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in 1408 // CU mode all waves of a work-group are on the same CU, and so the L0 1409 // does not need to be bypassed. 1410 if (!ST.isCuModeEnabled()) 1411 Changed |= enableGLCBit(MI); 1412 break; 1413 case SIAtomicScope::WAVEFRONT: 1414 case SIAtomicScope::SINGLETHREAD: 1415 // No cache to bypass. 1416 break; 1417 default: 1418 llvm_unreachable("Unsupported synchronization scope"); 1419 } 1420 } 1421 1422 /// The scratch address space does not need the global memory caches 1423 /// to be bypassed as all memory operations by the same thread are 1424 /// sequentially consistent, and no other thread can access scratch 1425 /// memory. 1426 1427 /// Other address spaces do not have a cache. 1428 1429 return Changed; 1430 } 1431 1432 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( 1433 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 1434 bool IsVolatile, bool IsNonTemporal) const { 1435 1436 // Only handle load and store, not atomic read-modify-write instructions. The 1437 // latter use glc to indicate if the atomic returns a result and so must not 1438 // be used for cache control. 1439 assert(MI->mayLoad() ^ MI->mayStore()); 1440 1441 // Only update load and store, not LLVM IR atomic read-modify-write 1442 // instructions. The latter are always marked as volatile, so they cannot sensibly 1443 // be handled here without pessimizing all atomics. Also they do not support 1444 // the nontemporal attribute. 1445 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 1446 1447 bool Changed = false; 1448 1449 if (IsVolatile) { 1450 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions 1451 // and MISS_LRU for store instructions. 1452 // Note: there is no L2 cache coherent bypass control at the ISA level. 1453 if (Op == SIMemOp::LOAD) { 1454 Changed |= enableGLCBit(MI); 1455 Changed |= enableDLCBit(MI); 1456 } 1457 1458 // Ensure operation has completed at system scope to cause all volatile 1459 // operations to be visible outside the program in a global order.
Do not 1460 // request cross address space as only the global address space can be 1461 // observable outside the program, so no need to cause a waitcnt for LDS 1462 // address space operations. 1463 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 1464 Position::AFTER); 1465 return Changed; 1466 } 1467 1468 if (IsNonTemporal) { 1469 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT 1470 // and L2 cache policy to STREAM. 1471 // For stores setting both GLC and SLC configures L0 and L1 cache policy 1472 // to MISS_EVICT and the L2 cache policy to STREAM. 1473 if (Op == SIMemOp::STORE) 1474 Changed |= enableGLCBit(MI); 1475 Changed |= enableSLCBit(MI); 1476 1477 return Changed; 1478 } 1479 1480 return Changed; 1481 } 1482 1483 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, 1484 SIAtomicScope Scope, 1485 SIAtomicAddrSpace AddrSpace, 1486 SIMemOp Op, 1487 bool IsCrossAddrSpaceOrdering, 1488 Position Pos) const { 1489 bool Changed = false; 1490 1491 MachineBasicBlock &MBB = *MI->getParent(); 1492 DebugLoc DL = MI->getDebugLoc(); 1493 1494 if (Pos == Position::AFTER) 1495 ++MI; 1496 1497 bool VMCnt = false; 1498 bool VSCnt = false; 1499 bool LGKMCnt = false; 1500 1501 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 1502 SIAtomicAddrSpace::NONE) { 1503 switch (Scope) { 1504 case SIAtomicScope::SYSTEM: 1505 case SIAtomicScope::AGENT: 1506 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 1507 VMCnt |= true; 1508 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 1509 VSCnt |= true; 1510 break; 1511 case SIAtomicScope::WORKGROUP: 1512 // In WGP mode the waves of a work-group can be executing on either CU of 1513 // the WGP. Therefore need to wait for operations to complete to ensure 1514 // they are visible to waves in the other CU as the L0 is per CU. 1515 // Otherwise in CU mode all waves of a work-group are on the same CU 1516 // which shares the same L0. 1517 if (!ST.isCuModeEnabled()) { 1518 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 1519 VMCnt |= true; 1520 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 1521 VSCnt |= true; 1522 } 1523 break; 1524 case SIAtomicScope::WAVEFRONT: 1525 case SIAtomicScope::SINGLETHREAD: 1526 // The L0 cache keeps all memory operations in order for 1527 // work-items in the same wavefront. 1528 break; 1529 default: 1530 llvm_unreachable("Unsupported synchronization scope"); 1531 } 1532 } 1533 1534 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 1535 switch (Scope) { 1536 case SIAtomicScope::SYSTEM: 1537 case SIAtomicScope::AGENT: 1538 case SIAtomicScope::WORKGROUP: 1539 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 1540 // not needed as LDS operations for all waves are executed in a total 1541 // global ordering as observed by all waves. Required if also 1542 // synchronizing with global/GDS memory as LDS operations could be 1543 // reordered with respect to later global/GDS memory operations of the 1544 // same wave. 1545 LGKMCnt |= IsCrossAddrSpaceOrdering; 1546 break; 1547 case SIAtomicScope::WAVEFRONT: 1548 case SIAtomicScope::SINGLETHREAD: 1549 // The LDS keeps all memory operations in order for 1550 // the same wavefront.
1551 break; 1552 default: 1553 llvm_unreachable("Unsupported synchronization scope"); 1554 } 1555 } 1556 1557 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 1558 switch (Scope) { 1559 case SIAtomicScope::SYSTEM: 1560 case SIAtomicScope::AGENT: 1561 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)" 1562 // is not needed as GDS operations for all waves are executed in a total 1563 // global ordering as observed by all waves. Required if also 1564 // synchronizing with global/LDS memory as GDS operations could be 1565 // reordered with respect to later global/LDS memory operations of the 1566 // same wave. 1567 LGKMCnt |= IsCrossAddrSpaceOrdering; 1568 break; 1569 case SIAtomicScope::WORKGROUP: 1570 case SIAtomicScope::WAVEFRONT: 1571 case SIAtomicScope::SINGLETHREAD: 1572 // The GDS keeps all memory operations in order for 1573 // the same work-group. 1574 break; 1575 default: 1576 llvm_unreachable("Unsupported synchronization scope"); 1577 } 1578 } 1579 1580 if (VMCnt || LGKMCnt) { 1581 unsigned WaitCntImmediate = 1582 AMDGPU::encodeWaitcnt(IV, 1583 VMCnt ? 0 : getVmcntBitMask(IV), 1584 getExpcntBitMask(IV), 1585 LGKMCnt ? 0 : getLgkmcntBitMask(IV)); 1586 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); 1587 Changed = true; 1588 } 1589 1590 if (VSCnt) { 1591 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) 1592 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1593 .addImm(0); 1594 Changed = true; 1595 } 1596 1597 if (Pos == Position::AFTER) 1598 --MI; 1599 1600 return Changed; 1601 } 1602 1603 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1604 SIAtomicScope Scope, 1605 SIAtomicAddrSpace AddrSpace, 1606 Position Pos) const { 1607 if (!InsertCacheInv) 1608 return false; 1609 1610 bool Changed = false; 1611 1612 MachineBasicBlock &MBB = *MI->getParent(); 1613 DebugLoc DL = MI->getDebugLoc(); 1614 1615 if (Pos == Position::AFTER) 1616 ++MI; 1617 1618 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1619 switch (Scope) { 1620 case SIAtomicScope::SYSTEM: 1621 case SIAtomicScope::AGENT: 1622 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 1623 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); 1624 Changed = true; 1625 break; 1626 case SIAtomicScope::WORKGROUP: 1627 // In WGP mode the waves of a work-group can be executing on either CU of 1628 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise 1629 // in CU mode all waves of a work-group are on the same CU, and so the 1630 // L0 does not need to be invalidated. 1631 if (!ST.isCuModeEnabled()) { 1632 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 1633 Changed = true; 1634 } 1635 break; 1636 case SIAtomicScope::WAVEFRONT: 1637 case SIAtomicScope::SINGLETHREAD: 1638 // No cache to invalidate. 1639 break; 1640 default: 1641 llvm_unreachable("Unsupported synchronization scope"); 1642 } 1643 } 1644 1645 /// The scratch address space does not need the global memory cache 1646 /// to be flushed as all memory operations by the same thread are 1647 /// sequentially consistent, and no other thread can access scratch 1648 /// memory. 1649 1650 /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
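  // For a nontemporal store on gfx10, for example, the call below is expected
  // to set both GLC and SLC (MISS_EVICT in L0/L1, STREAM in L2); see
  // SIGfx10CacheControl::enableVolatileAndOrNonTemporal above.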
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnts.cpp could then stop unconditionally
      /// adding an S_WAITCNT before an S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBINV*" instruction. This could be
    // done by reorganizing this code, or as part of optimizing the
    // SIInsertWaitcnts pass to track cache invalidate and write back
    // instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(
    const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ?
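                                // A returning atomic completes like a load and
                                // a non-returning atomic like a store, so the
                                // wait is requested for the matching operation
                                // kind (vmcnt vs. vscnt on gfx10).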
                                    SIMemOp::LOAD :
                                    SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}