//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           !isStrongerThan(FailureOrdering, Ordering));

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \return Return a bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
            "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering = isStrongerThan(Ordering, OpOrdering)
                     ? Ordering
                     : MMO->getSuccessOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use the glc bit to indicate if the atomic returns a result, so it
  // must not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot be
  // handled sensibly here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use the glc bit to indicate if the atomic returns a result, so it
  // must not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot be
  // handled sensibly here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory operations
    // to complete to ensure they are visible to waves in the other CUs.
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
    // on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by the
      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
      // vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  Changed |=
      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
                                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use the glc bit to indicate if the atomic returns a result, so it
  // must not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot be
  // handled sensibly here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {

    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
      // in CU mode all waves of a work-group are on the same CU, and so the
      // L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}
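
// To make the control flow above concrete, here is a rough sketch (not emitted
// verbatim by this pass; the exact counters and cache bits depend on the
// target, the ordering address spaces, and enableLoadCacheBypass, defined
// earlier in this file) of how a sequentially consistent atomic load at agent
// scope is legalized on gfx10 by expandLoad plus SIGfx10CacheControl:
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)   ; insertWait(..., LOAD | STORE, BEFORE)
//   s_waitcnt_vscnt null, 0x0
//   <the atomic load, with its cache-bypass bits set>
//   s_waitcnt vmcnt(0)              ; insertWait(..., LOAD, AFTER)
//   buffer_gl0_inv                  ; insertAcquire(..., AFTER)
//   buffer_gl1_inv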

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening, they could be
    // combined to use the single "BUFFER_WBINV*" instruction. This could be
    // done by reorganizing this code or as part of optimizing the
    // SIInsertWaitcnt pass to track cache invalidate and write back
    // instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}
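
// Usage sketch (hedged: the pipeline detail below is an assumption based on
// the usual GCN pass setup, not something stated in this file). The pass is
// normally added by the AMDGPU target's pass configuration rather than
// constructed by hand, roughly:
//
//   addPass(createSIMemoryLegalizerPass());
//
// The INITIALIZE_PASS line above registers it under the "si-memory-legalizer"
// name (DEBUG_TYPE), so it can also be run in isolation on MIR input, e.g.
// with llc -run-pass=si-memory-legalizer.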