//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns Returns true if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {
    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on different address
  /// spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions are inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions are inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions are inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI =
      &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
            "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // be handled, as we do not want to pessimize all atomics. Also they do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() ?
      AMDGPU::BUFFER_WBINVL1 : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // be handled, as we do not want to pessimize all atomics. Also they do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {

    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order.
    // Do not request cross address space as only the global address space can
    // be observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves.
      // Required if also synchronizing with global/LDS memory as GDS
      // operations could be reordered with respect to later global/LDS memory
      // operations of the same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU, and
      // so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations.
      /// If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBL2" instruction. This could be done
    // by reorganizing this code or as part of optimizing SIInsertWaitcnt pass
    // to track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
    MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}
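
// Illustrative sketch only, not output captured from a specific test: on a
// gfx7-gfx9 HSA target, the hooks above would typically expand a seq_cst
// atomic load of a flat pointer at "agent" scope roughly as follows. The
// exact instructions depend on the subtarget and on which SICacheControl
// subclass SICacheControl::create() selects.
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)    ; insertWait(..., Position::BEFORE)
//   flat_load_dword v2, v[0:1] glc   ; enableLoadCacheBypass() sets glc
//   s_waitcnt vmcnt(0) lgkmcnt(0)    ; insertWait(..., Position::AFTER)
//   buffer_wbinvl1_vol               ; insertAcquire() invalidates the L1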