xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
10b57cec5SDimitry Andric //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric /// \file
100b57cec5SDimitry Andric /// Memory legalizer - implements memory model. More information can be
110b57cec5SDimitry Andric /// found here:
120b57cec5SDimitry Andric ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
130b57cec5SDimitry Andric //
140b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
150b57cec5SDimitry Andric 
160b57cec5SDimitry Andric #include "AMDGPU.h"
170b57cec5SDimitry Andric #include "AMDGPUMachineModuleInfo.h"
18e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
190b57cec5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
200b57cec5SDimitry Andric #include "llvm/ADT/BitmaskEnum.h"
21*0fca6ea1SDimitry Andric #include "llvm/ADT/StringExtras.h"
220b57cec5SDimitry Andric #include "llvm/CodeGen/MachineBasicBlock.h"
2381ad6265SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
240b57cec5SDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
25*0fca6ea1SDimitry Andric #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
260b57cec5SDimitry Andric #include "llvm/Support/AtomicOrdering.h"
2706c3fb27SDimitry Andric #include "llvm/TargetParser/TargetParser.h"
280b57cec5SDimitry Andric 
290b57cec5SDimitry Andric using namespace llvm;
300b57cec5SDimitry Andric using namespace llvm::AMDGPU;
310b57cec5SDimitry Andric 
320b57cec5SDimitry Andric #define DEBUG_TYPE "si-memory-legalizer"
330b57cec5SDimitry Andric #define PASS_NAME "SI Memory Legalizer"
340b57cec5SDimitry Andric 
// Hidden command-line escape hatch: when set, the pass does not emit
// cache-invalidating instructions. Intended for debugging/experiments;
// presumably unsound w.r.t. the memory model -- do not rely on it for
// correctness.
static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));
38e8d8bef9SDimitry Andric 
390b57cec5SDimitry Andric namespace {
400b57cec5SDimitry Andric 
410b57cec5SDimitry Andric LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
420b57cec5SDimitry Andric 
/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};
500b57cec5SDimitry Andric 
/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};
570b57cec5SDimitry Andric 
/// The atomic synchronization scopes supported by the AMDGPU target.
/// Enumerators are ordered from narrowest to widest scope; code below
/// relies on this ordering (e.g. std::min is used to clamp a scope).
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};
670b57cec5SDimitry Andric 
/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  // Catch-all for accesses outside the spaces named above.
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
890b57cec5SDimitry Andric 
/// Immutable summary of the memory-model-relevant properties of one machine
/// memory instruction: ordering constraints, synchronization scope, the
/// address spaces involved, and the volatile/nontemporal/last-use hints.
/// Instances are constructed only by SIMemOpAccess (the sole friend).
class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;
  bool IsLastUse = false;

  // Private constructor: normalizes the raw properties. For non-atomic
  // operations it asserts that no atomic-only properties were supplied;
  // for atomic operations it may clear IsCrossAddressSpaceOrdering and
  // clamp Scope based on the instruction's address spaces (see below).
  SIMemOpInfo(
      AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
      SIAtomicScope Scope = SIAtomicScope::SYSTEM,
      SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
      SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
      bool IsCrossAddressSpaceOrdering = true,
      AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
      bool IsVolatile = false, bool IsNonTemporal = false,
      bool IsLastUse = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
        OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
        IsLastUse(IsLastUse) {

    // Non-atomic operations must not carry any atomic-only properties.
    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    // Atomic operations must have a scope and must order/access at least
    // one address space that supports atomics.
    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns Return true iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is last use, false otherwise.
  bool isLastUse() const { return IsLastUse; }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};
2170b57cec5SDimitry Andric 
/// Builds SIMemOpInfo summaries from machine instructions by inspecting
/// their machine memory operands and synchronization scope IDs.
class SIMemOpAccess final {
private:
  // Non-owning; provides the mapping from sync-scope IDs to AMDGPU scopes.
  const AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \return Return a bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least machine memory
  /// operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
};
2650b57cec5SDimitry Andric 
2660b57cec5SDimitry Andric class SICacheControl {
2670b57cec5SDimitry Andric protected:
2680b57cec5SDimitry Andric 
269e8d8bef9SDimitry Andric   /// AMDGPU subtarget info.
270e8d8bef9SDimitry Andric   const GCNSubtarget &ST;
271e8d8bef9SDimitry Andric 
2720b57cec5SDimitry Andric   /// Instruction info.
2730b57cec5SDimitry Andric   const SIInstrInfo *TII = nullptr;
2740b57cec5SDimitry Andric 
2750b57cec5SDimitry Andric   IsaVersion IV;
2760b57cec5SDimitry Andric 
277e8d8bef9SDimitry Andric   /// Whether to insert cache invalidating instructions.
2785ffd83dbSDimitry Andric   bool InsertCacheInv;
2795ffd83dbSDimitry Andric 
2800b57cec5SDimitry Andric   SICacheControl(const GCNSubtarget &ST);
2810b57cec5SDimitry Andric 
282fe6060f1SDimitry Andric   /// Sets named bit \p BitName to "true" if present in instruction \p MI.
283fe6060f1SDimitry Andric   /// \returns Returns true if \p MI is modified, false otherwise.
284fe6060f1SDimitry Andric   bool enableNamedBit(const MachineBasicBlock::iterator MI,
285fe6060f1SDimitry Andric                       AMDGPU::CPol::CPol Bit) const;
286fe6060f1SDimitry Andric 
2870b57cec5SDimitry Andric public:
2880b57cec5SDimitry Andric 
2890b57cec5SDimitry Andric   /// Create a cache control for the subtarget \p ST.
2900b57cec5SDimitry Andric   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
2910b57cec5SDimitry Andric 
2920b57cec5SDimitry Andric   /// Update \p MI memory load instruction to bypass any caches up to
2930b57cec5SDimitry Andric   /// the \p Scope memory scope for address spaces \p
2940b57cec5SDimitry Andric   /// AddrSpace. Return true iff the instruction was modified.
2950b57cec5SDimitry Andric   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
2960b57cec5SDimitry Andric                                      SIAtomicScope Scope,
2970b57cec5SDimitry Andric                                      SIAtomicAddrSpace AddrSpace) const = 0;
2980b57cec5SDimitry Andric 
299fe6060f1SDimitry Andric   /// Update \p MI memory store instruction to bypass any caches up to
300fe6060f1SDimitry Andric   /// the \p Scope memory scope for address spaces \p
301fe6060f1SDimitry Andric   /// AddrSpace. Return true iff the instruction was modified.
302fe6060f1SDimitry Andric   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
303fe6060f1SDimitry Andric                                       SIAtomicScope Scope,
304fe6060f1SDimitry Andric                                       SIAtomicAddrSpace AddrSpace) const = 0;
305fe6060f1SDimitry Andric 
306fe6060f1SDimitry Andric   /// Update \p MI memory read-modify-write instruction to bypass any caches up
307fe6060f1SDimitry Andric   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
308fe6060f1SDimitry Andric   /// iff the instruction was modified.
309fe6060f1SDimitry Andric   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
310fe6060f1SDimitry Andric                                     SIAtomicScope Scope,
311fe6060f1SDimitry Andric                                     SIAtomicAddrSpace AddrSpace) const = 0;
312fe6060f1SDimitry Andric 
313e8d8bef9SDimitry Andric   /// Update \p MI memory instruction of kind \p Op associated with address
314*0fca6ea1SDimitry Andric   /// spaces \p AddrSpace to indicate it is volatile and/or
315*0fca6ea1SDimitry Andric   /// nontemporal/last-use. Return true iff the instruction was modified.
316e8d8bef9SDimitry Andric   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
3170b57cec5SDimitry Andric                                               SIAtomicAddrSpace AddrSpace,
318e8d8bef9SDimitry Andric                                               SIMemOp Op, bool IsVolatile,
319*0fca6ea1SDimitry Andric                                               bool IsNonTemporal,
320*0fca6ea1SDimitry Andric                                               bool IsLastUse = false) const = 0;
321*0fca6ea1SDimitry Andric 
expandSystemScopeStore(MachineBasicBlock::iterator & MI) const322*0fca6ea1SDimitry Andric   virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
323*0fca6ea1SDimitry Andric     return false;
324*0fca6ea1SDimitry Andric   };
3250b57cec5SDimitry Andric 
3260b57cec5SDimitry Andric   /// Inserts any necessary instructions at position \p Pos relative
327e8d8bef9SDimitry Andric   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
328e8d8bef9SDimitry Andric   /// \p Op associated with address spaces \p AddrSpace have completed. Used
329e8d8bef9SDimitry Andric   /// between memory instructions to enforce the order they become visible as
330e8d8bef9SDimitry Andric   /// observed by other memory instructions executing in memory scope \p Scope.
331e8d8bef9SDimitry Andric   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
332e8d8bef9SDimitry Andric   /// address spaces. Returns true iff any instructions inserted.
3330b57cec5SDimitry Andric   virtual bool insertWait(MachineBasicBlock::iterator &MI,
3340b57cec5SDimitry Andric                           SIAtomicScope Scope,
3350b57cec5SDimitry Andric                           SIAtomicAddrSpace AddrSpace,
3360b57cec5SDimitry Andric                           SIMemOp Op,
3370b57cec5SDimitry Andric                           bool IsCrossAddrSpaceOrdering,
3380b57cec5SDimitry Andric                           Position Pos) const = 0;
3390b57cec5SDimitry Andric 
340e8d8bef9SDimitry Andric   /// Inserts any necessary instructions at position \p Pos relative to
341e8d8bef9SDimitry Andric   /// instruction \p MI to ensure any subsequent memory instructions of this
342e8d8bef9SDimitry Andric   /// thread with address spaces \p AddrSpace will observe the previous memory
343e8d8bef9SDimitry Andric   /// operations by any thread for memory scopes up to memory scope \p Scope .
344e8d8bef9SDimitry Andric   /// Returns true iff any instructions inserted.
345e8d8bef9SDimitry Andric   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
346e8d8bef9SDimitry Andric                              SIAtomicScope Scope,
347e8d8bef9SDimitry Andric                              SIAtomicAddrSpace AddrSpace,
348e8d8bef9SDimitry Andric                              Position Pos) const = 0;
349e8d8bef9SDimitry Andric 
350e8d8bef9SDimitry Andric   /// Inserts any necessary instructions at position \p Pos relative to
351e8d8bef9SDimitry Andric   /// instruction \p MI to ensure previous memory instructions by this thread
352e8d8bef9SDimitry Andric   /// with address spaces \p AddrSpace have completed and can be observed by
353e8d8bef9SDimitry Andric   /// subsequent memory instructions by any thread executing in memory scope \p
354e8d8bef9SDimitry Andric   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
355e8d8bef9SDimitry Andric   /// between address spaces. Returns true iff any instructions inserted.
356e8d8bef9SDimitry Andric   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
357e8d8bef9SDimitry Andric                              SIAtomicScope Scope,
358e8d8bef9SDimitry Andric                              SIAtomicAddrSpace AddrSpace,
359e8d8bef9SDimitry Andric                              bool IsCrossAddrSpaceOrdering,
360e8d8bef9SDimitry Andric                              Position Pos) const = 0;
361e8d8bef9SDimitry Andric 
3620b57cec5SDimitry Andric   /// Virtual destructor to allow derivations to be deleted.
3630b57cec5SDimitry Andric   virtual ~SICacheControl() = default;
3640b57cec5SDimitry Andric 
tryForceStoreSC0SC1(const SIMemOpInfo & MOI,MachineBasicBlock::iterator & MI) const36506c3fb27SDimitry Andric   virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
36606c3fb27SDimitry Andric                                    MachineBasicBlock::iterator &MI) const {
36706c3fb27SDimitry Andric     return false;
36806c3fb27SDimitry Andric   }
3690b57cec5SDimitry Andric };
3700b57cec5SDimitry Andric 
/// Cache control for the oldest supported generation; baseline for the
/// later SIGfx7/SIGfx90A/... subclasses. Uses the GLC/SLC cache-policy
/// bits of the instruction encoding.
class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};
4250b57cec5SDimitry Andric 
/// GFX7 cache control: inherits everything from GFX6 except the acquire
/// sequence, which it overrides.
class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};
4370b57cec5SDimitry Andric 
/// GFX90A cache control: overrides all cache-bypass enabling and all
/// wait/acquire/release insertion relative to GFX7.
class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};
478fe6060f1SDimitry Andric 
47981ad6265SDimitry Andric class SIGfx940CacheControl : public SIGfx90ACacheControl {
48081ad6265SDimitry Andric protected:
48181ad6265SDimitry Andric 
48281ad6265SDimitry Andric   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
48381ad6265SDimitry Andric   /// is modified, false otherwise.
enableSC0Bit(const MachineBasicBlock::iterator & MI) const48481ad6265SDimitry Andric   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
48581ad6265SDimitry Andric     return enableNamedBit(MI, AMDGPU::CPol::SC0);
48681ad6265SDimitry Andric   }
48781ad6265SDimitry Andric 
48881ad6265SDimitry Andric   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
48981ad6265SDimitry Andric   /// is modified, false otherwise.
enableSC1Bit(const MachineBasicBlock::iterator & MI) const49081ad6265SDimitry Andric   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
49181ad6265SDimitry Andric     return enableNamedBit(MI, AMDGPU::CPol::SC1);
49281ad6265SDimitry Andric   }
49381ad6265SDimitry Andric 
49481ad6265SDimitry Andric   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
49581ad6265SDimitry Andric   /// is modified, false otherwise.
enableNTBit(const MachineBasicBlock::iterator & MI) const49681ad6265SDimitry Andric   bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
49781ad6265SDimitry Andric     return enableNamedBit(MI, AMDGPU::CPol::NT);
49881ad6265SDimitry Andric   }
49981ad6265SDimitry Andric 
50081ad6265SDimitry Andric public:
50181ad6265SDimitry Andric 
SIGfx940CacheControl(const GCNSubtarget & ST)50281ad6265SDimitry Andric   SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
50381ad6265SDimitry Andric 
50481ad6265SDimitry Andric   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
50581ad6265SDimitry Andric                              SIAtomicScope Scope,
50681ad6265SDimitry Andric                              SIAtomicAddrSpace AddrSpace) const override;
50781ad6265SDimitry Andric 
50881ad6265SDimitry Andric   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
50981ad6265SDimitry Andric                               SIAtomicScope Scope,
51081ad6265SDimitry Andric                               SIAtomicAddrSpace AddrSpace) const override;
51181ad6265SDimitry Andric 
51281ad6265SDimitry Andric   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
51381ad6265SDimitry Andric                             SIAtomicScope Scope,
51481ad6265SDimitry Andric                             SIAtomicAddrSpace AddrSpace) const override;
51581ad6265SDimitry Andric 
51681ad6265SDimitry Andric   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
51781ad6265SDimitry Andric                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
518*0fca6ea1SDimitry Andric                                       bool IsVolatile, bool IsNonTemporal,
519*0fca6ea1SDimitry Andric                                       bool IsLastUse) const override;
52081ad6265SDimitry Andric 
52181ad6265SDimitry Andric   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
52281ad6265SDimitry Andric                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
52381ad6265SDimitry Andric 
52481ad6265SDimitry Andric   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
52581ad6265SDimitry Andric                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
52681ad6265SDimitry Andric                      Position Pos) const override;
52706c3fb27SDimitry Andric 
tryForceStoreSC0SC1(const SIMemOpInfo & MOI,MachineBasicBlock::iterator & MI) const52806c3fb27SDimitry Andric   bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
52906c3fb27SDimitry Andric                            MachineBasicBlock::iterator &MI) const override {
53006c3fb27SDimitry Andric     bool Changed = false;
53106c3fb27SDimitry Andric     if (ST.hasForceStoreSC0SC1() &&
53206c3fb27SDimitry Andric         (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
53306c3fb27SDimitry Andric                                     SIAtomicAddrSpace::GLOBAL |
53406c3fb27SDimitry Andric                                     SIAtomicAddrSpace::OTHER)) !=
53506c3fb27SDimitry Andric          SIAtomicAddrSpace::NONE) {
53606c3fb27SDimitry Andric       Changed |= enableSC0Bit(MI);
53706c3fb27SDimitry Andric       Changed |= enableSC1Bit(MI);
53806c3fb27SDimitry Andric     }
53906c3fb27SDimitry Andric     return Changed;
54006c3fb27SDimitry Andric   }
54181ad6265SDimitry Andric };
54281ad6265SDimitry Andric 
/// Cache control for GFX10. Builds on GFX7 and adds the DLC (device-level
/// coherent) cache-policy bit; overrides the hooks whose implementation
/// differs on this generation.
class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};
5770b57cec5SDimitry Andric 
/// Cache control for GFX11. Inherits the GFX10 behavior and overrides only
/// the load-bypass and volatile/non-temporal handling.
class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;
};
59181ad6265SDimitry Andric 
/// Cache control for GFX12. This generation expresses cache behavior through
/// TH (temporal hint) and Scope fields in the CPol operand rather than
/// individual bypass bits, so the three *CacheBypass hooks all reduce to
/// setAtomicScope().
class SIGfx12CacheControl : public SIGfx11CacheControl {
protected:
  // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
  // \returns Returns true if \p MI is modified, false otherwise.
  bool setTH(const MachineBasicBlock::iterator MI,
             AMDGPU::CPol::CPol Value) const;
  // Sets Scope policy to \p Value if CPol operand is present in instruction \p
  // MI. \returns Returns true if \p MI is modified, false otherwise.
  bool setScope(const MachineBasicBlock::iterator MI,
                AMDGPU::CPol::CPol Value) const;

  // Stores with system scope (SCOPE_SYS) need to wait for:
  // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
  // - non-returning-atomics       - wait for STORECNT==0
  //   TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
  //   since it does not distinguish atomics-with-return from regular stores.
  // There is no need to wait if memory is cached (mtype != UC).
  bool
  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;

  // Shared implementation behind the three *CacheBypass overrides below:
  // encodes \p Scope into the instruction's CPol Scope field.
  bool setAtomicScope(const MachineBasicBlock::iterator &MI,
                      SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;

public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  /// On GFX12 load cache bypass is expressed via the CPol scope field.
  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  /// On GFX12 store cache bypass is expressed via the CPol scope field.
  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  /// On GFX12 RMW cache bypass is expressed via the CPol scope field.
  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }
};
6541db9f3b2SDimitry Andric 
/// Machine-function pass that rewrites memory instructions so they obey the
/// AMDGPU memory model: it expands loads, stores, fences, and atomic RMW /
/// cmpxchg operations using the target-specific SICacheControl.
class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is a atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  // The pass only rewrites instructions in place; the CFG is untouched.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};
7070b57cec5SDimitry Andric 
/// Address-space names accepted in the "amdgpu-as" MMRA tag, mapped to the
/// SIAtomicAddrSpace bits they select. Names not listed here are diagnosed by
/// diagnoseUnknownMMRAASName().
static const StringMap<SIAtomicAddrSpace> ASNames = {{
    {"global", SIAtomicAddrSpace::GLOBAL},
    {"local", SIAtomicAddrSpace::LDS},
}};
712*0fca6ea1SDimitry Andric 
diagnoseUnknownMMRAASName(const MachineInstr & MI,StringRef AS)713*0fca6ea1SDimitry Andric void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
714*0fca6ea1SDimitry Andric   const MachineFunction *MF = MI.getMF();
715*0fca6ea1SDimitry Andric   const Function &Fn = MF->getFunction();
716*0fca6ea1SDimitry Andric   SmallString<128> Str;
717*0fca6ea1SDimitry Andric   raw_svector_ostream OS(Str);
718*0fca6ea1SDimitry Andric   OS << "unknown address space '" << AS << "'; expected one of ";
719*0fca6ea1SDimitry Andric   ListSeparator LS;
720*0fca6ea1SDimitry Andric   for (const auto &[Name, Val] : ASNames)
721*0fca6ea1SDimitry Andric     OS << LS << '\'' << Name << '\'';
722*0fca6ea1SDimitry Andric   DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning);
723*0fca6ea1SDimitry Andric   Fn.getContext().diagnose(BadTag);
724*0fca6ea1SDimitry Andric }
725*0fca6ea1SDimitry Andric 
726*0fca6ea1SDimitry Andric /// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
727*0fca6ea1SDimitry Andric /// If this tag isn't present, or if it has no meaningful values, returns \p
728*0fca6ea1SDimitry Andric /// Default. Otherwise returns all the address spaces concerned by the MMRA.
getFenceAddrSpaceMMRA(const MachineInstr & MI,SIAtomicAddrSpace Default)729*0fca6ea1SDimitry Andric static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
730*0fca6ea1SDimitry Andric                                                SIAtomicAddrSpace Default) {
731*0fca6ea1SDimitry Andric   static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
732*0fca6ea1SDimitry Andric 
733*0fca6ea1SDimitry Andric   auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
734*0fca6ea1SDimitry Andric   if (!MMRA)
735*0fca6ea1SDimitry Andric     return Default;
736*0fca6ea1SDimitry Andric 
737*0fca6ea1SDimitry Andric   SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
738*0fca6ea1SDimitry Andric   for (const auto &[Prefix, Suffix] : MMRA) {
739*0fca6ea1SDimitry Andric     if (Prefix != FenceASPrefix)
740*0fca6ea1SDimitry Andric       continue;
741*0fca6ea1SDimitry Andric 
742*0fca6ea1SDimitry Andric     if (auto It = ASNames.find(Suffix); It != ASNames.end())
743*0fca6ea1SDimitry Andric       Result |= It->second;
744*0fca6ea1SDimitry Andric     else
745*0fca6ea1SDimitry Andric       diagnoseUnknownMMRAASName(MI, Suffix);
746*0fca6ea1SDimitry Andric   }
747*0fca6ea1SDimitry Andric 
748*0fca6ea1SDimitry Andric   return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
749*0fca6ea1SDimitry Andric }
750*0fca6ea1SDimitry Andric 
751*0fca6ea1SDimitry Andric } // end anonymous namespace
7520b57cec5SDimitry Andric 
reportUnsupported(const MachineBasicBlock::iterator & MI,const char * Msg) const7530b57cec5SDimitry Andric void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
7540b57cec5SDimitry Andric                                       const char *Msg) const {
7550b57cec5SDimitry Andric   const Function &Func = MI->getParent()->getParent()->getFunction();
7560b57cec5SDimitry Andric   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
7570b57cec5SDimitry Andric   Func.getContext().diagnose(Diag);
7580b57cec5SDimitry Andric }
7590b57cec5SDimitry Andric 
760bdd1243dSDimitry Andric std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
toSIAtomicScope(SyncScope::ID SSID,SIAtomicAddrSpace InstrAddrSpace) const7610b57cec5SDimitry Andric SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
762fe6060f1SDimitry Andric                                SIAtomicAddrSpace InstrAddrSpace) const {
7630b57cec5SDimitry Andric   if (SSID == SyncScope::System)
764bdd1243dSDimitry Andric     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
7650b57cec5SDimitry Andric   if (SSID == MMI->getAgentSSID())
766bdd1243dSDimitry Andric     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
7670b57cec5SDimitry Andric   if (SSID == MMI->getWorkgroupSSID())
768bdd1243dSDimitry Andric     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
7690b57cec5SDimitry Andric                       true);
7700b57cec5SDimitry Andric   if (SSID == MMI->getWavefrontSSID())
771bdd1243dSDimitry Andric     return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
7720b57cec5SDimitry Andric                       true);
7730b57cec5SDimitry Andric   if (SSID == SyncScope::SingleThread)
774bdd1243dSDimitry Andric     return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
7750b57cec5SDimitry Andric                       true);
7760b57cec5SDimitry Andric   if (SSID == MMI->getSystemOneAddressSpaceSSID())
777bdd1243dSDimitry Andric     return std::tuple(SIAtomicScope::SYSTEM,
778bdd1243dSDimitry Andric                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
7790b57cec5SDimitry Andric   if (SSID == MMI->getAgentOneAddressSpaceSSID())
780bdd1243dSDimitry Andric     return std::tuple(SIAtomicScope::AGENT,
781bdd1243dSDimitry Andric                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
7820b57cec5SDimitry Andric   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
783bdd1243dSDimitry Andric     return std::tuple(SIAtomicScope::WORKGROUP,
784bdd1243dSDimitry Andric                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
7850b57cec5SDimitry Andric   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
786bdd1243dSDimitry Andric     return std::tuple(SIAtomicScope::WAVEFRONT,
787bdd1243dSDimitry Andric                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
7880b57cec5SDimitry Andric   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
789bdd1243dSDimitry Andric     return std::tuple(SIAtomicScope::SINGLETHREAD,
790bdd1243dSDimitry Andric                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
791bdd1243dSDimitry Andric   return std::nullopt;
7920b57cec5SDimitry Andric }
7930b57cec5SDimitry Andric 
toSIAtomicAddrSpace(unsigned AS) const7940b57cec5SDimitry Andric SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
7950b57cec5SDimitry Andric   if (AS == AMDGPUAS::FLAT_ADDRESS)
7960b57cec5SDimitry Andric     return SIAtomicAddrSpace::FLAT;
7970b57cec5SDimitry Andric   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
7980b57cec5SDimitry Andric     return SIAtomicAddrSpace::GLOBAL;
7990b57cec5SDimitry Andric   if (AS == AMDGPUAS::LOCAL_ADDRESS)
8000b57cec5SDimitry Andric     return SIAtomicAddrSpace::LDS;
8010b57cec5SDimitry Andric   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
8020b57cec5SDimitry Andric     return SIAtomicAddrSpace::SCRATCH;
8030b57cec5SDimitry Andric   if (AS == AMDGPUAS::REGION_ADDRESS)
8040b57cec5SDimitry Andric     return SIAtomicAddrSpace::GDS;
8050b57cec5SDimitry Andric 
8060b57cec5SDimitry Andric   return SIAtomicAddrSpace::OTHER;
8070b57cec5SDimitry Andric }
8080b57cec5SDimitry Andric 
/// Constructs the accessor around the module-level AMDGPU machine-module
/// info, which supplies the target's synchronization-scope IDs.
SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
    : MMI(&MMI_) {}
8110b57cec5SDimitry Andric 
/// Builds a SIMemOpInfo by merging the properties of every memory operand of
/// \p MI: non-temporal only if all MMOs are; volatile/last-use if any MMO is;
/// sync scope, success and failure orderings merged to the most conservative
/// combination. Returns std::nullopt (after emitting a diagnostic) when the
/// combination is unsupported.
std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;
  bool IsLastUse = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    // AND for non-temporal (all MMOs must agree), OR for volatile/last-use
    // (any MMO suffices).
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    IsLastUse |= MMO->getFlags() & MOLastUse;
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      // Scopes can only be merged when one includes the other; otherwise the
      // combination is unsupported.
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return std::nullopt;
      }

      // Keep the wider of the two scopes.
      SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return std::nullopt;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        *ScopeOrNone;
    // The ordered address spaces must be non-empty, lie within the atomic
    // set, and the instruction must touch at least one atomic space.
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return std::nullopt;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal, IsLastUse);
}
8730b57cec5SDimitry Andric 
874bdd1243dSDimitry Andric std::optional<SIMemOpInfo>
getLoadInfo(const MachineBasicBlock::iterator & MI) const875bdd1243dSDimitry Andric SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
8760b57cec5SDimitry Andric   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
8770b57cec5SDimitry Andric 
8780b57cec5SDimitry Andric   if (!(MI->mayLoad() && !MI->mayStore()))
879bdd1243dSDimitry Andric     return std::nullopt;
8800b57cec5SDimitry Andric 
8810b57cec5SDimitry Andric   // Be conservative if there are no memory operands.
8820b57cec5SDimitry Andric   if (MI->getNumMemOperands() == 0)
8830b57cec5SDimitry Andric     return SIMemOpInfo();
8840b57cec5SDimitry Andric 
8850b57cec5SDimitry Andric   return constructFromMIWithMMO(MI);
8860b57cec5SDimitry Andric }
8870b57cec5SDimitry Andric 
888bdd1243dSDimitry Andric std::optional<SIMemOpInfo>
getStoreInfo(const MachineBasicBlock::iterator & MI) const889bdd1243dSDimitry Andric SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
8900b57cec5SDimitry Andric   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
8910b57cec5SDimitry Andric 
8920b57cec5SDimitry Andric   if (!(!MI->mayLoad() && MI->mayStore()))
893bdd1243dSDimitry Andric     return std::nullopt;
8940b57cec5SDimitry Andric 
8950b57cec5SDimitry Andric   // Be conservative if there are no memory operands.
8960b57cec5SDimitry Andric   if (MI->getNumMemOperands() == 0)
8970b57cec5SDimitry Andric     return SIMemOpInfo();
8980b57cec5SDimitry Andric 
8990b57cec5SDimitry Andric   return constructFromMIWithMMO(MI);
9000b57cec5SDimitry Andric }
9010b57cec5SDimitry Andric 
902bdd1243dSDimitry Andric std::optional<SIMemOpInfo>
getAtomicFenceInfo(const MachineBasicBlock::iterator & MI) const903bdd1243dSDimitry Andric SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
9040b57cec5SDimitry Andric   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
9050b57cec5SDimitry Andric 
9060b57cec5SDimitry Andric   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
907bdd1243dSDimitry Andric     return std::nullopt;
9080b57cec5SDimitry Andric 
9090b57cec5SDimitry Andric   AtomicOrdering Ordering =
9100b57cec5SDimitry Andric     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
9110b57cec5SDimitry Andric 
9120b57cec5SDimitry Andric   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
9130b57cec5SDimitry Andric   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
9140b57cec5SDimitry Andric   if (!ScopeOrNone) {
9150b57cec5SDimitry Andric     reportUnsupported(MI, "Unsupported atomic synchronization scope");
916bdd1243dSDimitry Andric     return std::nullopt;
9170b57cec5SDimitry Andric   }
9180b57cec5SDimitry Andric 
9190b57cec5SDimitry Andric   SIAtomicScope Scope = SIAtomicScope::NONE;
9200b57cec5SDimitry Andric   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
9210b57cec5SDimitry Andric   bool IsCrossAddressSpaceOrdering = false;
9220b57cec5SDimitry Andric   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
92381ad6265SDimitry Andric       *ScopeOrNone;
9240b57cec5SDimitry Andric 
9250b57cec5SDimitry Andric   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
9260b57cec5SDimitry Andric       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
9270b57cec5SDimitry Andric     reportUnsupported(MI, "Unsupported atomic address space");
928bdd1243dSDimitry Andric     return std::nullopt;
9290b57cec5SDimitry Andric   }
9300b57cec5SDimitry Andric 
9310b57cec5SDimitry Andric   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
932fe6060f1SDimitry Andric                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
9330b57cec5SDimitry Andric }
9340b57cec5SDimitry Andric 
getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator & MI) const935bdd1243dSDimitry Andric std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
9360b57cec5SDimitry Andric     const MachineBasicBlock::iterator &MI) const {
9370b57cec5SDimitry Andric   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
9380b57cec5SDimitry Andric 
9390b57cec5SDimitry Andric   if (!(MI->mayLoad() && MI->mayStore()))
940bdd1243dSDimitry Andric     return std::nullopt;
9410b57cec5SDimitry Andric 
9420b57cec5SDimitry Andric   // Be conservative if there are no memory operands.
9430b57cec5SDimitry Andric   if (MI->getNumMemOperands() == 0)
9440b57cec5SDimitry Andric     return SIMemOpInfo();
9450b57cec5SDimitry Andric 
9460b57cec5SDimitry Andric   return constructFromMIWithMMO(MI);
9470b57cec5SDimitry Andric }
9480b57cec5SDimitry Andric 
SICacheControl(const GCNSubtarget & ST)949e8d8bef9SDimitry Andric SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
9500b57cec5SDimitry Andric   TII = ST.getInstrInfo();
9510b57cec5SDimitry Andric   IV = getIsaVersion(ST.getCPU());
952e8d8bef9SDimitry Andric   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
9530b57cec5SDimitry Andric }
9540b57cec5SDimitry Andric 
enableNamedBit(const MachineBasicBlock::iterator MI,AMDGPU::CPol::CPol Bit) const955fe6060f1SDimitry Andric bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
956fe6060f1SDimitry Andric                                     AMDGPU::CPol::CPol Bit) const {
957fe6060f1SDimitry Andric   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
958fe6060f1SDimitry Andric   if (!CPol)
959fe6060f1SDimitry Andric     return false;
960fe6060f1SDimitry Andric 
961fe6060f1SDimitry Andric   CPol->setImm(CPol->getImm() | Bit);
962fe6060f1SDimitry Andric   return true;
963fe6060f1SDimitry Andric }
964fe6060f1SDimitry Andric 
9650b57cec5SDimitry Andric /* static */
create(const GCNSubtarget & ST)9660b57cec5SDimitry Andric std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
9670b57cec5SDimitry Andric   GCNSubtarget::Generation Generation = ST.getGeneration();
96881ad6265SDimitry Andric   if (ST.hasGFX940Insts())
96981ad6265SDimitry Andric     return std::make_unique<SIGfx940CacheControl>(ST);
970fe6060f1SDimitry Andric   if (ST.hasGFX90AInsts())
971fe6060f1SDimitry Andric     return std::make_unique<SIGfx90ACacheControl>(ST);
9720b57cec5SDimitry Andric   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
9738bcb0991SDimitry Andric     return std::make_unique<SIGfx6CacheControl>(ST);
9740b57cec5SDimitry Andric   if (Generation < AMDGPUSubtarget::GFX10)
9758bcb0991SDimitry Andric     return std::make_unique<SIGfx7CacheControl>(ST);
97681ad6265SDimitry Andric   if (Generation < AMDGPUSubtarget::GFX11)
977e8d8bef9SDimitry Andric     return std::make_unique<SIGfx10CacheControl>(ST);
9781db9f3b2SDimitry Andric   if (Generation < AMDGPUSubtarget::GFX12)
97981ad6265SDimitry Andric     return std::make_unique<SIGfx11CacheControl>(ST);
9801db9f3b2SDimitry Andric   return std::make_unique<SIGfx12CacheControl>(ST);
9810b57cec5SDimitry Andric }
9820b57cec5SDimitry Andric 
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const9830b57cec5SDimitry Andric bool SIGfx6CacheControl::enableLoadCacheBypass(
9840b57cec5SDimitry Andric     const MachineBasicBlock::iterator &MI,
9850b57cec5SDimitry Andric     SIAtomicScope Scope,
9860b57cec5SDimitry Andric     SIAtomicAddrSpace AddrSpace) const {
9870b57cec5SDimitry Andric   assert(MI->mayLoad() && !MI->mayStore());
9880b57cec5SDimitry Andric   bool Changed = false;
9890b57cec5SDimitry Andric 
9900b57cec5SDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
9910b57cec5SDimitry Andric     switch (Scope) {
9920b57cec5SDimitry Andric     case SIAtomicScope::SYSTEM:
9930b57cec5SDimitry Andric     case SIAtomicScope::AGENT:
9944824e7fdSDimitry Andric       // Set L1 cache policy to MISS_EVICT.
9954824e7fdSDimitry Andric       // Note: there is no L2 cache bypass policy at the ISA level.
9960b57cec5SDimitry Andric       Changed |= enableGLCBit(MI);
9970b57cec5SDimitry Andric       break;
9980b57cec5SDimitry Andric     case SIAtomicScope::WORKGROUP:
9990b57cec5SDimitry Andric     case SIAtomicScope::WAVEFRONT:
10000b57cec5SDimitry Andric     case SIAtomicScope::SINGLETHREAD:
10010b57cec5SDimitry Andric       // No cache to bypass.
10020b57cec5SDimitry Andric       break;
10030b57cec5SDimitry Andric     default:
10040b57cec5SDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
10050b57cec5SDimitry Andric     }
10060b57cec5SDimitry Andric   }
10070b57cec5SDimitry Andric 
10080b57cec5SDimitry Andric   /// The scratch address space does not need the global memory caches
10090b57cec5SDimitry Andric   /// to be bypassed as all memory operations by the same thread are
10100b57cec5SDimitry Andric   /// sequentially consistent, and no other thread can access scratch
10110b57cec5SDimitry Andric   /// memory.
10120b57cec5SDimitry Andric 
1013e8d8bef9SDimitry Andric   /// Other address spaces do not have a cache.
10140b57cec5SDimitry Andric 
10150b57cec5SDimitry Andric   return Changed;
10160b57cec5SDimitry Andric }
10170b57cec5SDimitry Andric 
enableStoreCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1018fe6060f1SDimitry Andric bool SIGfx6CacheControl::enableStoreCacheBypass(
1019fe6060f1SDimitry Andric     const MachineBasicBlock::iterator &MI,
1020fe6060f1SDimitry Andric     SIAtomicScope Scope,
1021fe6060f1SDimitry Andric     SIAtomicAddrSpace AddrSpace) const {
1022fe6060f1SDimitry Andric   assert(!MI->mayLoad() && MI->mayStore());
1023fe6060f1SDimitry Andric   bool Changed = false;
1024fe6060f1SDimitry Andric 
1025fe6060f1SDimitry Andric   /// The L1 cache is write through so does not need to be bypassed. There is no
1026fe6060f1SDimitry Andric   /// bypass control for the L2 cache at the isa level.
1027fe6060f1SDimitry Andric 
1028fe6060f1SDimitry Andric   return Changed;
1029fe6060f1SDimitry Andric }
1030fe6060f1SDimitry Andric 
enableRMWCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1031fe6060f1SDimitry Andric bool SIGfx6CacheControl::enableRMWCacheBypass(
1032fe6060f1SDimitry Andric     const MachineBasicBlock::iterator &MI,
1033fe6060f1SDimitry Andric     SIAtomicScope Scope,
1034fe6060f1SDimitry Andric     SIAtomicAddrSpace AddrSpace) const {
1035fe6060f1SDimitry Andric   assert(MI->mayLoad() && MI->mayStore());
1036fe6060f1SDimitry Andric   bool Changed = false;
1037fe6060f1SDimitry Andric 
10384824e7fdSDimitry Andric   /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
10394824e7fdSDimitry Andric   /// bypassed, and the GLC bit is instead used to indicate if they are
10404824e7fdSDimitry Andric   /// return or no-return.
10414824e7fdSDimitry Andric   /// Note: there is no L2 cache coherent bypass control at the ISA level.
1042fe6060f1SDimitry Andric 
1043fe6060f1SDimitry Andric   return Changed;
1044fe6060f1SDimitry Andric }
1045fe6060f1SDimitry Andric 
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal,bool IsLastUse=false) const1046e8d8bef9SDimitry Andric bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1047e8d8bef9SDimitry Andric     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1048*0fca6ea1SDimitry Andric     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1049e8d8bef9SDimitry Andric   // Only handle load and store, not atomic read-modify-write insructions. The
1050e8d8bef9SDimitry Andric   // latter use glc to indicate if the atomic returns a result and so must not
1051e8d8bef9SDimitry Andric   // be used for cache control.
10520b57cec5SDimitry Andric   assert(MI->mayLoad() ^ MI->mayStore());
1053e8d8bef9SDimitry Andric 
1054e8d8bef9SDimitry Andric   // Only update load and store, not LLVM IR atomic read-modify-write
1055e8d8bef9SDimitry Andric   // instructions. The latter are always marked as volatile so cannot sensibly
1056e8d8bef9SDimitry Andric   // handle it as do not want to pessimize all atomics. Also they do not support
1057e8d8bef9SDimitry Andric   // the nontemporal attribute.
1058e8d8bef9SDimitry Andric   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1059e8d8bef9SDimitry Andric 
10600b57cec5SDimitry Andric   bool Changed = false;
10610b57cec5SDimitry Andric 
1062e8d8bef9SDimitry Andric   if (IsVolatile) {
10634824e7fdSDimitry Andric     // Set L1 cache policy to be MISS_EVICT for load instructions
10644824e7fdSDimitry Andric     // and MISS_LRU for store instructions.
10654824e7fdSDimitry Andric     // Note: there is no L2 cache bypass policy at the ISA level.
1066e8d8bef9SDimitry Andric     if (Op == SIMemOp::LOAD)
10670b57cec5SDimitry Andric       Changed |= enableGLCBit(MI);
1068e8d8bef9SDimitry Andric 
1069e8d8bef9SDimitry Andric     // Ensure operation has completed at system scope to cause all volatile
1070e8d8bef9SDimitry Andric     // operations to be visible outside the program in a global order. Do not
1071e8d8bef9SDimitry Andric     // request cross address space as only the global address space can be
1072e8d8bef9SDimitry Andric     // observable outside the program, so no need to cause a waitcnt for LDS
1073e8d8bef9SDimitry Andric     // address space operations.
1074e8d8bef9SDimitry Andric     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1075e8d8bef9SDimitry Andric                           Position::AFTER);
10760b57cec5SDimitry Andric 
10770b57cec5SDimitry Andric     return Changed;
10780b57cec5SDimitry Andric   }
10790b57cec5SDimitry Andric 
1080e8d8bef9SDimitry Andric   if (IsNonTemporal) {
10814824e7fdSDimitry Andric     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
10824824e7fdSDimitry Andric     // for both loads and stores, and the L2 cache policy to STREAM.
1083e8d8bef9SDimitry Andric     Changed |= enableGLCBit(MI);
1084e8d8bef9SDimitry Andric     Changed |= enableSLCBit(MI);
1085e8d8bef9SDimitry Andric     return Changed;
1086e8d8bef9SDimitry Andric   }
1087e8d8bef9SDimitry Andric 
1088e8d8bef9SDimitry Andric   return Changed;
1089e8d8bef9SDimitry Andric }
1090e8d8bef9SDimitry Andric 
/// Insert an S_WAITCNT before \p MI (or after it, for Position::AFTER) that
/// orders memory operations of kind \p Op to \p AddrSpace at scope \p Scope.
/// \p IsCrossAddrSpaceOrdering requests that LDS/GDS also be ordered against
/// the other address spaces. Returns true if an instruction was inserted.
bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Temporarily step past MI so BuildMI inserts after it; the iterator is
  // restored before returning.
  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  // Global/scratch accesses are tracked by vmcnt; waiting for vmcnt(0) is
  // required at agent scope and above.
  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    // A count of 0 waits for all outstanding operations of that kind; the
    // full bit mask is the maximum encodable value, i.e. no wait for that
    // counter. expcnt is never waited on here.
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    // NOTE(review): the _soft variant presumably allows later waitcnt passes
    // to relax/merge this wait — confirm against SIInsertWaitcnts.
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1188e8d8bef9SDimitry Andric 
insertAcquire(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,Position Pos) const1189e8d8bef9SDimitry Andric bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
11900b57cec5SDimitry Andric                                        SIAtomicScope Scope,
11910b57cec5SDimitry Andric                                        SIAtomicAddrSpace AddrSpace,
11920b57cec5SDimitry Andric                                        Position Pos) const {
11935ffd83dbSDimitry Andric   if (!InsertCacheInv)
11945ffd83dbSDimitry Andric     return false;
11955ffd83dbSDimitry Andric 
11960b57cec5SDimitry Andric   bool Changed = false;
11970b57cec5SDimitry Andric 
11980b57cec5SDimitry Andric   MachineBasicBlock &MBB = *MI->getParent();
11990b57cec5SDimitry Andric   DebugLoc DL = MI->getDebugLoc();
12000b57cec5SDimitry Andric 
12010b57cec5SDimitry Andric   if (Pos == Position::AFTER)
12020b57cec5SDimitry Andric     ++MI;
12030b57cec5SDimitry Andric 
12040b57cec5SDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
12050b57cec5SDimitry Andric     switch (Scope) {
12060b57cec5SDimitry Andric     case SIAtomicScope::SYSTEM:
12070b57cec5SDimitry Andric     case SIAtomicScope::AGENT:
12080b57cec5SDimitry Andric       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
12090b57cec5SDimitry Andric       Changed = true;
12100b57cec5SDimitry Andric       break;
12110b57cec5SDimitry Andric     case SIAtomicScope::WORKGROUP:
12120b57cec5SDimitry Andric     case SIAtomicScope::WAVEFRONT:
12130b57cec5SDimitry Andric     case SIAtomicScope::SINGLETHREAD:
12140b57cec5SDimitry Andric       // No cache to invalidate.
12150b57cec5SDimitry Andric       break;
12160b57cec5SDimitry Andric     default:
12170b57cec5SDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
12180b57cec5SDimitry Andric     }
12190b57cec5SDimitry Andric   }
12200b57cec5SDimitry Andric 
12210b57cec5SDimitry Andric   /// The scratch address space does not need the global memory cache
12220b57cec5SDimitry Andric   /// to be flushed as all memory operations by the same thread are
12230b57cec5SDimitry Andric   /// sequentially consistent, and no other thread can access scratch
12240b57cec5SDimitry Andric   /// memory.
12250b57cec5SDimitry Andric 
1226e8d8bef9SDimitry Andric   /// Other address spaces do not have a cache.
12270b57cec5SDimitry Andric 
12280b57cec5SDimitry Andric   if (Pos == Position::AFTER)
12290b57cec5SDimitry Andric     --MI;
12300b57cec5SDimitry Andric 
12310b57cec5SDimitry Andric   return Changed;
12320b57cec5SDimitry Andric }
12330b57cec5SDimitry Andric 
/// On gfx6 a release needs no cache write-back (the L1 is write-through), so
/// it reduces to waiting for all prior loads and stores to complete at the
/// requested scope.
bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}
12420b57cec5SDimitry Andric 
insertAcquire(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,Position Pos) const1243e8d8bef9SDimitry Andric bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
12440b57cec5SDimitry Andric                                        SIAtomicScope Scope,
12450b57cec5SDimitry Andric                                        SIAtomicAddrSpace AddrSpace,
12460b57cec5SDimitry Andric                                        Position Pos) const {
12475ffd83dbSDimitry Andric   if (!InsertCacheInv)
12485ffd83dbSDimitry Andric     return false;
12495ffd83dbSDimitry Andric 
12500b57cec5SDimitry Andric   bool Changed = false;
12510b57cec5SDimitry Andric 
12520b57cec5SDimitry Andric   MachineBasicBlock &MBB = *MI->getParent();
12530b57cec5SDimitry Andric   DebugLoc DL = MI->getDebugLoc();
12540b57cec5SDimitry Andric 
12550b57cec5SDimitry Andric   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
12560b57cec5SDimitry Andric 
1257e8d8bef9SDimitry Andric   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
12580b57cec5SDimitry Andric                                     ? AMDGPU::BUFFER_WBINVL1
12590b57cec5SDimitry Andric                                     : AMDGPU::BUFFER_WBINVL1_VOL;
12600b57cec5SDimitry Andric 
12610b57cec5SDimitry Andric   if (Pos == Position::AFTER)
12620b57cec5SDimitry Andric     ++MI;
12630b57cec5SDimitry Andric 
12640b57cec5SDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
12650b57cec5SDimitry Andric     switch (Scope) {
12660b57cec5SDimitry Andric     case SIAtomicScope::SYSTEM:
12670b57cec5SDimitry Andric     case SIAtomicScope::AGENT:
1268e8d8bef9SDimitry Andric       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
12690b57cec5SDimitry Andric       Changed = true;
12700b57cec5SDimitry Andric       break;
12710b57cec5SDimitry Andric     case SIAtomicScope::WORKGROUP:
12720b57cec5SDimitry Andric     case SIAtomicScope::WAVEFRONT:
12730b57cec5SDimitry Andric     case SIAtomicScope::SINGLETHREAD:
12740b57cec5SDimitry Andric       // No cache to invalidate.
12750b57cec5SDimitry Andric       break;
12760b57cec5SDimitry Andric     default:
12770b57cec5SDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
12780b57cec5SDimitry Andric     }
12790b57cec5SDimitry Andric   }
12800b57cec5SDimitry Andric 
12810b57cec5SDimitry Andric   /// The scratch address space does not need the global memory cache
12820b57cec5SDimitry Andric   /// to be flushed as all memory operations by the same thread are
12830b57cec5SDimitry Andric   /// sequentially consistent, and no other thread can access scratch
12840b57cec5SDimitry Andric   /// memory.
12850b57cec5SDimitry Andric 
1286e8d8bef9SDimitry Andric   /// Other address spaces do not have a cache.
12870b57cec5SDimitry Andric 
12880b57cec5SDimitry Andric   if (Pos == Position::AFTER)
12890b57cec5SDimitry Andric     --MI;
12900b57cec5SDimitry Andric 
12910b57cec5SDimitry Andric   return Changed;
12920b57cec5SDimitry Andric }
12930b57cec5SDimitry Andric 
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1294fe6060f1SDimitry Andric bool SIGfx90ACacheControl::enableLoadCacheBypass(
1295fe6060f1SDimitry Andric     const MachineBasicBlock::iterator &MI,
1296fe6060f1SDimitry Andric     SIAtomicScope Scope,
1297fe6060f1SDimitry Andric     SIAtomicAddrSpace AddrSpace) const {
1298fe6060f1SDimitry Andric   assert(MI->mayLoad() && !MI->mayStore());
1299fe6060f1SDimitry Andric   bool Changed = false;
1300fe6060f1SDimitry Andric 
1301fe6060f1SDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1302fe6060f1SDimitry Andric     switch (Scope) {
1303fe6060f1SDimitry Andric     case SIAtomicScope::SYSTEM:
1304fe6060f1SDimitry Andric     case SIAtomicScope::AGENT:
13054824e7fdSDimitry Andric       // Set the L1 cache policy to MISS_LRU.
13064824e7fdSDimitry Andric       // Note: there is no L2 cache bypass policy at the ISA level.
1307fe6060f1SDimitry Andric       Changed |= enableGLCBit(MI);
1308fe6060f1SDimitry Andric       break;
1309fe6060f1SDimitry Andric     case SIAtomicScope::WORKGROUP:
1310fe6060f1SDimitry Andric       // In threadgroup split mode the waves of a work-group can be executing on
1311fe6060f1SDimitry Andric       // different CUs. Therefore need to bypass the L1 which is per CU.
1312fe6060f1SDimitry Andric       // Otherwise in non-threadgroup split mode all waves of a work-group are
1313fe6060f1SDimitry Andric       // on the same CU, and so the L1 does not need to be bypassed.
1314349cc55cSDimitry Andric       if (ST.isTgSplitEnabled())
1315349cc55cSDimitry Andric         Changed |= enableGLCBit(MI);
1316fe6060f1SDimitry Andric       break;
1317fe6060f1SDimitry Andric     case SIAtomicScope::WAVEFRONT:
1318fe6060f1SDimitry Andric     case SIAtomicScope::SINGLETHREAD:
1319fe6060f1SDimitry Andric       // No cache to bypass.
1320fe6060f1SDimitry Andric       break;
1321fe6060f1SDimitry Andric     default:
1322fe6060f1SDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
1323fe6060f1SDimitry Andric     }
1324fe6060f1SDimitry Andric   }
1325fe6060f1SDimitry Andric 
1326fe6060f1SDimitry Andric   /// The scratch address space does not need the global memory caches
1327fe6060f1SDimitry Andric   /// to be bypassed as all memory operations by the same thread are
1328fe6060f1SDimitry Andric   /// sequentially consistent, and no other thread can access scratch
1329fe6060f1SDimitry Andric   /// memory.
1330fe6060f1SDimitry Andric 
1331fe6060f1SDimitry Andric   /// Other address spaces do not have a cache.
1332fe6060f1SDimitry Andric 
1333fe6060f1SDimitry Andric   return Changed;
1334fe6060f1SDimitry Andric }
1335fe6060f1SDimitry Andric 
enableStoreCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1336fe6060f1SDimitry Andric bool SIGfx90ACacheControl::enableStoreCacheBypass(
1337fe6060f1SDimitry Andric     const MachineBasicBlock::iterator &MI,
1338fe6060f1SDimitry Andric     SIAtomicScope Scope,
1339fe6060f1SDimitry Andric     SIAtomicAddrSpace AddrSpace) const {
1340fe6060f1SDimitry Andric   assert(!MI->mayLoad() && MI->mayStore());
1341fe6060f1SDimitry Andric   bool Changed = false;
1342fe6060f1SDimitry Andric 
1343fe6060f1SDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1344fe6060f1SDimitry Andric     switch (Scope) {
1345fe6060f1SDimitry Andric     case SIAtomicScope::SYSTEM:
1346fe6060f1SDimitry Andric     case SIAtomicScope::AGENT:
1347fe6060f1SDimitry Andric       /// Do not set glc for store atomic operations as they implicitly write
1348fe6060f1SDimitry Andric       /// through the L1 cache.
1349fe6060f1SDimitry Andric       break;
1350fe6060f1SDimitry Andric     case SIAtomicScope::WORKGROUP:
1351fe6060f1SDimitry Andric     case SIAtomicScope::WAVEFRONT:
1352fe6060f1SDimitry Andric     case SIAtomicScope::SINGLETHREAD:
1353fe6060f1SDimitry Andric       // No cache to bypass. Store atomics implicitly write through the L1
1354fe6060f1SDimitry Andric       // cache.
1355fe6060f1SDimitry Andric       break;
1356fe6060f1SDimitry Andric     default:
1357fe6060f1SDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
1358fe6060f1SDimitry Andric     }
1359fe6060f1SDimitry Andric   }
1360fe6060f1SDimitry Andric 
1361fe6060f1SDimitry Andric   /// The scratch address space does not need the global memory caches
1362fe6060f1SDimitry Andric   /// to be bypassed as all memory operations by the same thread are
1363fe6060f1SDimitry Andric   /// sequentially consistent, and no other thread can access scratch
1364fe6060f1SDimitry Andric   /// memory.
1365fe6060f1SDimitry Andric 
1366fe6060f1SDimitry Andric   /// Other address spaces do not have a cache.
1367fe6060f1SDimitry Andric 
1368fe6060f1SDimitry Andric   return Changed;
1369fe6060f1SDimitry Andric }
1370fe6060f1SDimitry Andric 
enableRMWCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1371fe6060f1SDimitry Andric bool SIGfx90ACacheControl::enableRMWCacheBypass(
1372fe6060f1SDimitry Andric     const MachineBasicBlock::iterator &MI,
1373fe6060f1SDimitry Andric     SIAtomicScope Scope,
1374fe6060f1SDimitry Andric     SIAtomicAddrSpace AddrSpace) const {
1375fe6060f1SDimitry Andric   assert(MI->mayLoad() && MI->mayStore());
1376fe6060f1SDimitry Andric   bool Changed = false;
1377fe6060f1SDimitry Andric 
1378fe6060f1SDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1379fe6060f1SDimitry Andric     switch (Scope) {
1380fe6060f1SDimitry Andric     case SIAtomicScope::SYSTEM:
1381fe6060f1SDimitry Andric     case SIAtomicScope::AGENT:
1382fe6060f1SDimitry Andric       /// Do not set glc for RMW atomic operations as they implicitly bypass
1383fe6060f1SDimitry Andric       /// the L1 cache, and the glc bit is instead used to indicate if they are
1384fe6060f1SDimitry Andric       /// return or no-return.
1385fe6060f1SDimitry Andric       break;
1386fe6060f1SDimitry Andric     case SIAtomicScope::WORKGROUP:
1387fe6060f1SDimitry Andric     case SIAtomicScope::WAVEFRONT:
1388fe6060f1SDimitry Andric     case SIAtomicScope::SINGLETHREAD:
1389fe6060f1SDimitry Andric       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1390fe6060f1SDimitry Andric       break;
1391fe6060f1SDimitry Andric     default:
1392fe6060f1SDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
1393fe6060f1SDimitry Andric     }
1394fe6060f1SDimitry Andric   }
1395fe6060f1SDimitry Andric 
1396fe6060f1SDimitry Andric   return Changed;
1397fe6060f1SDimitry Andric }
1398fe6060f1SDimitry Andric 
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal,bool IsLastUse=false) const1399fe6060f1SDimitry Andric bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1400fe6060f1SDimitry Andric     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1401*0fca6ea1SDimitry Andric     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1402fe6060f1SDimitry Andric   // Only handle load and store, not atomic read-modify-write insructions. The
1403fe6060f1SDimitry Andric   // latter use glc to indicate if the atomic returns a result and so must not
1404fe6060f1SDimitry Andric   // be used for cache control.
1405fe6060f1SDimitry Andric   assert(MI->mayLoad() ^ MI->mayStore());
1406fe6060f1SDimitry Andric 
1407fe6060f1SDimitry Andric   // Only update load and store, not LLVM IR atomic read-modify-write
1408fe6060f1SDimitry Andric   // instructions. The latter are always marked as volatile so cannot sensibly
1409fe6060f1SDimitry Andric   // handle it as do not want to pessimize all atomics. Also they do not support
1410fe6060f1SDimitry Andric   // the nontemporal attribute.
1411fe6060f1SDimitry Andric   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1412fe6060f1SDimitry Andric 
1413fe6060f1SDimitry Andric   bool Changed = false;
1414fe6060f1SDimitry Andric 
1415fe6060f1SDimitry Andric   if (IsVolatile) {
14164824e7fdSDimitry Andric     // Set L1 cache policy to be MISS_EVICT for load instructions
14174824e7fdSDimitry Andric     // and MISS_LRU for store instructions.
14184824e7fdSDimitry Andric     // Note: there is no L2 cache bypass policy at the ISA level.
1419349cc55cSDimitry Andric     if (Op == SIMemOp::LOAD)
1420fe6060f1SDimitry Andric       Changed |= enableGLCBit(MI);
1421fe6060f1SDimitry Andric 
1422fe6060f1SDimitry Andric     // Ensure operation has completed at system scope to cause all volatile
1423fe6060f1SDimitry Andric     // operations to be visible outside the program in a global order. Do not
1424fe6060f1SDimitry Andric     // request cross address space as only the global address space can be
1425fe6060f1SDimitry Andric     // observable outside the program, so no need to cause a waitcnt for LDS
1426fe6060f1SDimitry Andric     // address space operations.
1427fe6060f1SDimitry Andric     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1428fe6060f1SDimitry Andric                           Position::AFTER);
1429fe6060f1SDimitry Andric 
1430fe6060f1SDimitry Andric     return Changed;
1431fe6060f1SDimitry Andric   }
1432fe6060f1SDimitry Andric 
1433fe6060f1SDimitry Andric   if (IsNonTemporal) {
14344824e7fdSDimitry Andric     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
14354824e7fdSDimitry Andric     // for both loads and stores, and the L2 cache policy to STREAM.
1436fe6060f1SDimitry Andric     Changed |= enableGLCBit(MI);
1437fe6060f1SDimitry Andric     Changed |= enableSLCBit(MI);
1438fe6060f1SDimitry Andric     return Changed;
1439fe6060f1SDimitry Andric   }
1440fe6060f1SDimitry Andric 
1441fe6060f1SDimitry Andric   return Changed;
1442fe6060f1SDimitry Andric }
1443fe6060f1SDimitry Andric 
insertWait(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsCrossAddrSpaceOrdering,Position Pos) const1444fe6060f1SDimitry Andric bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1445fe6060f1SDimitry Andric                                       SIAtomicScope Scope,
1446fe6060f1SDimitry Andric                                       SIAtomicAddrSpace AddrSpace,
1447fe6060f1SDimitry Andric                                       SIMemOp Op,
1448fe6060f1SDimitry Andric                                       bool IsCrossAddrSpaceOrdering,
1449fe6060f1SDimitry Andric                                       Position Pos) const {
1450fe6060f1SDimitry Andric   if (ST.isTgSplitEnabled()) {
1451fe6060f1SDimitry Andric     // In threadgroup split mode the waves of a work-group can be executing on
1452fe6060f1SDimitry Andric     // different CUs. Therefore need to wait for global or GDS memory operations
1453fe6060f1SDimitry Andric     // to complete to ensure they are visible to waves in the other CUs.
1454fe6060f1SDimitry Andric     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1455fe6060f1SDimitry Andric     // the same CU, so no need to wait for global memory as all waves in the
1456fe6060f1SDimitry Andric     // work-group access the same the L1, nor wait for GDS as access are ordered
1457fe6060f1SDimitry Andric     // on a CU.
1458fe6060f1SDimitry Andric     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1459fe6060f1SDimitry Andric                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1460fe6060f1SDimitry Andric         (Scope == SIAtomicScope::WORKGROUP)) {
1461fe6060f1SDimitry Andric       // Same as GFX7 using agent scope.
1462fe6060f1SDimitry Andric       Scope = SIAtomicScope::AGENT;
1463fe6060f1SDimitry Andric     }
1464fe6060f1SDimitry Andric     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1465fe6060f1SDimitry Andric     // LDS memory operations.
1466fe6060f1SDimitry Andric     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1467fe6060f1SDimitry Andric   }
1468fe6060f1SDimitry Andric   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1469fe6060f1SDimitry Andric                                         IsCrossAddrSpaceOrdering, Pos);
1470fe6060f1SDimitry Andric }
1471fe6060f1SDimitry Andric 
/// Insert the cache invalidate needed to implement an acquire operation on
/// GFX90A, either before (Position::BEFORE) or after (Position::AFTER)
/// instruction \p MI. Returns true if any instruction was inserted.
bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  // Nothing to do when cache-invalidate insertion is disabled.
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Temporarily step past MI so BuildMI inserts after it; undone below before
  // delegating to the GFX7 implementation.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  // Delegate to GFX7 with the (possibly agent-promoted) scope for the cache
  // levels it handles.
  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}
1537fe6060f1SDimitry Andric 
/// Insert the cache writeback needed to implement a release operation on
/// GFX90A, either before (Position::BEFORE) or after (Position::AFTER)
/// instruction \p MI, then delegate to the GFX7 implementation for the
/// required waits. Returns true if any instruction was inserted.
bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();

  // Temporarily step past MI so BuildMI inserts after it; undone below before
  // delegating to the GFX7 implementation.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by the
      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
        // Set SC bits to indicate system scope.
        .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
      // vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  // GFX7 emits the waits (including the vmcnt(0) required by BUFFER_WBL2).
  Changed |=
      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
                                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}
1587fe6060f1SDimitry Andric 
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const158881ad6265SDimitry Andric bool SIGfx940CacheControl::enableLoadCacheBypass(
158981ad6265SDimitry Andric     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
159081ad6265SDimitry Andric     SIAtomicAddrSpace AddrSpace) const {
159181ad6265SDimitry Andric   assert(MI->mayLoad() && !MI->mayStore());
159281ad6265SDimitry Andric   bool Changed = false;
159381ad6265SDimitry Andric 
159481ad6265SDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
159581ad6265SDimitry Andric     switch (Scope) {
159681ad6265SDimitry Andric     case SIAtomicScope::SYSTEM:
159781ad6265SDimitry Andric       // Set SC bits to indicate system scope.
159881ad6265SDimitry Andric       Changed |= enableSC0Bit(MI);
159981ad6265SDimitry Andric       Changed |= enableSC1Bit(MI);
160081ad6265SDimitry Andric       break;
160181ad6265SDimitry Andric     case SIAtomicScope::AGENT:
160281ad6265SDimitry Andric       // Set SC bits to indicate agent scope.
160381ad6265SDimitry Andric       Changed |= enableSC1Bit(MI);
160481ad6265SDimitry Andric       break;
160581ad6265SDimitry Andric     case SIAtomicScope::WORKGROUP:
160681ad6265SDimitry Andric       // In threadgroup split mode the waves of a work-group can be executing on
160781ad6265SDimitry Andric       // different CUs. Therefore need to bypass the L1 which is per CU.
160881ad6265SDimitry Andric       // Otherwise in non-threadgroup split mode all waves of a work-group are
160981ad6265SDimitry Andric       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
161081ad6265SDimitry Andric       // bits to indicate work-group scope will do this automatically.
161181ad6265SDimitry Andric       Changed |= enableSC0Bit(MI);
161281ad6265SDimitry Andric       break;
161381ad6265SDimitry Andric     case SIAtomicScope::WAVEFRONT:
161481ad6265SDimitry Andric     case SIAtomicScope::SINGLETHREAD:
161581ad6265SDimitry Andric       // Leave SC bits unset to indicate wavefront scope.
161681ad6265SDimitry Andric       break;
161781ad6265SDimitry Andric     default:
161881ad6265SDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
161981ad6265SDimitry Andric     }
162081ad6265SDimitry Andric   }
162181ad6265SDimitry Andric 
162281ad6265SDimitry Andric   /// The scratch address space does not need the global memory caches
162381ad6265SDimitry Andric   /// to be bypassed as all memory operations by the same thread are
162481ad6265SDimitry Andric   /// sequentially consistent, and no other thread can access scratch
162581ad6265SDimitry Andric   /// memory.
162681ad6265SDimitry Andric 
162781ad6265SDimitry Andric   /// Other address spaces do not have a cache.
162881ad6265SDimitry Andric 
162981ad6265SDimitry Andric   return Changed;
163081ad6265SDimitry Andric }
163181ad6265SDimitry Andric 
enableStoreCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const163281ad6265SDimitry Andric bool SIGfx940CacheControl::enableStoreCacheBypass(
163381ad6265SDimitry Andric     const MachineBasicBlock::iterator &MI,
163481ad6265SDimitry Andric     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
163581ad6265SDimitry Andric   assert(!MI->mayLoad() && MI->mayStore());
163681ad6265SDimitry Andric   bool Changed = false;
163781ad6265SDimitry Andric 
163881ad6265SDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
163981ad6265SDimitry Andric     switch (Scope) {
164081ad6265SDimitry Andric     case SIAtomicScope::SYSTEM:
164181ad6265SDimitry Andric       // Set SC bits to indicate system scope.
164281ad6265SDimitry Andric       Changed |= enableSC0Bit(MI);
164381ad6265SDimitry Andric       Changed |= enableSC1Bit(MI);
164481ad6265SDimitry Andric       break;
164581ad6265SDimitry Andric     case SIAtomicScope::AGENT:
164681ad6265SDimitry Andric       // Set SC bits to indicate agent scope.
164781ad6265SDimitry Andric       Changed |= enableSC1Bit(MI);
164881ad6265SDimitry Andric       break;
164981ad6265SDimitry Andric     case SIAtomicScope::WORKGROUP:
165081ad6265SDimitry Andric       // Set SC bits to indicate workgroup scope.
165181ad6265SDimitry Andric       Changed |= enableSC0Bit(MI);
165281ad6265SDimitry Andric       break;
165381ad6265SDimitry Andric     case SIAtomicScope::WAVEFRONT:
165481ad6265SDimitry Andric     case SIAtomicScope::SINGLETHREAD:
165581ad6265SDimitry Andric       // Leave SC bits unset to indicate wavefront scope.
165681ad6265SDimitry Andric       break;
165781ad6265SDimitry Andric     default:
165881ad6265SDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
165981ad6265SDimitry Andric     }
166081ad6265SDimitry Andric   }
166181ad6265SDimitry Andric 
166281ad6265SDimitry Andric   /// The scratch address space does not need the global memory caches
166381ad6265SDimitry Andric   /// to be bypassed as all memory operations by the same thread are
166481ad6265SDimitry Andric   /// sequentially consistent, and no other thread can access scratch
166581ad6265SDimitry Andric   /// memory.
166681ad6265SDimitry Andric 
166781ad6265SDimitry Andric   /// Other address spaces do not have a cache.
166881ad6265SDimitry Andric 
166981ad6265SDimitry Andric   return Changed;
167081ad6265SDimitry Andric }
167181ad6265SDimitry Andric 
enableRMWCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const167281ad6265SDimitry Andric bool SIGfx940CacheControl::enableRMWCacheBypass(
167381ad6265SDimitry Andric     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
167481ad6265SDimitry Andric     SIAtomicAddrSpace AddrSpace) const {
167581ad6265SDimitry Andric   assert(MI->mayLoad() && MI->mayStore());
167681ad6265SDimitry Andric   bool Changed = false;
167781ad6265SDimitry Andric 
167881ad6265SDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
167981ad6265SDimitry Andric     switch (Scope) {
168081ad6265SDimitry Andric     case SIAtomicScope::SYSTEM:
168181ad6265SDimitry Andric       // Set SC1 bit to indicate system scope.
168281ad6265SDimitry Andric       Changed |= enableSC1Bit(MI);
168381ad6265SDimitry Andric       break;
168481ad6265SDimitry Andric     case SIAtomicScope::AGENT:
168581ad6265SDimitry Andric     case SIAtomicScope::WORKGROUP:
168681ad6265SDimitry Andric     case SIAtomicScope::WAVEFRONT:
168781ad6265SDimitry Andric     case SIAtomicScope::SINGLETHREAD:
168881ad6265SDimitry Andric       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
168981ad6265SDimitry Andric       // to indicate system or agent scope. The SC0 bit is used to indicate if
169081ad6265SDimitry Andric       // they are return or no-return. Leave SC1 bit unset to indicate agent
169181ad6265SDimitry Andric       // scope.
169281ad6265SDimitry Andric       break;
169381ad6265SDimitry Andric     default:
169481ad6265SDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
169581ad6265SDimitry Andric     }
169681ad6265SDimitry Andric   }
169781ad6265SDimitry Andric 
169881ad6265SDimitry Andric   return Changed;
169981ad6265SDimitry Andric }
170081ad6265SDimitry Andric 
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal,bool IsLastUse=false) const170181ad6265SDimitry Andric bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
170281ad6265SDimitry Andric     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1703*0fca6ea1SDimitry Andric     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
170481ad6265SDimitry Andric   // Only handle load and store, not atomic read-modify-write insructions. The
170581ad6265SDimitry Andric   // latter use glc to indicate if the atomic returns a result and so must not
170681ad6265SDimitry Andric   // be used for cache control.
170781ad6265SDimitry Andric   assert(MI->mayLoad() ^ MI->mayStore());
170881ad6265SDimitry Andric 
170981ad6265SDimitry Andric   // Only update load and store, not LLVM IR atomic read-modify-write
171081ad6265SDimitry Andric   // instructions. The latter are always marked as volatile so cannot sensibly
171181ad6265SDimitry Andric   // handle it as do not want to pessimize all atomics. Also they do not support
171281ad6265SDimitry Andric   // the nontemporal attribute.
171381ad6265SDimitry Andric   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
171481ad6265SDimitry Andric 
171581ad6265SDimitry Andric   bool Changed = false;
171681ad6265SDimitry Andric 
171781ad6265SDimitry Andric   if (IsVolatile) {
171881ad6265SDimitry Andric     // Set SC bits to indicate system scope.
171981ad6265SDimitry Andric     Changed |= enableSC0Bit(MI);
172081ad6265SDimitry Andric     Changed |= enableSC1Bit(MI);
172181ad6265SDimitry Andric 
172281ad6265SDimitry Andric     // Ensure operation has completed at system scope to cause all volatile
172381ad6265SDimitry Andric     // operations to be visible outside the program in a global order. Do not
172481ad6265SDimitry Andric     // request cross address space as only the global address space can be
172581ad6265SDimitry Andric     // observable outside the program, so no need to cause a waitcnt for LDS
172681ad6265SDimitry Andric     // address space operations.
172781ad6265SDimitry Andric     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
172881ad6265SDimitry Andric                           Position::AFTER);
172981ad6265SDimitry Andric 
173081ad6265SDimitry Andric     return Changed;
173181ad6265SDimitry Andric   }
173281ad6265SDimitry Andric 
173381ad6265SDimitry Andric   if (IsNonTemporal) {
173481ad6265SDimitry Andric     Changed |= enableNTBit(MI);
173581ad6265SDimitry Andric     return Changed;
173681ad6265SDimitry Andric   }
173781ad6265SDimitry Andric 
173881ad6265SDimitry Andric   return Changed;
173981ad6265SDimitry Andric }
174081ad6265SDimitry Andric 
/// Insert the cache invalidates needed to implement an acquire operation on
/// GFX940, either before (Position::BEFORE) or after (Position::AFTER)
/// instruction \p MI. Scope is expressed via the SC bits of "BUFFER_INV".
/// Returns true if any instruction was inserted.
bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  // Nothing to do when cache-invalidate insertion is disabled.
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Temporarily step past MI so BuildMI inserts after it; undone below.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Ensures that following loads will not see stale remote data or local
      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
      // due to the memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);
      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
      // does not reorder memory operations with respect to a preceding buffer
      // invalidate. The invalidate is guaranteed to remove any cache lines of
      // earlier writes and ensures later writes will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Ensures L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, but no point generating it in
        // that case if know not in that mode.
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
            // Set SC bits to indicate work-group scope.
            .addImm(AMDGPU::CPol::SC0);
        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
        // does not reorder memory operations with respect to a preceding buffer
        // invalidate. The invalidate is guaranteed to remove any cache lines of
        // earlier writes and ensures later writes will refetch the cache lines.
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Could generate "BUFFER_INV" but it would do nothing as there are no
      // caches to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
182681ad6265SDimitry Andric 
/// Insert the cache writebacks and waits needed to implement a release
/// operation on GFX940, either before (Position::BEFORE) or after
/// (Position::AFTER) instruction \p MI. Scope is expressed via the SC bits of
/// "BUFFER_WBL2". Returns true if any instruction was inserted.
bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Temporarily step past MI so BuildMI inserts after it; undone below before
  // the final insertWait.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by the
      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::SYSTEM, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);

      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::AGENT, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)".
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Do not generate "BUFFER_WBL2" as there are no caches it would
      // writeback, and would require an otherwise unnecessary
      // "S_WAITCNT vmcnt(0)".
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
  // S_WAITCNT needed.
  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}
188981ad6265SDimitry Andric 
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const18900b57cec5SDimitry Andric bool SIGfx10CacheControl::enableLoadCacheBypass(
18910b57cec5SDimitry Andric     const MachineBasicBlock::iterator &MI,
18920b57cec5SDimitry Andric     SIAtomicScope Scope,
18930b57cec5SDimitry Andric     SIAtomicAddrSpace AddrSpace) const {
18940b57cec5SDimitry Andric   assert(MI->mayLoad() && !MI->mayStore());
18950b57cec5SDimitry Andric   bool Changed = false;
18960b57cec5SDimitry Andric 
18970b57cec5SDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
18980b57cec5SDimitry Andric     switch (Scope) {
18990b57cec5SDimitry Andric     case SIAtomicScope::SYSTEM:
19000b57cec5SDimitry Andric     case SIAtomicScope::AGENT:
19014824e7fdSDimitry Andric       // Set the L0 and L1 cache policies to MISS_EVICT.
19024824e7fdSDimitry Andric       // Note: there is no L2 cache coherent bypass control at the ISA level.
19030b57cec5SDimitry Andric       Changed |= enableGLCBit(MI);
19040b57cec5SDimitry Andric       Changed |= enableDLCBit(MI);
19050b57cec5SDimitry Andric       break;
19060b57cec5SDimitry Andric     case SIAtomicScope::WORKGROUP:
19070b57cec5SDimitry Andric       // In WGP mode the waves of a work-group can be executing on either CU of
19080b57cec5SDimitry Andric       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1909e8d8bef9SDimitry Andric       // CU mode all waves of a work-group are on the same CU, and so the L0
1910e8d8bef9SDimitry Andric       // does not need to be bypassed.
1911349cc55cSDimitry Andric       if (!ST.isCuModeEnabled())
1912349cc55cSDimitry Andric         Changed |= enableGLCBit(MI);
19130b57cec5SDimitry Andric       break;
19140b57cec5SDimitry Andric     case SIAtomicScope::WAVEFRONT:
19150b57cec5SDimitry Andric     case SIAtomicScope::SINGLETHREAD:
19160b57cec5SDimitry Andric       // No cache to bypass.
19170b57cec5SDimitry Andric       break;
19180b57cec5SDimitry Andric     default:
19190b57cec5SDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
19200b57cec5SDimitry Andric     }
19210b57cec5SDimitry Andric   }
19220b57cec5SDimitry Andric 
19230b57cec5SDimitry Andric   /// The scratch address space does not need the global memory caches
19240b57cec5SDimitry Andric   /// to be bypassed as all memory operations by the same thread are
19250b57cec5SDimitry Andric   /// sequentially consistent, and no other thread can access scratch
19260b57cec5SDimitry Andric   /// memory.
19270b57cec5SDimitry Andric 
1928e8d8bef9SDimitry Andric   /// Other address spaces do not have a cache.
19290b57cec5SDimitry Andric 
19300b57cec5SDimitry Andric   return Changed;
19310b57cec5SDimitry Andric }
19320b57cec5SDimitry Andric 
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal,bool IsLastUse=false) const1933e8d8bef9SDimitry Andric bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1934e8d8bef9SDimitry Andric     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1935*0fca6ea1SDimitry Andric     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1936e8d8bef9SDimitry Andric 
1937e8d8bef9SDimitry Andric   // Only handle load and store, not atomic read-modify-write insructions. The
1938e8d8bef9SDimitry Andric   // latter use glc to indicate if the atomic returns a result and so must not
1939e8d8bef9SDimitry Andric   // be used for cache control.
19400b57cec5SDimitry Andric   assert(MI->mayLoad() ^ MI->mayStore());
1941e8d8bef9SDimitry Andric 
1942e8d8bef9SDimitry Andric   // Only update load and store, not LLVM IR atomic read-modify-write
1943e8d8bef9SDimitry Andric   // instructions. The latter are always marked as volatile so cannot sensibly
1944e8d8bef9SDimitry Andric   // handle it as do not want to pessimize all atomics. Also they do not support
1945e8d8bef9SDimitry Andric   // the nontemporal attribute.
1946e8d8bef9SDimitry Andric   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1947e8d8bef9SDimitry Andric 
19480b57cec5SDimitry Andric   bool Changed = false;
19490b57cec5SDimitry Andric 
1950e8d8bef9SDimitry Andric   if (IsVolatile) {
19514824e7fdSDimitry Andric     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
19524824e7fdSDimitry Andric     // and MISS_LRU for store instructions.
19534824e7fdSDimitry Andric     // Note: there is no L2 cache coherent bypass control at the ISA level.
1954e8d8bef9SDimitry Andric     if (Op == SIMemOp::LOAD) {
1955e8d8bef9SDimitry Andric       Changed |= enableGLCBit(MI);
1956e8d8bef9SDimitry Andric       Changed |= enableDLCBit(MI);
1957e8d8bef9SDimitry Andric     }
1958e8d8bef9SDimitry Andric 
1959e8d8bef9SDimitry Andric     // Ensure operation has completed at system scope to cause all volatile
1960e8d8bef9SDimitry Andric     // operations to be visible outside the program in a global order. Do not
1961e8d8bef9SDimitry Andric     // request cross address space as only the global address space can be
1962e8d8bef9SDimitry Andric     // observable outside the program, so no need to cause a waitcnt for LDS
1963e8d8bef9SDimitry Andric     // address space operations.
1964e8d8bef9SDimitry Andric     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1965e8d8bef9SDimitry Andric                           Position::AFTER);
19660b57cec5SDimitry Andric     return Changed;
19670b57cec5SDimitry Andric   }
19680b57cec5SDimitry Andric 
1969e8d8bef9SDimitry Andric   if (IsNonTemporal) {
19704824e7fdSDimitry Andric     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
19714824e7fdSDimitry Andric     // and L2 cache policy to STREAM.
19724824e7fdSDimitry Andric     // For stores setting both GLC and SLC configures L0 and L1 cache policy
19734824e7fdSDimitry Andric     // to MISS_EVICT and the L2 cache policy to STREAM.
19744824e7fdSDimitry Andric     if (Op == SIMemOp::STORE)
19754824e7fdSDimitry Andric       Changed |= enableGLCBit(MI);
1976e8d8bef9SDimitry Andric     Changed |= enableSLCBit(MI);
19774824e7fdSDimitry Andric 
1978e8d8bef9SDimitry Andric     return Changed;
19790b57cec5SDimitry Andric   }
19800b57cec5SDimitry Andric 
19810b57cec5SDimitry Andric   return Changed;
19820b57cec5SDimitry Andric }
19830b57cec5SDimitry Andric 
/// Insert the wait instructions needed on GFX10 to order memory operations
/// on \p AddrSpace at \p Scope. On GFX10 the vector memory counter is split:
/// vmcnt tracks loads while vscnt tracks stores, so the two are handled with
/// separate instructions. \p Pos selects whether the waits are inserted
/// before or after \p MI. Returns true if any instruction was inserted.
bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // BuildMI inserts before the iterator, so temporarily step past MI when the
  // waits must follow it; the iterator is restored before returning.
  if (Pos == Position::AFTER)
    ++MI;

  // Which counters must be waited on (to reach zero).
  bool VMCnt = false;   // vector memory loads
  bool VSCnt = false;   // vector memory stores
  bool LGKMCnt = false; // LDS/GDS/scalar memory

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode and all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  // vmcnt and lgkmcnt share the S_WAITCNT encoding; counters not being waited
  // on are set to their maximum (bit-mask) value so they impose no wait.
  // NOTE(review): the "_soft" variants appear intended for later relaxation
  // by the waitcnt-insertion machinery — confirm against SIInsertWaitcnts.
  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }

  // The store counter has its own instruction; SGPR_NULL (undef) with
  // immediate 0 waits for all outstanding stores.
  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  // Restore the caller's iterator position.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
21040b57cec5SDimitry Andric 
/// Insert the cache-invalidate instructions required on GFX10 so that an
/// acquire operation at \p Scope does not read stale data from the L0/L1
/// caches. \p Pos selects whether the invalidates are inserted before or
/// after \p MI. No-op when cache invalidation is disabled (InsertCacheInv is
/// false). Returns true if any instruction was inserted.
bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // BuildMI inserts before the iterator, so temporarily step past MI when the
  // invalidates must follow it; the iterator is restored before returning.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // The order of invalidates matter here. We must invalidate "outer in"
      // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
      // invalidated.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
      // in CU mode and all waves of a work-group are on the same CU, and so the
      // L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  // Restore the caller's iterator position.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
2162e8d8bef9SDimitry Andric 
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const216381ad6265SDimitry Andric bool SIGfx11CacheControl::enableLoadCacheBypass(
216481ad6265SDimitry Andric     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
216581ad6265SDimitry Andric     SIAtomicAddrSpace AddrSpace) const {
216681ad6265SDimitry Andric   assert(MI->mayLoad() && !MI->mayStore());
216781ad6265SDimitry Andric   bool Changed = false;
216881ad6265SDimitry Andric 
216981ad6265SDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
217081ad6265SDimitry Andric     switch (Scope) {
217181ad6265SDimitry Andric     case SIAtomicScope::SYSTEM:
217281ad6265SDimitry Andric     case SIAtomicScope::AGENT:
217381ad6265SDimitry Andric       // Set the L0 and L1 cache policies to MISS_EVICT.
217481ad6265SDimitry Andric       // Note: there is no L2 cache coherent bypass control at the ISA level.
217581ad6265SDimitry Andric       Changed |= enableGLCBit(MI);
217681ad6265SDimitry Andric       break;
217781ad6265SDimitry Andric     case SIAtomicScope::WORKGROUP:
217881ad6265SDimitry Andric       // In WGP mode the waves of a work-group can be executing on either CU of
217981ad6265SDimitry Andric       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
218081ad6265SDimitry Andric       // CU mode all waves of a work-group are on the same CU, and so the L0
218181ad6265SDimitry Andric       // does not need to be bypassed.
218281ad6265SDimitry Andric       if (!ST.isCuModeEnabled())
218381ad6265SDimitry Andric         Changed |= enableGLCBit(MI);
218481ad6265SDimitry Andric       break;
218581ad6265SDimitry Andric     case SIAtomicScope::WAVEFRONT:
218681ad6265SDimitry Andric     case SIAtomicScope::SINGLETHREAD:
218781ad6265SDimitry Andric       // No cache to bypass.
218881ad6265SDimitry Andric       break;
218981ad6265SDimitry Andric     default:
219081ad6265SDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
219181ad6265SDimitry Andric     }
219281ad6265SDimitry Andric   }
219381ad6265SDimitry Andric 
219481ad6265SDimitry Andric   /// The scratch address space does not need the global memory caches
219581ad6265SDimitry Andric   /// to be bypassed as all memory operations by the same thread are
219681ad6265SDimitry Andric   /// sequentially consistent, and no other thread can access scratch
219781ad6265SDimitry Andric   /// memory.
219881ad6265SDimitry Andric 
219981ad6265SDimitry Andric   /// Other address spaces do not have a cache.
220081ad6265SDimitry Andric 
220181ad6265SDimitry Andric   return Changed;
220281ad6265SDimitry Andric }
220381ad6265SDimitry Andric 
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal,bool IsLastUse=false) const220481ad6265SDimitry Andric bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
220581ad6265SDimitry Andric     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2206*0fca6ea1SDimitry Andric     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
220781ad6265SDimitry Andric 
220881ad6265SDimitry Andric   // Only handle load and store, not atomic read-modify-write insructions. The
220981ad6265SDimitry Andric   // latter use glc to indicate if the atomic returns a result and so must not
221081ad6265SDimitry Andric   // be used for cache control.
221181ad6265SDimitry Andric   assert(MI->mayLoad() ^ MI->mayStore());
221281ad6265SDimitry Andric 
221381ad6265SDimitry Andric   // Only update load and store, not LLVM IR atomic read-modify-write
221481ad6265SDimitry Andric   // instructions. The latter are always marked as volatile so cannot sensibly
221581ad6265SDimitry Andric   // handle it as do not want to pessimize all atomics. Also they do not support
221681ad6265SDimitry Andric   // the nontemporal attribute.
221781ad6265SDimitry Andric   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
221881ad6265SDimitry Andric 
221981ad6265SDimitry Andric   bool Changed = false;
222081ad6265SDimitry Andric 
222181ad6265SDimitry Andric   if (IsVolatile) {
222281ad6265SDimitry Andric     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
222381ad6265SDimitry Andric     // and MISS_LRU for store instructions.
222481ad6265SDimitry Andric     // Note: there is no L2 cache coherent bypass control at the ISA level.
222581ad6265SDimitry Andric     if (Op == SIMemOp::LOAD)
222681ad6265SDimitry Andric       Changed |= enableGLCBit(MI);
222781ad6265SDimitry Andric 
222881ad6265SDimitry Andric     // Set MALL NOALLOC for load and store instructions.
222981ad6265SDimitry Andric     Changed |= enableDLCBit(MI);
223081ad6265SDimitry Andric 
223181ad6265SDimitry Andric     // Ensure operation has completed at system scope to cause all volatile
223281ad6265SDimitry Andric     // operations to be visible outside the program in a global order. Do not
223381ad6265SDimitry Andric     // request cross address space as only the global address space can be
223481ad6265SDimitry Andric     // observable outside the program, so no need to cause a waitcnt for LDS
223581ad6265SDimitry Andric     // address space operations.
223681ad6265SDimitry Andric     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
223781ad6265SDimitry Andric                           Position::AFTER);
223881ad6265SDimitry Andric     return Changed;
223981ad6265SDimitry Andric   }
224081ad6265SDimitry Andric 
224181ad6265SDimitry Andric   if (IsNonTemporal) {
224281ad6265SDimitry Andric     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
224381ad6265SDimitry Andric     // and L2 cache policy to STREAM.
224481ad6265SDimitry Andric     // For stores setting both GLC and SLC configures L0 and L1 cache policy
224581ad6265SDimitry Andric     // to MISS_EVICT and the L2 cache policy to STREAM.
224681ad6265SDimitry Andric     if (Op == SIMemOp::STORE)
224781ad6265SDimitry Andric       Changed |= enableGLCBit(MI);
224881ad6265SDimitry Andric     Changed |= enableSLCBit(MI);
224981ad6265SDimitry Andric 
225081ad6265SDimitry Andric     // Set MALL NOALLOC for load and store instructions.
225181ad6265SDimitry Andric     Changed |= enableDLCBit(MI);
225281ad6265SDimitry Andric     return Changed;
225381ad6265SDimitry Andric   }
225481ad6265SDimitry Andric 
225581ad6265SDimitry Andric   return Changed;
225681ad6265SDimitry Andric }
225781ad6265SDimitry Andric 
setTH(const MachineBasicBlock::iterator MI,AMDGPU::CPol::CPol Value) const22587a6dacacSDimitry Andric bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
22597a6dacacSDimitry Andric                                 AMDGPU::CPol::CPol Value) const {
22607a6dacacSDimitry Andric   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
22617a6dacacSDimitry Andric   if (!CPol)
22627a6dacacSDimitry Andric     return false;
22637a6dacacSDimitry Andric 
22647a6dacacSDimitry Andric   uint64_t NewTH = Value & AMDGPU::CPol::TH;
22657a6dacacSDimitry Andric   if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
22667a6dacacSDimitry Andric     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
22677a6dacacSDimitry Andric     return true;
22687a6dacacSDimitry Andric   }
22697a6dacacSDimitry Andric 
22707a6dacacSDimitry Andric   return false;
22717a6dacacSDimitry Andric }
22727a6dacacSDimitry Andric 
setScope(const MachineBasicBlock::iterator MI,AMDGPU::CPol::CPol Value) const22737a6dacacSDimitry Andric bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
22747a6dacacSDimitry Andric                                    AMDGPU::CPol::CPol Value) const {
22757a6dacacSDimitry Andric   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
22767a6dacacSDimitry Andric   if (!CPol)
22777a6dacacSDimitry Andric     return false;
22787a6dacacSDimitry Andric 
22797a6dacacSDimitry Andric   uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
22807a6dacacSDimitry Andric   if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
22817a6dacacSDimitry Andric     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
22827a6dacacSDimitry Andric     return true;
22837a6dacacSDimitry Andric   }
22847a6dacacSDimitry Andric 
22857a6dacacSDimitry Andric   return false;
22867a6dacacSDimitry Andric }
22877a6dacacSDimitry Andric 
insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const2288*0fca6ea1SDimitry Andric bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2289*0fca6ea1SDimitry Andric     const MachineBasicBlock::iterator MI) const {
2290*0fca6ea1SDimitry Andric   // TODO: implement flag for frontend to give us a hint not to insert waits.
2291*0fca6ea1SDimitry Andric 
2292*0fca6ea1SDimitry Andric   MachineBasicBlock &MBB = *MI->getParent();
2293*0fca6ea1SDimitry Andric   const DebugLoc &DL = MI->getDebugLoc();
2294*0fca6ea1SDimitry Andric 
2295*0fca6ea1SDimitry Andric   BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2296*0fca6ea1SDimitry Andric   BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2297*0fca6ea1SDimitry Andric   BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2298*0fca6ea1SDimitry Andric   BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2299*0fca6ea1SDimitry Andric   BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2300*0fca6ea1SDimitry Andric 
2301*0fca6ea1SDimitry Andric   return true;
2302*0fca6ea1SDimitry Andric }
2303*0fca6ea1SDimitry Andric 
insertWait(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsCrossAddrSpaceOrdering,Position Pos) const23047a6dacacSDimitry Andric bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
23057a6dacacSDimitry Andric                                      SIAtomicScope Scope,
23067a6dacacSDimitry Andric                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
23077a6dacacSDimitry Andric                                      bool IsCrossAddrSpaceOrdering,
23087a6dacacSDimitry Andric                                      Position Pos) const {
23097a6dacacSDimitry Andric   bool Changed = false;
23107a6dacacSDimitry Andric 
23117a6dacacSDimitry Andric   MachineBasicBlock &MBB = *MI->getParent();
23127a6dacacSDimitry Andric   DebugLoc DL = MI->getDebugLoc();
23137a6dacacSDimitry Andric 
23147a6dacacSDimitry Andric   bool LOADCnt = false;
23157a6dacacSDimitry Andric   bool DSCnt = false;
23167a6dacacSDimitry Andric   bool STORECnt = false;
23177a6dacacSDimitry Andric 
23187a6dacacSDimitry Andric   if (Pos == Position::AFTER)
23197a6dacacSDimitry Andric     ++MI;
23207a6dacacSDimitry Andric 
23217a6dacacSDimitry Andric   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
23227a6dacacSDimitry Andric       SIAtomicAddrSpace::NONE) {
23237a6dacacSDimitry Andric     switch (Scope) {
23247a6dacacSDimitry Andric     case SIAtomicScope::SYSTEM:
23257a6dacacSDimitry Andric     case SIAtomicScope::AGENT:
23267a6dacacSDimitry Andric       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
23277a6dacacSDimitry Andric         LOADCnt |= true;
23287a6dacacSDimitry Andric       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
23297a6dacacSDimitry Andric         STORECnt |= true;
23307a6dacacSDimitry Andric       break;
23317a6dacacSDimitry Andric     case SIAtomicScope::WORKGROUP:
23327a6dacacSDimitry Andric       // In WGP mode the waves of a work-group can be executing on either CU of
23337a6dacacSDimitry Andric       // the WGP. Therefore need to wait for operations to complete to ensure
23347a6dacacSDimitry Andric       // they are visible to waves in the other CU as the L0 is per CU.
23357a6dacacSDimitry Andric       // Otherwise in CU mode and all waves of a work-group are on the same CU
23367a6dacacSDimitry Andric       // which shares the same L0.
23377a6dacacSDimitry Andric       if (!ST.isCuModeEnabled()) {
23387a6dacacSDimitry Andric         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
23397a6dacacSDimitry Andric           LOADCnt |= true;
23407a6dacacSDimitry Andric         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
23417a6dacacSDimitry Andric           STORECnt |= true;
23427a6dacacSDimitry Andric       }
23437a6dacacSDimitry Andric       break;
23447a6dacacSDimitry Andric     case SIAtomicScope::WAVEFRONT:
23457a6dacacSDimitry Andric     case SIAtomicScope::SINGLETHREAD:
23467a6dacacSDimitry Andric       // The L0 cache keeps all memory operations in order for
23477a6dacacSDimitry Andric       // work-items in the same wavefront.
23487a6dacacSDimitry Andric       break;
23497a6dacacSDimitry Andric     default:
23507a6dacacSDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
23517a6dacacSDimitry Andric     }
23527a6dacacSDimitry Andric   }
23537a6dacacSDimitry Andric 
23547a6dacacSDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
23557a6dacacSDimitry Andric     switch (Scope) {
23567a6dacacSDimitry Andric     case SIAtomicScope::SYSTEM:
23577a6dacacSDimitry Andric     case SIAtomicScope::AGENT:
23587a6dacacSDimitry Andric     case SIAtomicScope::WORKGROUP:
23597a6dacacSDimitry Andric       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
23607a6dacacSDimitry Andric       // not needed as LDS operations for all waves are executed in a total
23617a6dacacSDimitry Andric       // global ordering as observed by all waves. Required if also
23627a6dacacSDimitry Andric       // synchronizing with global/GDS memory as LDS operations could be
23637a6dacacSDimitry Andric       // reordered with respect to later global/GDS memory operations of the
23647a6dacacSDimitry Andric       // same wave.
23657a6dacacSDimitry Andric       DSCnt |= IsCrossAddrSpaceOrdering;
23667a6dacacSDimitry Andric       break;
23677a6dacacSDimitry Andric     case SIAtomicScope::WAVEFRONT:
23687a6dacacSDimitry Andric     case SIAtomicScope::SINGLETHREAD:
23697a6dacacSDimitry Andric       // The LDS keeps all memory operations in order for
23707a6dacacSDimitry Andric       // the same wavefront.
23717a6dacacSDimitry Andric       break;
23727a6dacacSDimitry Andric     default:
23737a6dacacSDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
23747a6dacacSDimitry Andric     }
23757a6dacacSDimitry Andric   }
23767a6dacacSDimitry Andric 
23777a6dacacSDimitry Andric   if (LOADCnt) {
23787a6dacacSDimitry Andric     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
23797a6dacacSDimitry Andric     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
23807a6dacacSDimitry Andric     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
23817a6dacacSDimitry Andric     Changed = true;
23827a6dacacSDimitry Andric   }
23837a6dacacSDimitry Andric 
23847a6dacacSDimitry Andric   if (STORECnt) {
23857a6dacacSDimitry Andric     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
23867a6dacacSDimitry Andric     Changed = true;
23877a6dacacSDimitry Andric   }
23887a6dacacSDimitry Andric 
23897a6dacacSDimitry Andric   if (DSCnt) {
23907a6dacacSDimitry Andric     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
23917a6dacacSDimitry Andric     Changed = true;
23927a6dacacSDimitry Andric   }
23937a6dacacSDimitry Andric 
23947a6dacacSDimitry Andric   if (Pos == Position::AFTER)
23957a6dacacSDimitry Andric     --MI;
23967a6dacacSDimitry Andric 
23977a6dacacSDimitry Andric   return Changed;
23987a6dacacSDimitry Andric }
23997a6dacacSDimitry Andric 
insertAcquire(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,Position Pos) const24001db9f3b2SDimitry Andric bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
24011db9f3b2SDimitry Andric                                         SIAtomicScope Scope,
24021db9f3b2SDimitry Andric                                         SIAtomicAddrSpace AddrSpace,
24031db9f3b2SDimitry Andric                                         Position Pos) const {
24041db9f3b2SDimitry Andric   if (!InsertCacheInv)
24051db9f3b2SDimitry Andric     return false;
24061db9f3b2SDimitry Andric 
24071db9f3b2SDimitry Andric   MachineBasicBlock &MBB = *MI->getParent();
24081db9f3b2SDimitry Andric   DebugLoc DL = MI->getDebugLoc();
24091db9f3b2SDimitry Andric 
24101db9f3b2SDimitry Andric   /// The scratch address space does not need the global memory cache
24111db9f3b2SDimitry Andric   /// to be flushed as all memory operations by the same thread are
24121db9f3b2SDimitry Andric   /// sequentially consistent, and no other thread can access scratch
24131db9f3b2SDimitry Andric   /// memory.
24141db9f3b2SDimitry Andric 
24151db9f3b2SDimitry Andric   /// Other address spaces do not have a cache.
24161db9f3b2SDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
24171db9f3b2SDimitry Andric     return false;
24181db9f3b2SDimitry Andric 
24191db9f3b2SDimitry Andric   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
24201db9f3b2SDimitry Andric   switch (Scope) {
24211db9f3b2SDimitry Andric   case SIAtomicScope::SYSTEM:
24221db9f3b2SDimitry Andric     ScopeImm = AMDGPU::CPol::SCOPE_SYS;
24231db9f3b2SDimitry Andric     break;
24241db9f3b2SDimitry Andric   case SIAtomicScope::AGENT:
24251db9f3b2SDimitry Andric     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
24261db9f3b2SDimitry Andric     break;
24271db9f3b2SDimitry Andric   case SIAtomicScope::WORKGROUP:
24281db9f3b2SDimitry Andric     // In WGP mode the waves of a work-group can be executing on either CU of
24291db9f3b2SDimitry Andric     // the WGP. Therefore we need to invalidate the L0 which is per CU.
24301db9f3b2SDimitry Andric     // Otherwise in CU mode all waves of a work-group are on the same CU, and so
24311db9f3b2SDimitry Andric     // the L0 does not need to be invalidated.
24321db9f3b2SDimitry Andric     if (ST.isCuModeEnabled())
24331db9f3b2SDimitry Andric       return false;
24341db9f3b2SDimitry Andric 
24351db9f3b2SDimitry Andric     ScopeImm = AMDGPU::CPol::SCOPE_SE;
24361db9f3b2SDimitry Andric     break;
24371db9f3b2SDimitry Andric   case SIAtomicScope::WAVEFRONT:
24381db9f3b2SDimitry Andric   case SIAtomicScope::SINGLETHREAD:
24391db9f3b2SDimitry Andric     // No cache to invalidate.
24401db9f3b2SDimitry Andric     return false;
24411db9f3b2SDimitry Andric   default:
24421db9f3b2SDimitry Andric     llvm_unreachable("Unsupported synchronization scope");
24431db9f3b2SDimitry Andric   }
24441db9f3b2SDimitry Andric 
24451db9f3b2SDimitry Andric   if (Pos == Position::AFTER)
24461db9f3b2SDimitry Andric     ++MI;
24471db9f3b2SDimitry Andric 
24481db9f3b2SDimitry Andric   BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
24491db9f3b2SDimitry Andric 
24501db9f3b2SDimitry Andric   if (Pos == Position::AFTER)
24511db9f3b2SDimitry Andric     --MI;
24521db9f3b2SDimitry Andric 
24531db9f3b2SDimitry Andric   return true;
24541db9f3b2SDimitry Andric }
24551db9f3b2SDimitry Andric 
/// Insert the release half of an atomic ordering on GFX12: a GLOBAL_WB
/// writeback (when \p Scope requires one) followed by the waits that make
/// prior memory operations visible, placed before or after \p MI per \p Pos.
/// Returns true whenever the GLOBAL address space is involved, since the
/// trailing insertWait is performed unconditionally in that case.
bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        bool IsCrossAddrSpaceOrdering,
                                        Position Pos) const {
  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // The scratch address space does not need the global memory cache
  // writeback as all memory operations by the same thread are
  // sequentially consistent, and no other thread can access scratch
  // memory.

  // Other address spaces do not have a cache.
  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
    return false;

  // Temporarily step past MI so the new instructions are emitted after it;
  // the matching --MI below restores the caller's iterator.
  if (Pos == Position::AFTER)
    ++MI;

  // GLOBAL_WB is always needed, even for write-through caches, as it
  // additionally ensures all operations have reached the desired cache level.
  bool SkipWB = false;
  AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
  switch (Scope) {
  case SIAtomicScope::SYSTEM:
    ScopeImm = AMDGPU::CPol::SCOPE_SYS;
    break;
  case SIAtomicScope::AGENT:
    ScopeImm = AMDGPU::CPol::SCOPE_DEV;
    break;
  case SIAtomicScope::WORKGROUP:
    // In WGP mode the waves of a work-group can be executing on either CU of
    // the WGP. Therefore we need to ensure all operations have reached L1,
    // hence the SCOPE_SE WB.
    // For CU mode, we need operations to reach L0, so the wait is enough -
    // there are no ways for an operation to report completion without reaching
    // at least L0.
    if (ST.isCuModeEnabled())
      SkipWB = true;
    else
      ScopeImm = AMDGPU::CPol::SCOPE_SE;
    break;
  case SIAtomicScope::WAVEFRONT:
  case SIAtomicScope::SINGLETHREAD:
    // No cache to invalidate.
    return false;
  default:
    llvm_unreachable("Unsupported synchronization scope");
  }

  if (!SkipWB)
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm);

  if (Pos == Position::AFTER)
    --MI;

  // We always have to wait for previous memory operations (load/store) to
  // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
  // we of course need to wait for that as well.
  insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
             IsCrossAddrSpaceOrdering, Pos);

  return true;
}
2521*0fca6ea1SDimitry Andric 
/// Apply the GFX12 cache-policy bits for volatile, nontemporal and last-use
/// loads/stores, and insert the waits that give volatile accesses
/// system-scope visibility. Returns true if the instruction was modified.
bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {

  // Only handle load and store, not atomic read-modify-write instructions.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // handle it as do not want to pessimize all atomics. Also they do not support
  // the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  // The TH field can only carry one temporal hint; last-use wins over
  // nontemporal when both are requested.
  if (IsLastUse) {
    // Set last-use hint.
    Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
  } else if (IsNonTemporal) {
    // Set non-temporal hint for all cache levels.
    Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
  }

  if (IsVolatile) {
    Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);

    // System-scope stores additionally need waits emitted ahead of them.
    if (Op == SIMemOp::STORE)
      Changed |= insertWaitsBeforeSystemScopeStore(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
  }

  return Changed;
}
25627a6dacacSDimitry Andric 
expandSystemScopeStore(MachineBasicBlock::iterator & MI) const2563*0fca6ea1SDimitry Andric bool SIGfx12CacheControl::expandSystemScopeStore(
2564*0fca6ea1SDimitry Andric     MachineBasicBlock::iterator &MI) const {
2565*0fca6ea1SDimitry Andric   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2566*0fca6ea1SDimitry Andric   if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2567*0fca6ea1SDimitry Andric     return insertWaitsBeforeSystemScopeStore(MI);
2568*0fca6ea1SDimitry Andric 
2569*0fca6ea1SDimitry Andric   return false;
2570*0fca6ea1SDimitry Andric }
2571*0fca6ea1SDimitry Andric 
setAtomicScope(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const2572*0fca6ea1SDimitry Andric bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2573*0fca6ea1SDimitry Andric                                          SIAtomicScope Scope,
2574*0fca6ea1SDimitry Andric                                          SIAtomicAddrSpace AddrSpace) const {
2575*0fca6ea1SDimitry Andric   bool Changed = false;
2576*0fca6ea1SDimitry Andric 
2577*0fca6ea1SDimitry Andric   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2578*0fca6ea1SDimitry Andric     switch (Scope) {
2579*0fca6ea1SDimitry Andric     case SIAtomicScope::SYSTEM:
2580*0fca6ea1SDimitry Andric       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2581*0fca6ea1SDimitry Andric       break;
2582*0fca6ea1SDimitry Andric     case SIAtomicScope::AGENT:
2583*0fca6ea1SDimitry Andric       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2584*0fca6ea1SDimitry Andric       break;
2585*0fca6ea1SDimitry Andric     case SIAtomicScope::WORKGROUP:
2586*0fca6ea1SDimitry Andric       // In workgroup mode, SCOPE_SE is needed as waves can executes on
2587*0fca6ea1SDimitry Andric       // different CUs that access different L0s.
2588*0fca6ea1SDimitry Andric       if (!ST.isCuModeEnabled())
2589*0fca6ea1SDimitry Andric         Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2590*0fca6ea1SDimitry Andric       break;
2591*0fca6ea1SDimitry Andric     case SIAtomicScope::WAVEFRONT:
2592*0fca6ea1SDimitry Andric     case SIAtomicScope::SINGLETHREAD:
2593*0fca6ea1SDimitry Andric       // No cache to bypass.
2594*0fca6ea1SDimitry Andric       break;
2595*0fca6ea1SDimitry Andric     default:
2596*0fca6ea1SDimitry Andric       llvm_unreachable("Unsupported synchronization scope");
2597*0fca6ea1SDimitry Andric     }
2598*0fca6ea1SDimitry Andric   }
2599*0fca6ea1SDimitry Andric 
2600*0fca6ea1SDimitry Andric   // The scratch address space does not need the global memory caches
2601*0fca6ea1SDimitry Andric   // to be bypassed as all memory operations by the same thread are
2602*0fca6ea1SDimitry Andric   // sequentially consistent, and no other thread can access scratch
2603*0fca6ea1SDimitry Andric   // memory.
2604*0fca6ea1SDimitry Andric 
2605*0fca6ea1SDimitry Andric   // Other address spaces do not have a cache.
2606*0fca6ea1SDimitry Andric 
2607*0fca6ea1SDimitry Andric   return Changed;
2608*0fca6ea1SDimitry Andric }
2609*0fca6ea1SDimitry Andric 
removeAtomicPseudoMIs()26100b57cec5SDimitry Andric bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
26110b57cec5SDimitry Andric   if (AtomicPseudoMIs.empty())
26120b57cec5SDimitry Andric     return false;
26130b57cec5SDimitry Andric 
26140b57cec5SDimitry Andric   for (auto &MI : AtomicPseudoMIs)
26150b57cec5SDimitry Andric     MI->eraseFromParent();
26160b57cec5SDimitry Andric 
26170b57cec5SDimitry Andric   AtomicPseudoMIs.clear();
26180b57cec5SDimitry Andric   return true;
26190b57cec5SDimitry Andric }
26200b57cec5SDimitry Andric 
expandLoad(const SIMemOpInfo & MOI,MachineBasicBlock::iterator & MI)26210b57cec5SDimitry Andric bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
26220b57cec5SDimitry Andric                                    MachineBasicBlock::iterator &MI) {
26230b57cec5SDimitry Andric   assert(MI->mayLoad() && !MI->mayStore());
26240b57cec5SDimitry Andric 
26250b57cec5SDimitry Andric   bool Changed = false;
26260b57cec5SDimitry Andric 
26270b57cec5SDimitry Andric   if (MOI.isAtomic()) {
26280b57cec5SDimitry Andric     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
26290b57cec5SDimitry Andric         MOI.getOrdering() == AtomicOrdering::Acquire ||
26300b57cec5SDimitry Andric         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
26310b57cec5SDimitry Andric       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
26320b57cec5SDimitry Andric                                            MOI.getOrderingAddrSpace());
26330b57cec5SDimitry Andric     }
26340b57cec5SDimitry Andric 
26350b57cec5SDimitry Andric     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
26360b57cec5SDimitry Andric       Changed |= CC->insertWait(MI, MOI.getScope(),
26370b57cec5SDimitry Andric                                 MOI.getOrderingAddrSpace(),
26380b57cec5SDimitry Andric                                 SIMemOp::LOAD | SIMemOp::STORE,
26390b57cec5SDimitry Andric                                 MOI.getIsCrossAddressSpaceOrdering(),
26400b57cec5SDimitry Andric                                 Position::BEFORE);
26410b57cec5SDimitry Andric 
26420b57cec5SDimitry Andric     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
26430b57cec5SDimitry Andric         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
26440b57cec5SDimitry Andric       Changed |= CC->insertWait(MI, MOI.getScope(),
26450b57cec5SDimitry Andric                                 MOI.getInstrAddrSpace(),
26460b57cec5SDimitry Andric                                 SIMemOp::LOAD,
26470b57cec5SDimitry Andric                                 MOI.getIsCrossAddressSpaceOrdering(),
26480b57cec5SDimitry Andric                                 Position::AFTER);
2649e8d8bef9SDimitry Andric       Changed |= CC->insertAcquire(MI, MOI.getScope(),
26500b57cec5SDimitry Andric                                    MOI.getOrderingAddrSpace(),
26510b57cec5SDimitry Andric                                    Position::AFTER);
26520b57cec5SDimitry Andric     }
26530b57cec5SDimitry Andric 
26540b57cec5SDimitry Andric     return Changed;
26550b57cec5SDimitry Andric   }
26560b57cec5SDimitry Andric 
2657e8d8bef9SDimitry Andric   // Atomic instructions already bypass caches to the scope specified by the
2658*0fca6ea1SDimitry Andric   // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2659*0fca6ea1SDimitry Andric   // instructions need additional treatment.
2660*0fca6ea1SDimitry Andric   Changed |= CC->enableVolatileAndOrNonTemporal(
2661*0fca6ea1SDimitry Andric       MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2662*0fca6ea1SDimitry Andric       MOI.isNonTemporal(), MOI.isLastUse());
2663*0fca6ea1SDimitry Andric 
26640b57cec5SDimitry Andric   return Changed;
26650b57cec5SDimitry Andric }
26660b57cec5SDimitry Andric 
expandStore(const SIMemOpInfo & MOI,MachineBasicBlock::iterator & MI)26670b57cec5SDimitry Andric bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
26680b57cec5SDimitry Andric                                     MachineBasicBlock::iterator &MI) {
26690b57cec5SDimitry Andric   assert(!MI->mayLoad() && MI->mayStore());
26700b57cec5SDimitry Andric 
26710b57cec5SDimitry Andric   bool Changed = false;
26720b57cec5SDimitry Andric 
26730b57cec5SDimitry Andric   if (MOI.isAtomic()) {
2674fe6060f1SDimitry Andric     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2675fe6060f1SDimitry Andric         MOI.getOrdering() == AtomicOrdering::Release ||
2676fe6060f1SDimitry Andric         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2677fe6060f1SDimitry Andric       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2678fe6060f1SDimitry Andric                                             MOI.getOrderingAddrSpace());
2679fe6060f1SDimitry Andric     }
2680fe6060f1SDimitry Andric 
26810b57cec5SDimitry Andric     if (MOI.getOrdering() == AtomicOrdering::Release ||
26820b57cec5SDimitry Andric         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2683e8d8bef9SDimitry Andric       Changed |= CC->insertRelease(MI, MOI.getScope(),
26840b57cec5SDimitry Andric                                    MOI.getOrderingAddrSpace(),
26850b57cec5SDimitry Andric                                    MOI.getIsCrossAddressSpaceOrdering(),
26860b57cec5SDimitry Andric                                    Position::BEFORE);
26870b57cec5SDimitry Andric 
26880b57cec5SDimitry Andric     return Changed;
26890b57cec5SDimitry Andric   }
26900b57cec5SDimitry Andric 
2691e8d8bef9SDimitry Andric   // Atomic instructions already bypass caches to the scope specified by the
2692e8d8bef9SDimitry Andric   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2693e8d8bef9SDimitry Andric   // need additional treatment.
2694e8d8bef9SDimitry Andric   Changed |= CC->enableVolatileAndOrNonTemporal(
2695e8d8bef9SDimitry Andric       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2696e8d8bef9SDimitry Andric       MOI.isNonTemporal());
2697*0fca6ea1SDimitry Andric 
2698*0fca6ea1SDimitry Andric   // GFX12 specific, scope(desired coherence domain in cache hierarchy) is
2699*0fca6ea1SDimitry Andric   // instruction field, do not confuse it with atomic scope.
2700*0fca6ea1SDimitry Andric   Changed |= CC->expandSystemScopeStore(MI);
27010b57cec5SDimitry Andric   return Changed;
27020b57cec5SDimitry Andric }
27030b57cec5SDimitry Andric 
/// Expand an ATOMIC_FENCE pseudo into the waits, cache write-backs and
/// invalidates its ordering requires. The pseudo itself is queued on
/// AtomicPseudoMIs and erased later by removeAtomicPseudoMIs().
bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  // Record the pseudo for deletion once the real synchronization
  // instructions have been inserted around it.
  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  // Refine fenced address space based on MMRAs.
  //
  // TODO: Should we support this MMRA on other atomic operations?
  auto OrderingAddrSpace =
      getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());

  if (MOI.isAtomic()) {
    // Acquire fences wait for all prior loads and stores to complete.
    if (MOI.getOrdering() == AtomicOrdering::Acquire)
      Changed |= CC->insertWait(
          MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
          MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the proceeding LDS operations. If barrier had a memory
      /// ordering and memory scope, then library does not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be combined
    // to use the single "BUFFER_WBINV*" instruction. This could be done by
    // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
    // track cache invalidate and write back instructions.

    // Acquire-flavored fences additionally invalidate caches so subsequent
    // loads observe up-to-date data.
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}
27530b57cec5SDimitry Andric 
/// Expand an atomic cmpxchg or read-modify-write instruction: bypass caches
/// to the requested scope, insert release semantics before and acquire
/// semantics after, honoring both the success and failure orderings.
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
  MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    // Bypass caches up to the requested scope for any real atomic ordering.
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    // Release semantics (including a seq_cst failure ordering) require prior
    // memory operations to be made visible before the RMW executes.
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // Acquire semantics (on either the success or failure ordering): wait for
    // the RMW itself to complete — its load for returning forms, its store
    // otherwise — then invalidate caches for subsequent accesses.
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}
28000b57cec5SDimitry Andric 
/// Walk every instruction in \p MF and expand the memory-model obligations of
/// loads, stores, fences and read-modify-write atomics through the
/// target-specific cache-control object. Returns true if any code changed.
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  const MachineModuleInfo &MMI =
      getAnalysis<MachineModuleInfoWrapperPass>().getMMI();

  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        // Detach every bundled instruction from its predecessor and clear the
        // internal-read flags so each becomes a standalone instruction. Note
        // ++II leaves II pointing at the first instruction inside the bundle.
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        // Erase the bundle header and resume iteration at what was the first
        // bundled instruction.
        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      // Classify the instruction and dispatch to the matching expansion.
      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(*MOI, MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI)) {
        Changed |= expandStore(*MOI, MI);
        Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
      } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(*MOI, MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
    }
  }

  // Fence pseudos were only queued during expansion; delete them last.
  Changed |= removeAtomicPseudoMIs();
  return Changed;
}
28460b57cec5SDimitry Andric 
// Register the pass with LLVM's pass infrastructure.
INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

// The address of ID (not its value) uniquely identifies the pass;
// SIMemoryLegalizerID is the handle other code uses to refer to it.
char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
28510b57cec5SDimitry Andric 
/// Factory entry point used when building the AMDGPU codegen pipeline;
/// ownership of the returned pass passes to the caller (the pass manager).
FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}
2855