//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements the memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};
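
// For example (illustrative only), a read-modify-write operation combines
// both flags, as insertRelease below does:
//   SIMemOp Op = SIMemOp::LOAD | SIMemOp::STORE;
//   bool IsLoad = (Op & SIMemOp::LOAD) != SIMemOp::NONE; // true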

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
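
// A minimal illustration of how these masks are queried throughout this
// file: a FLAT access covers GLOBAL, LDS and SCRATCH, so testing any one of
// those bits against it succeeds:
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::FLAT;
//   if ((AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
//     // Taken: the instruction may access global memory.
//   }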

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;
  bool IsLastUse = false;

  SIMemOpInfo(
      AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
      SIAtomicScope Scope = SIAtomicScope::SYSTEM,
      SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
      SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
      bool IsCrossAddressSpaceOrdering = true,
      AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
      bool IsVolatile = false, bool IsNonTemporal = false,
      bool IsLastUse = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
        OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
        IsLastUse(IsLastUse) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }
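
  // Worked example (illustrative): for an LDS-only instruction,
  // InstrAddrSpace == SIAtomicAddrSpace::LDS, so the second test above
  // matches and a requested SYSTEM scope is clamped to WORKGROUP. This is
  // sound because LDS is only shared within a single work-group, so no wider
  // scope is observable through it.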

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is a last use, false otherwise.
  bool isLastUse() const { return IsLastUse; }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  const AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns The bit set of address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or
  /// nontemporal/last-use. Return true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal,
                                              bool IsLastUse = false) const = 0;

  /// Performs any target-specific expansion of system scope store \p MI.
  /// Returns true iff the instruction was modified. The default
  /// implementation does nothing.
  virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
    return false;
  }

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering, Position Pos,
                          AtomicOrdering Order) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;
};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }

public:
  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;
};

class SIGfx12CacheControl : public SIGfx11CacheControl {
protected:
  // Sets TH policy to \p Value if CPol operand is present in instruction
  // \p MI. \returns True if \p MI is modified, false otherwise.
  bool setTH(const MachineBasicBlock::iterator MI,
             AMDGPU::CPol::CPol Value) const;
  // Sets Scope policy to \p Value if CPol operand is present in instruction
  // \p MI. \returns True if \p MI is modified, false otherwise.
  bool setScope(const MachineBasicBlock::iterator MI,
                AMDGPU::CPol::CPol Value) const;

  // Stores with system scope (SCOPE_SYS) need to wait for:
  // - loads or atomics (returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
  // - non-returning atomics        - wait for STORECNT==0
  //   TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
  //   since it does not distinguish atomics-with-return from regular stores.
  // There is no need to wait if memory is cached (mtype != UC).
  bool
  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;

  bool setAtomicScope(const MachineBasicBlock::iterator &MI,
                      SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;

public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }
};
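
// On gfx12 the legacy GLC/SLC/DLC bits are replaced by separate "th"
// (temporal hint) and "scope" cache-policy fields, manipulated by setTH and
// setScope above. As a rough, illustrative sketch of the resulting assembly
// (exact syntax may vary):
//   global_load_b32 v0, v[0:1], off th:TH_LOAD_NT scope:SCOPE_SYS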

class SIMemoryLegalizer final {
private:
  const MachineModuleInfo &MMI;
  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {}
  bool run(MachineFunction &MF);
};

class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
public:
  static char ID;

  SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

static const StringMap<SIAtomicAddrSpace> ASNames = {{
    {"global", SIAtomicAddrSpace::GLOBAL},
    {"local", SIAtomicAddrSpace::LDS},
}};

void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
  const MachineFunction *MF = MI.getMF();
  const Function &Fn = MF->getFunction();
  SmallString<128> Str;
  raw_svector_ostream OS(Str);
  OS << "unknown address space '" << AS << "'; expected one of ";
  ListSeparator LS;
  for (const auto &[Name, Val] : ASNames)
    OS << LS << '\'' << Name << '\'';
  Fn.getContext().diagnose(
      DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
}

/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
/// If this tag isn't present, or if it has no meaningful values, returns \p
/// Default. Otherwise returns all the address spaces concerned by the MMRA.
static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
                                               SIAtomicAddrSpace Default) {
  static constexpr StringLiteral FenceASPrefix = "amdgpu-as";

  auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
  if (!MMRA)
    return Default;

  SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
  for (const auto &[Prefix, Suffix] : MMRA) {
    if (Prefix != FenceASPrefix)
      continue;

    if (auto It = ASNames.find(Suffix); It != ASNames.end())
      Result |= It->second;
    else
      diagnoseUnknownMMRAASName(MI, Suffix);
  }

  return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
}
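
// For illustration, an IR fence carrying this MMRA (metadata numbering
// arbitrary) restricts the fence to the local (LDS) address space only:
//   fence syncscope("workgroup") release, !mmra !0
//   !0 = !{!"amdgpu-as", !"local"}
// Without the annotation the fence conservatively orders all the address
// spaces in \p Default.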

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  Func.getContext().diagnose(
      DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
}

std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getAgentSSID())
    return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getWavefrontSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == SyncScope::SingleThread)
    return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SYSTEM,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::AGENT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WORKGROUP,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SINGLETHREAD,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  return std::nullopt;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
    : MMI(&MMI_) {}

std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true; // Cleared below unless every MMO is nontemporal.
  bool IsVolatile = false;
  bool IsLastUse = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    IsLastUse |= MMO->getFlags() & MOLastUse;
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return std::nullopt;
      }

      SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return std::nullopt;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        *ScopeOrNone;
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return std::nullopt;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal, IsLastUse);
}
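
// Worked example (illustrative): a FLAT atomic load with a single monotonic
// MMO at agent scope yields Ordering = Monotonic, Scope = AGENT,
// InstrAddrSpace = FLAT and OrderingAddrSpace = ATOMIC, with
// IsCrossAddressSpaceOrdering = true because the ordering address space
// spans several address spaces.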

std::optional<SIMemOpInfo>
SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return std::nullopt;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return std::nullopt;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      *ScopeOrNone;

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return std::nullopt;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}
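
// Illustrative sketch: for a MUBUF load whose cpol immediate is currently 0,
// enableNamedBit(MI, AMDGPU::CPol::GLC) ORs in the GLC bit, which later
// prints as the "glc" assembly modifier:
//   before: buffer_load_dword v0, v1, s[4:7], 0 offen
//   after:  buffer_load_dword v0, v1, s[4:7], 0 offen glc
// Instructions without a cpol operand are left unchanged.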

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX11)
    return std::make_unique<SIGfx10CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX12)
    return std::make_unique<SIGfx11CacheControl>(ST);
  return std::make_unique<SIGfx12CacheControl>(ST);
}
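
// For example (derived from the checks above): gfx900 selects
// SIGfx7CacheControl (GFX9 is below GFX10), gfx90a and gfx940 are caught by
// their feature checks before the generation tests, and gfx1200 falls
// through to SIGfx12CacheControl.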

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result, so glc
  // must not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot be
  // handled sensibly here without pessimizing all atomics. Also, they do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER, AtomicOrdering::Unordered);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}
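
// Rough sketch of the net effect on gfx6 (instruction forms illustrative):
// a volatile global load becomes
//   buffer_load_dword v0, ... glc
//   s_waitcnt vmcnt(0)
// while a nontemporal load or store gets both bits:
//   buffer_load_dword v0, ... glc slc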

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering, Position Pos,
                                    AtomicOrdering Order) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
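
// Concrete illustration: for Scope = AGENT, AddrSpace = GLOBAL | LDS and
// IsCrossAddrSpaceOrdering = true, both VMCnt and LGKMCnt are set, and the
// soft waitcnt later prints as:
//   s_waitcnt vmcnt(0) lgkmcnt(0)
// Counters that are not waited on keep their maximum bit mask, which encodes
// "no wait" for that counter.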

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
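
// Sketch of the resulting pattern for an agent-scope acquire load on gfx6
// (assembly illustrative):
//   buffer_load_dword v0, ... glc   ; the acquire load itself
//   s_waitcnt vmcnt(0)              ; inserted by insertWait
//   buffer_wbinvl1                  ; inserted here, invalidates the L1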

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore we need to bypass the L1, which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}
1378 
1379 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1380     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1381     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1382   // Only handle load and store, not atomic read-modify-write instructions. The
1383   // latter use glc to indicate if the atomic returns a result and so must not
1384   // be used for cache control.
1385   assert(MI->mayLoad() ^ MI->mayStore());
1386 
1387   // Only update load and store, not LLVM IR atomic read-modify-write
1388   // instructions. The latter are always marked as volatile, so they cannot be
1389   // handled sensibly here without pessimizing all atomics. They also do not support
1390   // the nontemporal attribute.
1391   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1392 
1393   bool Changed = false;
1394 
1395   if (IsVolatile) {
1396     // Set L1 cache policy to be MISS_EVICT for load instructions
1397     // and MISS_LRU for store instructions.
1398     // Note: there is no L2 cache bypass policy at the ISA level.
1399     if (Op == SIMemOp::LOAD)
1400       Changed |= enableGLCBit(MI);
1401 
1402     // Ensure operation has completed at system scope to cause all volatile
1403     // operations to be visible outside the program in a global order. Do not
1404     // request cross address space as only the global address space can be
1405     // observable outside the program, so no need to cause a waitcnt for LDS
1406     // address space operations.
1407     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1408                           Position::AFTER, AtomicOrdering::Unordered);
1409 
1410     return Changed;
1411   }
1412 
1413   if (IsNonTemporal) {
1414     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1415     // for both loads and stores, and the L2 cache policy to STREAM.
1416     Changed |= enableGLCBit(MI);
1417     Changed |= enableSLCBit(MI);
1418     return Changed;
1419   }
1420 
1421   return Changed;
1422 }
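
// Illustrative sketch, assuming a volatile global load on GFX90A: the glc bit
// set above plus the trailing wait yield approximately
//   global_load_dword v0, v[0:1], off glc
//   s_waitcnt vmcnt(0)
// while a nontemporal access instead has both glc and slc set.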
1423 
1424 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1425                                       SIAtomicScope Scope,
1426                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1427                                       bool IsCrossAddrSpaceOrdering,
1428                                       Position Pos,
1429                                       AtomicOrdering Order) const {
1430   if (ST.isTgSplitEnabled()) {
1431     // In threadgroup split mode the waves of a work-group can be executing on
1432     // different CUs. Therefore need to wait for global or GDS memory operations
1433     // to complete to ensure they are visible to waves in the other CUs.
1434     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1435     // the same CU, so no need to wait for global memory as all waves in the
1436     // work-group access the same L1, nor wait for GDS as accesses are ordered
1437     // on a CU.
1438     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1439                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1440         (Scope == SIAtomicScope::WORKGROUP)) {
1441       // Same as GFX7 using agent scope.
1442       Scope = SIAtomicScope::AGENT;
1443     }
1444     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1445     // LDS memory operations.
1446     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1447   }
1448   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1449                                         IsCrossAddrSpaceOrdering, Pos, Order);
1450 }
1451 
1452 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1453                                          SIAtomicScope Scope,
1454                                          SIAtomicAddrSpace AddrSpace,
1455                                          Position Pos) const {
1456   if (!InsertCacheInv)
1457     return false;
1458 
1459   bool Changed = false;
1460 
1461   MachineBasicBlock &MBB = *MI->getParent();
1462   DebugLoc DL = MI->getDebugLoc();
1463 
1464   if (Pos == Position::AFTER)
1465     ++MI;
1466 
1467   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1468     switch (Scope) {
1469     case SIAtomicScope::SYSTEM:
1470       // Ensures that following loads will not see stale remote VMEM data or
1471       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1472       // CC will never be stale due to the local memory probes.
1473       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1474       // Inserting an "S_WAITCNT vmcnt(0)" after is not required because the
1475       // hardware does not reorder memory operations by the same wave with
1476       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1477       // remove any cache lines of earlier writes by the same wave and ensures
1478       // later reads by the same wave will refetch the cache lines.
1479       Changed = true;
1480       break;
1481     case SIAtomicScope::AGENT:
1482       // Same as GFX7.
1483       break;
1484     case SIAtomicScope::WORKGROUP:
1485       // In threadgroup split mode the waves of a work-group can be executing on
1486       // different CUs. Therefore need to invalidate the L1 which is per CU.
1487       // Otherwise in non-threadgroup split mode all waves of a work-group are
1488       // on the same CU, and so the L1 does not need to be invalidated.
1489       if (ST.isTgSplitEnabled()) {
1490         // Same as GFX7 using agent scope.
1491         Scope = SIAtomicScope::AGENT;
1492       }
1493       break;
1494     case SIAtomicScope::WAVEFRONT:
1495     case SIAtomicScope::SINGLETHREAD:
1496       // Same as GFX7.
1497       break;
1498     default:
1499       llvm_unreachable("Unsupported synchronization scope");
1500     }
1501   }
1502 
1503   /// The scratch address space does not need the global memory cache
1504   /// to be flushed as all memory operations by the same thread are
1505   /// sequentially consistent, and no other thread can access scratch
1506   /// memory.
1507 
1508   /// Other address spaces do not have a cache.
1509 
1510   if (Pos == Position::AFTER)
1511     --MI;
1512 
1513   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1514 
1515   return Changed;
1516 }
1517 
1518 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1519                                          SIAtomicScope Scope,
1520                                          SIAtomicAddrSpace AddrSpace,
1521                                          bool IsCrossAddrSpaceOrdering,
1522                                          Position Pos) const {
1523   bool Changed = false;
1524 
1525   MachineBasicBlock &MBB = *MI->getParent();
1526   const DebugLoc &DL = MI->getDebugLoc();
1527 
1528   if (Pos == Position::AFTER)
1529     ++MI;
1530 
1531   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1532     switch (Scope) {
1533     case SIAtomicScope::SYSTEM:
1534       // Inserting an "S_WAITCNT vmcnt(0)" before is not required because the
1535       // hardware does not reorder memory operations by the same wave with
1536       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1537       // to initiate writeback of any dirty cache lines of earlier writes by the
1538       // same wave. An "S_WAITCNT vmcnt(0)" is needed after to ensure the
1539       // writeback has completed.
1540       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1541         // Set SC bits to indicate system scope.
1542         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1543       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1544       // vmcnt(0)" needed by the "BUFFER_WBL2".
1545       Changed = true;
1546       break;
1547     case SIAtomicScope::AGENT:
1548     case SIAtomicScope::WORKGROUP:
1549     case SIAtomicScope::WAVEFRONT:
1550     case SIAtomicScope::SINGLETHREAD:
1551       // Same as GFX7.
1552       break;
1553     default:
1554       llvm_unreachable("Unsupported synchronization scope");
1555     }
1556   }
1557 
1558   if (Pos == Position::AFTER)
1559     --MI;
1560 
1561   Changed |=
1562       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1563                                         IsCrossAddrSpaceOrdering, Pos);
1564 
1565   return Changed;
1566 }
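
// Illustrative sketch, assuming a system-scope release store on GFX90A: the
// emitted sequence is approximately
//   buffer_wbl2          ; initiate writeback of dirty L2 lines
//   s_waitcnt vmcnt(0)   ; from the GFX7 release path above
//   <atomic store>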
1567 
1568 bool SIGfx940CacheControl::enableLoadCacheBypass(
1569     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1570     SIAtomicAddrSpace AddrSpace) const {
1571   assert(MI->mayLoad() && !MI->mayStore());
1572   bool Changed = false;
1573 
1574   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1575     switch (Scope) {
1576     case SIAtomicScope::SYSTEM:
1577       // Set SC bits to indicate system scope.
1578       Changed |= enableSC0Bit(MI);
1579       Changed |= enableSC1Bit(MI);
1580       break;
1581     case SIAtomicScope::AGENT:
1582       // Set SC bits to indicate agent scope.
1583       Changed |= enableSC1Bit(MI);
1584       break;
1585     case SIAtomicScope::WORKGROUP:
1586       // In threadgroup split mode the waves of a work-group can be executing on
1587       // different CUs. Therefore need to bypass the L1 which is per CU.
1588       // Otherwise in non-threadgroup split mode all waves of a work-group are
1589       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1590       // bits to indicate work-group scope will do this automatically.
1591       Changed |= enableSC0Bit(MI);
1592       break;
1593     case SIAtomicScope::WAVEFRONT:
1594     case SIAtomicScope::SINGLETHREAD:
1595       // Leave SC bits unset to indicate wavefront scope.
1596       break;
1597     default:
1598       llvm_unreachable("Unsupported synchronization scope");
1599     }
1600   }
1601 
1602   /// The scratch address space does not need the global memory caches
1603   /// to be bypassed as all memory operations by the same thread are
1604   /// sequentially consistent, and no other thread can access scratch
1605   /// memory.
1606 
1607   /// Other address spaces do not have a cache.
1608 
1609   return Changed;
1610 }
1611 
1612 bool SIGfx940CacheControl::enableStoreCacheBypass(
1613     const MachineBasicBlock::iterator &MI,
1614     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1615   assert(!MI->mayLoad() && MI->mayStore());
1616   bool Changed = false;
1617 
1618   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1619     switch (Scope) {
1620     case SIAtomicScope::SYSTEM:
1621       // Set SC bits to indicate system scope.
1622       Changed |= enableSC0Bit(MI);
1623       Changed |= enableSC1Bit(MI);
1624       break;
1625     case SIAtomicScope::AGENT:
1626       // Set SC bits to indicate agent scope.
1627       Changed |= enableSC1Bit(MI);
1628       break;
1629     case SIAtomicScope::WORKGROUP:
1630       // Set SC bits to indicate workgroup scope.
1631       Changed |= enableSC0Bit(MI);
1632       break;
1633     case SIAtomicScope::WAVEFRONT:
1634     case SIAtomicScope::SINGLETHREAD:
1635       // Leave SC bits unset to indicate wavefront scope.
1636       break;
1637     default:
1638       llvm_unreachable("Unsupported synchronization scope");
1639     }
1640   }
1641 
1642   /// The scratch address space does not need the global memory caches
1643   /// to be bypassed as all memory operations by the same thread are
1644   /// sequentially consistent, and no other thread can access scratch
1645   /// memory.
1646 
1647   /// Other address spaces do not have a cache.
1648 
1649   return Changed;
1650 }
1651 
1652 bool SIGfx940CacheControl::enableRMWCacheBypass(
1653     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1654     SIAtomicAddrSpace AddrSpace) const {
1655   assert(MI->mayLoad() && MI->mayStore());
1656   bool Changed = false;
1657 
1658   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1659     switch (Scope) {
1660     case SIAtomicScope::SYSTEM:
1661       // Set SC1 bit to indicate system scope.
1662       Changed |= enableSC1Bit(MI);
1663       break;
1664     case SIAtomicScope::AGENT:
1665     case SIAtomicScope::WORKGROUP:
1666     case SIAtomicScope::WAVEFRONT:
1667     case SIAtomicScope::SINGLETHREAD:
1668       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1669       // to indicate system or agent scope. The SC0 bit is used to indicate if
1670       // they are return or no-return. Leave SC1 bit unset to indicate agent
1671       // scope.
1672       break;
1673     default:
1674       llvm_unreachable("Unsupported synchronization scope");
1675     }
1676   }
1677 
1678   return Changed;
1679 }
1680 
1681 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1682     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1683     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1684   // Only handle load and store, not atomic read-modify-write instructions. The
1685   // latter use glc to indicate if the atomic returns a result and so must not
1686   // be used for cache control.
1687   assert(MI->mayLoad() ^ MI->mayStore());
1688 
1689   // Only update load and store, not LLVM IR atomic read-modify-write
1690   // instructions. The latter are always marked as volatile, so they cannot be
1691   // handled sensibly here without pessimizing all atomics. They also do not support
1692   // the nontemporal attribute.
1693   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1694 
1695   bool Changed = false;
1696 
1697   if (IsVolatile) {
1698     // Set SC bits to indicate system scope.
1699     Changed |= enableSC0Bit(MI);
1700     Changed |= enableSC1Bit(MI);
1701 
1702     // Ensure operation has completed at system scope to cause all volatile
1703     // operations to be visible outside the program in a global order. Do not
1704     // request cross address space as only the global address space can be
1705     // observable outside the program, so no need to cause a waitcnt for LDS
1706     // address space operations.
1707     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1708                           Position::AFTER, AtomicOrdering::Unordered);
1709 
1710     return Changed;
1711   }
1712 
1713   if (IsNonTemporal) {
1714     Changed |= enableNTBit(MI);
1715     return Changed;
1716   }
1717 
1718   return Changed;
1719 }
1720 
1721 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1722                                          SIAtomicScope Scope,
1723                                          SIAtomicAddrSpace AddrSpace,
1724                                          Position Pos) const {
1725   if (!InsertCacheInv)
1726     return false;
1727 
1728   bool Changed = false;
1729 
1730   MachineBasicBlock &MBB = *MI->getParent();
1731   DebugLoc DL = MI->getDebugLoc();
1732 
1733   if (Pos == Position::AFTER)
1734     ++MI;
1735 
1736   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1737     switch (Scope) {
1738     case SIAtomicScope::SYSTEM:
1739       // Ensures that following loads will not see stale remote VMEM data or
1740       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1741       // CC will never be stale due to the local memory probes.
1742       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1743           // Set SC bits to indicate system scope.
1744           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1745       // Inserting an "S_WAITCNT vmcnt(0)" after is not required because the
1746       // hardware does not reorder memory operations by the same wave with
1747       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1748       // remove any cache lines of earlier writes by the same wave and ensures
1749       // later reads by the same wave will refetch the cache lines.
1750       Changed = true;
1751       break;
1752     case SIAtomicScope::AGENT:
1753       // Ensures that following loads will not see stale remote data or local
1754       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1755       // due to the memory probes.
1756       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1757           // Set SC bits to indicate agent scope.
1758           .addImm(AMDGPU::CPol::SC1);
1759       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1760       // does not reorder memory operations with respect to a preceding buffer
1761       // invalidate. The invalidate is guaranteed to remove any cache lines of
1762       // earlier writes and ensures later reads will refetch the cache lines.
1763       Changed = true;
1764       break;
1765     case SIAtomicScope::WORKGROUP:
1766       // In threadgroup split mode the waves of a work-group can be executing on
1767       // different CUs. Therefore need to invalidate the L1 which is per CU.
1768       // Otherwise in non-threadgroup split mode all waves of a work-group are
1769       // on the same CU, and so the L1 does not need to be invalidated.
1770       if (ST.isTgSplitEnabled()) {
1771         // Ensures L1 is invalidated if in threadgroup split mode. In
1772         // non-threadgroup split mode it is a NOP, but there is no point
1773         // generating it when we know we are not in that mode.
1774         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1775             // Set SC bits to indicate work-group scope.
1776             .addImm(AMDGPU::CPol::SC0);
1777         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1778         // does not reorder memory operations with respect to a preceding buffer
1779         // invalidate. The invalidate is guaranteed to remove any cache lines of
1780         // earlier writes and ensures later reads will refetch the cache lines.
1781         Changed = true;
1782       }
1783       break;
1784     case SIAtomicScope::WAVEFRONT:
1785     case SIAtomicScope::SINGLETHREAD:
1786       // Could generate "BUFFER_INV" but it would do nothing as there are no
1787       // caches to invalidate.
1788       break;
1789     default:
1790       llvm_unreachable("Unsupported synchronization scope");
1791     }
1792   }
1793 
1794   /// The scratch address space does not need the global memory cache
1795   /// to be flushed as all memory operations by the same thread are
1796   /// sequentially consistent, and no other thread can access scratch
1797   /// memory.
1798 
1799   /// Other address spaces do not have a cache.
1800 
1801   if (Pos == Position::AFTER)
1802     --MI;
1803 
1804   return Changed;
1805 }
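
// Illustrative sketch, assuming an agent-scope acquire load on GFX940:
// combined with the wait inserted by the caller, the sequence is approximately
//   <atomic load>
//   s_waitcnt vmcnt(0)
//   buffer_inv sc1       ; later loads refetch from L2/memory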
1806 
1807 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1808                                          SIAtomicScope Scope,
1809                                          SIAtomicAddrSpace AddrSpace,
1810                                          bool IsCrossAddrSpaceOrdering,
1811                                          Position Pos) const {
1812   bool Changed = false;
1813 
1814   MachineBasicBlock &MBB = *MI->getParent();
1815   DebugLoc DL = MI->getDebugLoc();
1816 
1817   if (Pos == Position::AFTER)
1818     ++MI;
1819 
1820   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1821     switch (Scope) {
1822     case SIAtomicScope::SYSTEM:
1823       // Inserting an "S_WAITCNT vmcnt(0)" before is not required because the
1824       // hardware does not reorder memory operations by the same wave with
1825       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1826       // to initiate writeback of any dirty cache lines of earlier writes by the
1827       // same wave. An "S_WAITCNT vmcnt(0)" is needed after to ensure the
1828       // writeback has completed.
1829       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1830           // Set SC bits to indicate system scope.
1831           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1832       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1833       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1834       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1835       Changed = true;
1836       break;
1837     case SIAtomicScope::AGENT:
1838       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1839           // Set SC bits to indicate agent scope.
1840           .addImm(AMDGPU::CPol::SC1);
1841 
1842       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1843       // SIAtomicScope::AGENT, the following insertWait will generate the
1844       // required "S_WAITCNT vmcnt(0)".
1845       Changed = true;
1846       break;
1847     case SIAtomicScope::WORKGROUP:
1848     case SIAtomicScope::WAVEFRONT:
1849     case SIAtomicScope::SINGLETHREAD:
1850       // Do not generate "BUFFER_WBL2" as there are no caches it would
1851       // writeback, and would require an otherwise unnecessary
1852       // "S_WAITCNT vmcnt(0)".
1853       break;
1854     default:
1855       llvm_unreachable("Unsupported synchronization scope");
1856     }
1857   }
1858 
1859   if (Pos == Position::AFTER)
1860     --MI;
1861 
1862   // Insert the S_WAITCNT needed by any "BUFFER_WBL2" emitted above, as well as
1863   // any other required S_WAITCNT.
1864   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1865                         IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1866 
1867   return Changed;
1868 }
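
// Illustrative sketch, assuming an agent-scope release store on GFX940:
// approximately
//   buffer_wbl2 sc1      ; initiate L2 writeback
//   s_waitcnt vmcnt(0)   ; from the insertWait call above
//   <atomic store>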
1869 
1870 bool SIGfx10CacheControl::enableLoadCacheBypass(
1871     const MachineBasicBlock::iterator &MI,
1872     SIAtomicScope Scope,
1873     SIAtomicAddrSpace AddrSpace) const {
1874   assert(MI->mayLoad() && !MI->mayStore());
1875   bool Changed = false;
1876 
1877   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1878     switch (Scope) {
1879     case SIAtomicScope::SYSTEM:
1880     case SIAtomicScope::AGENT:
1881       // Set the L0 and L1 cache policies to MISS_EVICT.
1882       // Note: there is no L2 cache coherent bypass control at the ISA level.
1883       Changed |= enableGLCBit(MI);
1884       Changed |= enableDLCBit(MI);
1885       break;
1886     case SIAtomicScope::WORKGROUP:
1887       // In WGP mode the waves of a work-group can be executing on either CU of
1888       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1889       // CU mode all waves of a work-group are on the same CU, and so the L0
1890       // does not need to be bypassed.
1891       if (!ST.isCuModeEnabled())
1892         Changed |= enableGLCBit(MI);
1893       break;
1894     case SIAtomicScope::WAVEFRONT:
1895     case SIAtomicScope::SINGLETHREAD:
1896       // No cache to bypass.
1897       break;
1898     default:
1899       llvm_unreachable("Unsupported synchronization scope");
1900     }
1901   }
1902 
1903   /// The scratch address space does not need the global memory caches
1904   /// to be bypassed as all memory operations by the same thread are
1905   /// sequentially consistent, and no other thread can access scratch
1906   /// memory.
1907 
1908   /// Other address spaces do not have a cache.
1909 
1910   return Changed;
1911 }
1912 
1913 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1914     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1915     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1916 
1917   // Only handle load and store, not atomic read-modify-write instructions. The
1918   // latter use glc to indicate if the atomic returns a result and so must not
1919   // be used for cache control.
1920   assert(MI->mayLoad() ^ MI->mayStore());
1921 
1922   // Only update load and store, not LLVM IR atomic read-modify-write
1923   // instructions. The latter are always marked as volatile, so they cannot be
1924   // handled sensibly here without pessimizing all atomics. They also do not support
1925   // the nontemporal attribute.
1926   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1927 
1928   bool Changed = false;
1929 
1930   if (IsVolatile) {
1931     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1932     // and MISS_LRU for store instructions.
1933     // Note: there is no L2 cache coherent bypass control at the ISA level.
1934     if (Op == SIMemOp::LOAD) {
1935       Changed |= enableGLCBit(MI);
1936       Changed |= enableDLCBit(MI);
1937     }
1938 
1939     // Ensure operation has completed at system scope to cause all volatile
1940     // operations to be visible outside the program in a global order. Do not
1941     // request cross address space as only the global address space can be
1942     // observable outside the program, so no need to cause a waitcnt for LDS
1943     // address space operations.
1944     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1945                           Position::AFTER, AtomicOrdering::Unordered);
1946     return Changed;
1947   }
1948 
1949   if (IsNonTemporal) {
1950     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1951     // and L2 cache policy to STREAM.
1952     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1953     // to MISS_EVICT and the L2 cache policy to STREAM.
1954     if (Op == SIMemOp::STORE)
1955       Changed |= enableGLCBit(MI);
1956     Changed |= enableSLCBit(MI);
1957 
1958     return Changed;
1959   }
1960 
1961   return Changed;
1962 }
1963 
1964 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1965                                      SIAtomicScope Scope,
1966                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1967                                      bool IsCrossAddrSpaceOrdering,
1968                                      Position Pos, AtomicOrdering Order) const {
1969   bool Changed = false;
1970 
1971   MachineBasicBlock &MBB = *MI->getParent();
1972   DebugLoc DL = MI->getDebugLoc();
1973 
1974   if (Pos == Position::AFTER)
1975     ++MI;
1976 
1977   bool VMCnt = false;
1978   bool VSCnt = false;
1979   bool LGKMCnt = false;
1980 
1981   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1982       SIAtomicAddrSpace::NONE) {
1983     switch (Scope) {
1984     case SIAtomicScope::SYSTEM:
1985     case SIAtomicScope::AGENT:
1986       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1987         VMCnt |= true;
1988       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1989         VSCnt |= true;
1990       break;
1991     case SIAtomicScope::WORKGROUP:
1992       // In WGP mode the waves of a work-group can be executing on either CU of
1993       // the WGP. Therefore need to wait for operations to complete to ensure
1994       // they are visible to waves in the other CU as the L0 is per CU.
1995       // Otherwise in CU mode all waves of a work-group are on the same CU
1996       // which shares the same L0.
1997       if (!ST.isCuModeEnabled()) {
1998         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1999           VMCnt |= true;
2000         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2001           VSCnt |= true;
2002       }
2003       break;
2004     case SIAtomicScope::WAVEFRONT:
2005     case SIAtomicScope::SINGLETHREAD:
2006       // The L0 cache keeps all memory operations in order for
2007       // work-items in the same wavefront.
2008       break;
2009     default:
2010       llvm_unreachable("Unsupported synchronization scope");
2011     }
2012   }
2013 
2014   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2015     switch (Scope) {
2016     case SIAtomicScope::SYSTEM:
2017     case SIAtomicScope::AGENT:
2018     case SIAtomicScope::WORKGROUP:
2019       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2020       // not needed as LDS operations for all waves are executed in a total
2021       // global ordering as observed by all waves. Required if also
2022       // synchronizing with global/GDS memory as LDS operations could be
2023       // reordered with respect to later global/GDS memory operations of the
2024       // same wave.
2025       LGKMCnt |= IsCrossAddrSpaceOrdering;
2026       break;
2027     case SIAtomicScope::WAVEFRONT:
2028     case SIAtomicScope::SINGLETHREAD:
2029       // The LDS keeps all memory operations in order for
2030       // the same wavefront.
2031       break;
2032     default:
2033       llvm_unreachable("Unsupported synchronization scope");
2034     }
2035   }
2036 
2037   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2038     switch (Scope) {
2039     case SIAtomicScope::SYSTEM:
2040     case SIAtomicScope::AGENT:
2041       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
2042       // is not needed as GDS operations for all waves are executed in a total
2043       // global ordering as observed by all waves. Required if also
2044       // synchronizing with global/LDS memory as GDS operations could be
2045       // reordered with respect to later global/LDS memory operations of the
2046       // same wave.
2047       LGKMCnt |= IsCrossAddrSpaceOrdering;
2048       break;
2049     case SIAtomicScope::WORKGROUP:
2050     case SIAtomicScope::WAVEFRONT:
2051     case SIAtomicScope::SINGLETHREAD:
2052       // The GDS keeps all memory operations in order for
2053       // the same work-group.
2054       break;
2055     default:
2056       llvm_unreachable("Unsupported synchronization scope");
2057     }
2058   }
2059 
2060   if (VMCnt || LGKMCnt) {
2061     unsigned WaitCntImmediate =
2062       AMDGPU::encodeWaitcnt(IV,
2063                             VMCnt ? 0 : getVmcntBitMask(IV),
2064                             getExpcntBitMask(IV),
2065                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2066     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2067         .addImm(WaitCntImmediate);
2068     Changed = true;
2069   }
2070 
2071   if (VSCnt) {
2072     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2073         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2074         .addImm(0);
2075     Changed = true;
2076   }
2077 
2078   if (Pos == Position::AFTER)
2079     --MI;
2080 
2081   return Changed;
2082 }
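
// Illustrative sketch, assuming agent scope, Op = LOAD | STORE, and LDS with
// cross address space ordering: the waits emitted above are approximately
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0x0
// The _soft opcodes allow the later waitcnt insertion pass to still adjust or
// fold these waits.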
2083 
2084 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2085                                         SIAtomicScope Scope,
2086                                         SIAtomicAddrSpace AddrSpace,
2087                                         Position Pos) const {
2088   if (!InsertCacheInv)
2089     return false;
2090 
2091   bool Changed = false;
2092 
2093   MachineBasicBlock &MBB = *MI->getParent();
2094   DebugLoc DL = MI->getDebugLoc();
2095 
2096   if (Pos == Position::AFTER)
2097     ++MI;
2098 
2099   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2100     switch (Scope) {
2101     case SIAtomicScope::SYSTEM:
2102     case SIAtomicScope::AGENT:
2103       // The order of invalidates matters here. We must invalidate "outer in"
2104       // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2105       // invalidated.
2106       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2107       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2108       Changed = true;
2109       break;
2110     case SIAtomicScope::WORKGROUP:
2111       // In WGP mode the waves of a work-group can be executing on either CU of
2112       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2113       // in CU mode all waves of a work-group are on the same CU, and so the
2114       // L0 does not need to be invalidated.
2115       if (!ST.isCuModeEnabled()) {
2116         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2117         Changed = true;
2118       }
2119       break;
2120     case SIAtomicScope::WAVEFRONT:
2121     case SIAtomicScope::SINGLETHREAD:
2122       // No cache to invalidate.
2123       break;
2124     default:
2125       llvm_unreachable("Unsupported synchronization scope");
2126     }
2127   }
2128 
2129   /// The scratch address space does not need the global memory cache
2130   /// to be flushed as all memory operations by the same thread are
2131   /// sequentially consistent, and no other thread can access scratch
2132   /// memory.
2133 
2134   /// Other address spaces do not have a cache.
2135 
2136   if (Pos == Position::AFTER)
2137     --MI;
2138 
2139   return Changed;
2140 }
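
// Illustrative sketch, assuming an agent-scope acquire: the invalidates are
// emitted outer-in, approximately
//   buffer_gl1_inv
//   buffer_gl0_inv
// so the L0 cannot refill from stale L1 lines while being invalidated.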
2141 
2142 bool SIGfx11CacheControl::enableLoadCacheBypass(
2143     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2144     SIAtomicAddrSpace AddrSpace) const {
2145   assert(MI->mayLoad() && !MI->mayStore());
2146   bool Changed = false;
2147 
2148   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2149     switch (Scope) {
2150     case SIAtomicScope::SYSTEM:
2151     case SIAtomicScope::AGENT:
2152       // Set the L0 and L1 cache policies to MISS_EVICT.
2153       // Note: there is no L2 cache coherent bypass control at the ISA level.
2154       Changed |= enableGLCBit(MI);
2155       break;
2156     case SIAtomicScope::WORKGROUP:
2157       // In WGP mode the waves of a work-group can be executing on either CU of
2158       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2159       // CU mode all waves of a work-group are on the same CU, and so the L0
2160       // does not need to be bypassed.
2161       if (!ST.isCuModeEnabled())
2162         Changed |= enableGLCBit(MI);
2163       break;
2164     case SIAtomicScope::WAVEFRONT:
2165     case SIAtomicScope::SINGLETHREAD:
2166       // No cache to bypass.
2167       break;
2168     default:
2169       llvm_unreachable("Unsupported synchronization scope");
2170     }
2171   }
2172 
2173   /// The scratch address space does not need the global memory caches
2174   /// to be bypassed as all memory operations by the same thread are
2175   /// sequentially consistent, and no other thread can access scratch
2176   /// memory.
2177 
2178   /// Other address spaces do not have a cache.
2179 
2180   return Changed;
2181 }
2182 
2183 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2184     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2185     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2186 
2187   // Only handle load and store, not atomic read-modify-write instructions. The
2188   // latter use glc to indicate if the atomic returns a result and so must not
2189   // be used for cache control.
2190   assert(MI->mayLoad() ^ MI->mayStore());
2191 
2192   // Only update load and store, not LLVM IR atomic read-modify-write
2193   // instructions. The latter are always marked as volatile, so they cannot be
2194   // handled sensibly here without pessimizing all atomics. They also do not support
2195   // the nontemporal attribute.
2196   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2197 
2198   bool Changed = false;
2199 
2200   if (IsVolatile) {
2201     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2202     // and MISS_LRU for store instructions.
2203     // Note: there is no L2 cache coherent bypass control at the ISA level.
2204     if (Op == SIMemOp::LOAD)
2205       Changed |= enableGLCBit(MI);
2206 
2207     // Set MALL NOALLOC for load and store instructions.
2208     Changed |= enableDLCBit(MI);
2209 
2210     // Ensure operation has completed at system scope to cause all volatile
2211     // operations to be visible outside the program in a global order. Do not
2212     // request cross address space as only the global address space can be
2213     // observable outside the program, so no need to cause a waitcnt for LDS
2214     // address space operations.
2215     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2216                           Position::AFTER, AtomicOrdering::Unordered);
2217     return Changed;
2218   }
2219 
2220   if (IsNonTemporal) {
2221     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2222     // and L2 cache policy to STREAM.
2223     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2224     // to MISS_EVICT and the L2 cache policy to STREAM.
2225     if (Op == SIMemOp::STORE)
2226       Changed |= enableGLCBit(MI);
2227     Changed |= enableSLCBit(MI);
2228 
2229     // Set MALL NOALLOC for load and store instructions.
2230     Changed |= enableDLCBit(MI);
2231     return Changed;
2232   }
2233 
2234   return Changed;
2235 }
2236 
2237 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2238                                 AMDGPU::CPol::CPol Value) const {
2239   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2240   if (!CPol)
2241     return false;
2242 
2243   uint64_t NewTH = Value & AMDGPU::CPol::TH;
2244   if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2245     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2246     return true;
2247   }
2248 
2249   return false;
2250 }
2251 
2252 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2253                                    AMDGPU::CPol::CPol Value) const {
2254   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2255   if (!CPol)
2256     return false;
2257 
2258   uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2259   if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2260     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2261     return true;
2262   }
2263 
2264   return false;
2265 }
2266 
2267 bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2268     const MachineBasicBlock::iterator MI) const {
2269   // TODO: implement flag for frontend to give us a hint not to insert waits.
2270 
2271   MachineBasicBlock &MBB = *MI->getParent();
2272   const DebugLoc &DL = MI->getDebugLoc();
2273 
2274   BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2275   if (ST.hasImageInsts()) {
2276     BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2277     BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2278   }
2279   BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2280   BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2281 
2282   return true;
2283 }
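
// Illustrative sketch, assuming a target with image instructions: the sequence
// built above is approximately
//   s_wait_loadcnt 0x0
//   s_wait_samplecnt 0x0
//   s_wait_bvhcnt 0x0
//   s_wait_kmcnt 0x0
//   s_wait_storecnt 0x0
// i.e. a full drain of outstanding memory operations before the store.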
2284 
2285 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2286                                      SIAtomicScope Scope,
2287                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2288                                      bool IsCrossAddrSpaceOrdering,
2289                                      Position Pos, AtomicOrdering Order) const {
2290   bool Changed = false;
2291 
2292   MachineBasicBlock &MBB = *MI->getParent();
2293   DebugLoc DL = MI->getDebugLoc();
2294 
2295   bool LOADCnt = false;
2296   bool DSCnt = false;
2297   bool STORECnt = false;
2298 
2299   if (Pos == Position::AFTER)
2300     ++MI;
2301 
2302   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2303       SIAtomicAddrSpace::NONE) {
2304     switch (Scope) {
2305     case SIAtomicScope::SYSTEM:
2306     case SIAtomicScope::AGENT:
2307       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2308         LOADCnt |= true;
2309       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2310         STORECnt |= true;
2311       break;
2312     case SIAtomicScope::WORKGROUP:
2313       // In WGP mode the waves of a work-group can be executing on either CU of
2314       // the WGP. Therefore need to wait for operations to complete to ensure
2315       // they are visible to waves in the other CU as the L0 is per CU.
2316       // Otherwise in CU mode all waves of a work-group are on the same CU
2317       // which shares the same L0.
2318       if (!ST.isCuModeEnabled()) {
2319         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2320           LOADCnt |= true;
2321         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2322           STORECnt |= true;
2323       }
2324       break;
2325     case SIAtomicScope::WAVEFRONT:
2326     case SIAtomicScope::SINGLETHREAD:
2327       // The L0 cache keeps all memory operations in order for
2328       // work-items in the same wavefront.
2329       break;
2330     default:
2331       llvm_unreachable("Unsupported synchronization scope");
2332     }
2333   }
2334 
2335   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2336     switch (Scope) {
2337     case SIAtomicScope::SYSTEM:
2338     case SIAtomicScope::AGENT:
2339     case SIAtomicScope::WORKGROUP:
2340       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2341       // not needed as LDS operations for all waves are executed in a total
2342       // global ordering as observed by all waves. Required if also
2343       // synchronizing with global/GDS memory as LDS operations could be
2344       // reordered with respect to later global/GDS memory operations of the
2345       // same wave.
2346       DSCnt |= IsCrossAddrSpaceOrdering;
2347       break;
2348     case SIAtomicScope::WAVEFRONT:
2349     case SIAtomicScope::SINGLETHREAD:
2350       // The LDS keeps all memory operations in order for
2351       // the same wavefront.
2352       break;
2353     default:
2354       llvm_unreachable("Unsupported synchronization scope");
2355     }
2356   }
2357 
2358   if (LOADCnt) {
2359     // Acquire sequences only need to wait on the previous atomic operation.
2360     // e.g. a typical sequence looks like
2361     //    atomic load
2362     //    (wait)
2363     //    global_inv
2364     //
2365     // We do not have BVH or SAMPLE atomics, so the atomic load is always going
2366     // to be tracked using loadcnt.
2367     //
2368     // This also applies to fences. Fences cannot pair with an instruction
2369     // tracked with bvh/samplecnt as we don't have any atomics that do that.
2370     if (Order != AtomicOrdering::Acquire) {
2371       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2372       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2373     }
2374     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2375     Changed = true;
2376   }
2377 
2378   if (STORECnt) {
2379     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2380     Changed = true;
2381   }
2382 
2383   if (DSCnt) {
2384     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2385     Changed = true;
2386   }
2387 
2388   if (Pos == Position::AFTER)
2389     --MI;
2390 
2391   return Changed;
2392 }
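
// Illustrative sketch, assuming an agent-scope acquire load: approximately
//   <atomic load>
//   s_wait_loadcnt 0x0           ; bvhcnt/samplecnt waits skipped for acquire
//   global_inv scope:SCOPE_DEV   ; added separately by insertAcquire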
2393 
2394 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2395                                         SIAtomicScope Scope,
2396                                         SIAtomicAddrSpace AddrSpace,
2397                                         Position Pos) const {
2398   if (!InsertCacheInv)
2399     return false;
2400 
2401   MachineBasicBlock &MBB = *MI->getParent();
2402   DebugLoc DL = MI->getDebugLoc();
2403 
2404   /// The scratch address space does not need the global memory cache
2405   /// to be flushed as all memory operations by the same thread are
2406   /// sequentially consistent, and no other thread can access scratch
2407   /// memory.
2408 
2409   /// Other address spaces do not have a cache.
2410   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2411     return false;
2412 
2413   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2414   switch (Scope) {
2415   case SIAtomicScope::SYSTEM:
2416     ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2417     break;
2418   case SIAtomicScope::AGENT:
2419     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2420     break;
2421   case SIAtomicScope::WORKGROUP:
2422     // In WGP mode the waves of a work-group can be executing on either CU of
2423     // the WGP. Therefore we need to invalidate the L0 which is per CU.
2424     // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2425     // the L0 does not need to be invalidated.
2426     if (ST.isCuModeEnabled())
2427       return false;
2428 
2429     ScopeImm = AMDGPU::CPol::SCOPE_SE;
2430     break;
2431   case SIAtomicScope::WAVEFRONT:
2432   case SIAtomicScope::SINGLETHREAD:
2433     // No cache to invalidate.
2434     return false;
2435   default:
2436     llvm_unreachable("Unsupported synchronization scope");
2437   }
2438 
2439   if (Pos == Position::AFTER)
2440     ++MI;
2441 
2442   BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2443 
2444   if (Pos == Position::AFTER)
2445     --MI;
2446 
2447   return true;
2448 }
2449 
2450 bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2451                                         SIAtomicScope Scope,
2452                                         SIAtomicAddrSpace AddrSpace,
2453                                         bool IsCrossAddrSpaceOrdering,
2454                                         Position Pos) const {
2455   MachineBasicBlock &MBB = *MI->getParent();
2456   DebugLoc DL = MI->getDebugLoc();
2457 
2458   // The scratch address space does not need the global memory cache
2459   // writeback as all memory operations by the same thread are
2460   // sequentially consistent, and no other thread can access scratch
2461   // memory.
2462 
2463   // Other address spaces do not have a cache.
2464   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2465     return false;
2466 
2467   if (Pos == Position::AFTER)
2468     ++MI;
2469 
2470   // global_wb is only necessary at system scope for gfx120x targets.
2471   //
2472   // Emitting it for lower scopes is a slow no-op, so we omit it
2473   // for performance.
2474   switch (Scope) {
2475   case SIAtomicScope::SYSTEM:
2476     BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2477         .addImm(AMDGPU::CPol::SCOPE_SYS);
2478     break;
2479   case SIAtomicScope::AGENT:
2480   case SIAtomicScope::WORKGROUP:
2481     // No WB necessary, but we still have to wait.
2482     break;
2483   case SIAtomicScope::WAVEFRONT:
2484   case SIAtomicScope::SINGLETHREAD:
2485     // No WB or wait necessary here.
2486     return false;
2487   default:
2488     llvm_unreachable("Unsupported synchronization scope");
2489   }
2490 
2491   if (Pos == Position::AFTER)
2492     --MI;
2493 
2494   // We always have to wait for previous memory operations (load/store) to
2495   // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2496   // we of course need to wait for that as well.
2497   insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2498              IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
2499 
2500   return true;
2501 }
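
// Illustrative sketch, assuming a system-scope release store: approximately
//   global_wb scope:SCOPE_SYS
//   s_wait_storecnt 0x0          ; from the insertWait call above
//   <atomic store>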
2502 
2503 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2504     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2505     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2506 
2507   // Only handle load and store, not atomic read-modify-write instructions.
2508   assert(MI->mayLoad() ^ MI->mayStore());
2509 
2510   // Only update load and store, not LLVM IR atomic read-modify-write
2511   // instructions. The latter are always marked as volatile, so they cannot be
2512   // handled sensibly here without pessimizing all atomics. They also do not support
2513   // the nontemporal attribute.
2514   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2515 
2516   bool Changed = false;
2517 
2518   if (IsLastUse) {
2519     // Set last-use hint.
2520     Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2521   } else if (IsNonTemporal) {
2522     // Set non-temporal hint for all cache levels.
2523     Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2524   }
2525 
2526   if (IsVolatile) {
2527     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2528 
2529     if (Op == SIMemOp::STORE)
2530       Changed |= insertWaitsBeforeSystemScopeStore(MI);
2531 
2532     // Ensure operation has completed at system scope to cause all volatile
2533     // operations to be visible outside the program in a global order. Do not
2534     // request cross address space as only the global address space can be
2535     // observable outside the program, so no need to cause a waitcnt for LDS
2536     // address space operations.
2537     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2538                           Position::AFTER, AtomicOrdering::Unordered);
2539   }
2540 
2541   return Changed;
2542 }
2543 
2544 bool SIGfx12CacheControl::expandSystemScopeStore(
2545     MachineBasicBlock::iterator &MI) const {
2546   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2547   if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2548     return insertWaitsBeforeSystemScopeStore(MI);
2549 
2550   return false;
2551 }
2552 
2553 bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2554                                          SIAtomicScope Scope,
2555                                          SIAtomicAddrSpace AddrSpace) const {
2556   bool Changed = false;
2557 
2558   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2559     switch (Scope) {
2560     case SIAtomicScope::SYSTEM:
2561       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2562       break;
2563     case SIAtomicScope::AGENT:
2564       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2565       break;
2566     case SIAtomicScope::WORKGROUP:
2567       // In WGP mode, SCOPE_SE is needed as waves can execute on
2568       // different CUs that access different L0s.
2569       if (!ST.isCuModeEnabled())
2570         Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2571       break;
2572     case SIAtomicScope::WAVEFRONT:
2573     case SIAtomicScope::SINGLETHREAD:
2574       // No cache to bypass.
2575       break;
2576     default:
2577       llvm_unreachable("Unsupported synchronization scope");
2578     }
2579   }
2580 
2581   // The scratch address space does not need the global memory caches
2582   // to be bypassed as all memory operations by the same thread are
2583   // sequentially consistent, and no other thread can access scratch
2584   // memory.
2585 
2586   // Other address spaces do not have a cache.
2587 
2588   return Changed;
2589 }
2590 
2591 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2592   if (AtomicPseudoMIs.empty())
2593     return false;
2594 
2595   for (auto &MI : AtomicPseudoMIs)
2596     MI->eraseFromParent();
2597 
2598   AtomicPseudoMIs.clear();
2599   return true;
2600 }
2601 
2602 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2603                                    MachineBasicBlock::iterator &MI) {
2604   assert(MI->mayLoad() && !MI->mayStore());
2605 
2606   bool Changed = false;
2607 
2608   if (MOI.isAtomic()) {
2609     const AtomicOrdering Order = MOI.getOrdering();
2610     if (Order == AtomicOrdering::Monotonic ||
2611         Order == AtomicOrdering::Acquire ||
2612         Order == AtomicOrdering::SequentiallyConsistent) {
2613       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2614                                            MOI.getOrderingAddrSpace());
2615     }
2616 
2617     if (Order == AtomicOrdering::SequentiallyConsistent)
2618       Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2619                                 SIMemOp::LOAD | SIMemOp::STORE,
2620                                 MOI.getIsCrossAddressSpaceOrdering(),
2621                                 Position::BEFORE, Order);
2622 
2623     if (Order == AtomicOrdering::Acquire ||
2624         Order == AtomicOrdering::SequentiallyConsistent) {
2625       Changed |= CC->insertWait(
2626           MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD,
2627           MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2628       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2629                                    MOI.getOrderingAddrSpace(),
2630                                    Position::AFTER);
2631     }
2632 
2633     return Changed;
2634   }
2635 
2636   // Atomic instructions already bypass caches to the scope specified by the
2637   // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2638   // instructions need additional treatment.
2639   Changed |= CC->enableVolatileAndOrNonTemporal(
2640       MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2641       MOI.isNonTemporal(), MOI.isLastUse());
2642 
2643   return Changed;
2644 }
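
// Illustrative sketch, assuming a GFX10-style target: for
//   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire, align 4
// the expansion above yields approximately
//   <load with glc and dlc set>   ; enableLoadCacheBypass
//   s_waitcnt vmcnt(0)            ; insertWait, Position::AFTER
//   buffer_gl1_inv
//   buffer_gl0_inv                ; insertAcquire, Position::AFTER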
2645 
2646 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2647                                     MachineBasicBlock::iterator &MI) {
2648   assert(!MI->mayLoad() && MI->mayStore());
2649 
2650   bool Changed = false;
2651 
2652   if (MOI.isAtomic()) {
2653     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2654         MOI.getOrdering() == AtomicOrdering::Release ||
2655         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2656       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2657                                             MOI.getOrderingAddrSpace());
2658     }
2659 
2660     if (MOI.getOrdering() == AtomicOrdering::Release ||
2661         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2662       Changed |= CC->insertRelease(MI, MOI.getScope(),
2663                                    MOI.getOrderingAddrSpace(),
2664                                    MOI.getIsCrossAddressSpaceOrdering(),
2665                                    Position::BEFORE);
2666 
2667     return Changed;
2668   }
2669 
2670   // Atomic instructions already bypass caches to the scope specified by the
2671   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2672   // need additional treatment.
2673   Changed |= CC->enableVolatileAndOrNonTemporal(
2674       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2675       MOI.isNonTemporal());
2676 
2677   // GFX12 specific: scope (the desired coherence domain in the cache
2678   // hierarchy) is an instruction field; do not confuse it with the atomic scope.
2679   Changed |= CC->expandSystemScopeStore(MI);
2680   return Changed;
2681 }
2682 
2683 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2684                                           MachineBasicBlock::iterator &MI) {
2685   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2686 
2687   AtomicPseudoMIs.push_back(MI);
2688   bool Changed = false;
2689 
2690   // Refine fenced address space based on MMRAs.
2691   //
2692   // TODO: Should we support this MMRA on other atomic operations?
2693   auto OrderingAddrSpace =
2694       getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
2695 
2696   if (MOI.isAtomic()) {
2697     const AtomicOrdering Order = MOI.getOrdering();
2698     if (Order == AtomicOrdering::Acquire) {
2699       Changed |= CC->insertWait(
2700           MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2701           MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order);
2702     }
2703 
2704     if (Order == AtomicOrdering::Release ||
2705         Order == AtomicOrdering::AcquireRelease ||
2706         Order == AtomicOrdering::SequentiallyConsistent)
2707       /// TODO: This relies on a barrier always generating a waitcnt
2708       /// for LDS to ensure it is not reordered with the completion of
2709       /// the preceding LDS operations. If the barrier had a memory
2710       /// ordering and memory scope, then the library would not need to
2711       /// generate a fence. Could add support in this file for
2712       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2713       /// adding S_WAITCNT before a S_BARRIER.
2714       Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2715                                    MOI.getIsCrossAddressSpaceOrdering(),
2716                                    Position::BEFORE);
2717 
2718     // TODO: If both a release and an invalidate are happening, they could be
2719     // combined into a single "BUFFER_WBINV*" instruction. This could be done
2720     // by reorganizing this code, or as part of optimizing the SIInsertWaitcnt
2721     // pass to track cache invalidate and write-back instructions.
2722 
2723     if (Order == AtomicOrdering::Acquire ||
2724         Order == AtomicOrdering::AcquireRelease ||
2725         Order == AtomicOrdering::SequentiallyConsistent)
2726       Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2727                                    Position::BEFORE);
2728 
2729     return Changed;
2730   }
2731 
2732   return Changed;
2733 }
2734 
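     /// Expands an atomic cmpxchg or read-modify-write so that it obeys the
     /// memory model. Illustrative example (assumed IR): an operation such as
     ///   %old = atomicrmw add ptr addrspace(1) %p, i32 1 syncscope("agent") acq_rel
     /// gets a release inserted before it and a wait plus cache invalidate
     /// inserted after it; for cmpxchg the failure ordering is honored as well.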
2735 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2736                                                  MachineBasicBlock::iterator &MI) {
2737   assert(MI->mayLoad() && MI->mayStore());
2738 
2739   bool Changed = false;
2740 
2741   if (MOI.isAtomic()) {
2742     const AtomicOrdering Order = MOI.getOrdering();
2743     if (Order == AtomicOrdering::Monotonic ||
2744         Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2745         Order == AtomicOrdering::AcquireRelease ||
2746         Order == AtomicOrdering::SequentiallyConsistent) {
2747       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2748                                           MOI.getInstrAddrSpace());
2749     }
2750 
2751     if (Order == AtomicOrdering::Release ||
2752         Order == AtomicOrdering::AcquireRelease ||
2753         Order == AtomicOrdering::SequentiallyConsistent ||
2754         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2755       Changed |= CC->insertRelease(MI, MOI.getScope(),
2756                                    MOI.getOrderingAddrSpace(),
2757                                    MOI.getIsCrossAddressSpaceOrdering(),
2758                                    Position::BEFORE);
2759 
2760     if (Order == AtomicOrdering::Acquire ||
2761         Order == AtomicOrdering::AcquireRelease ||
2762         Order == AtomicOrdering::SequentiallyConsistent ||
2763         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2764         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2765       Changed |= CC->insertWait(
2766           MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2767           isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2768           MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2769       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2770                                    MOI.getOrderingAddrSpace(),
2771                                    Position::AFTER);
2772     }
2773 
2774     return Changed;
2775   }
2776 
2777   return Changed;
2778 }
2779 
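     /// Legacy pass-manager entry point: fetches MachineModuleInfo and
     /// delegates to the shared SIMemoryLegalizer implementation.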
2780 bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2781   const MachineModuleInfo &MMI =
2782       getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2783   return SIMemoryLegalizer(MMI).run(MF);
2784 }
2785 
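     /// New pass-manager entry point. If nothing changed, all analyses are
     /// preserved; otherwise the standard machine-function set plus CFG
     /// analyses are, since the pass inserts and removes instructions but
     /// never alters control flow.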
2786 PreservedAnalyses
2787 SIMemoryLegalizerPass::run(MachineFunction &MF,
2788                            MachineFunctionAnalysisManager &MFAM) {
2789   auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
2790                   .getCachedResult<MachineModuleAnalysis>(
2791                       *MF.getFunction().getParent());
2792   assert(MMI && "MachineModuleAnalysis must be available");
2793   if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2794     return PreservedAnalyses::all();
2795   return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
2796 }
2797 
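     /// Walks every instruction in the function, expanding each one that may
     /// carry memory-model obligations (SIInstrFlags::maybeAtomic) using the
     /// SICacheControl created for the current subtarget, then deletes any
     /// pseudo instructions collected along the way.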
2798 bool SIMemoryLegalizer::run(MachineFunction &MF) {
2799   bool Changed = false;
2800 
2801   SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
2802   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2803 
2804   for (auto &MBB : MF) {
2805     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2806 
2807       // Unbundle instructions after the post-RA scheduler so each memory access can be legalized on its own.
2808       if (MI->isBundle() && MI->mayLoadOrStore()) {
2809         MachineBasicBlock::instr_iterator II(MI->getIterator());
2810         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2811              I != E && I->isBundledWithPred(); ++I) {
2812           I->unbundleFromPred();
2813           for (MachineOperand &MO : I->operands())
2814             if (MO.isReg())
2815               MO.setIsInternalRead(false);
2816         }
2817 
2818         MI->eraseFromParent();
2819         MI = II->getIterator();
2820       }
2821 
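           // Only instructions marked maybeAtomic can carry memory-model
           // obligations; everything else is skipped.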
2822       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2823         continue;
2824 
2825       if (const auto &MOI = MOA.getLoadInfo(MI))
2826         Changed |= expandLoad(*MOI, MI);
2827       else if (const auto &MOI = MOA.getStoreInfo(MI))
2828         Changed |= expandStore(*MOI, MI);
2829       else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2830         Changed |= expandAtomicFence(*MOI, MI);
2831       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2832         Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2833     }
2834   }
2835 
2836   Changed |= removeAtomicPseudoMIs();
2837   return Changed;
2838 }
2839 
2840 INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
2841 
2842 char SIMemoryLegalizerLegacy::ID = 0;
2843 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
2844 
2845 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2846   return new SIMemoryLegalizerLegacy();
2847 }
2848