xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (revision 3ceba58a7509418b47b8fca2d2b6bbf088714e26)
1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/ADT/StringExtras.h"
22 #include "llvm/CodeGen/MachineBasicBlock.h"
23 #include "llvm/CodeGen/MachineFunctionPass.h"
24 #include "llvm/IR/DiagnosticInfo.h"
25 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
26 #include "llvm/Support/AtomicOrdering.h"
27 #include "llvm/TargetParser/TargetParser.h"
28 
29 using namespace llvm;
30 using namespace llvm::AMDGPU;
31 
32 #define DEBUG_TYPE "si-memory-legalizer"
33 #define PASS_NAME "SI Memory Legalizer"
34 
35 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
36     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
37     cl::desc("Use this to skip inserting cache invalidating instructions."));
38 
39 namespace {
40 
41 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
42 
43 /// Memory operation flags. Can be ORed together.
44 enum class SIMemOp {
45   NONE = 0u,
46   LOAD = 1u << 0,
47   STORE = 1u << 1,
48   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
49 };
50 
/// Where a newly created instruction is placed relative to an already
/// existing instruction.
enum class Position {
  BEFORE = 0, ///< Insert ahead of the existing instruction.
  AFTER = 1   ///< Insert behind the existing instruction.
};
57 
/// Atomic synchronization scopes supported by the AMDGPU target.
///
/// The enumerators are declared from narrowest to widest and their relative
/// order is significant: scopes are compared with each other (for example via
/// std::min when a scope is limited), so do not reorder them.
enum class SIAtomicScope {
  NONE,         ///< No scope.
  SINGLETHREAD, ///< Single thread.
  WAVEFRONT,    ///< Wavefront.
  WORKGROUP,    ///< Workgroup.
  AGENT,        ///< Agent.
  SYSTEM        ///< System.
};
67 
68 /// The distinct address spaces supported by the AMDGPU target for
69 /// atomic memory operation. Can be ORed together.
70 enum class SIAtomicAddrSpace {
71   NONE = 0u,
72   GLOBAL = 1u << 0,
73   LDS = 1u << 1,
74   SCRATCH = 1u << 2,
75   GDS = 1u << 3,
76   OTHER = 1u << 4,
77 
78   /// The address spaces that can be accessed by a FLAT instruction.
79   FLAT = GLOBAL | LDS | SCRATCH,
80 
81   /// The address spaces that support atomic instructions.
82   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
83 
84   /// All address spaces.
85   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
86 
87   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
88 };
89 
90 class SIMemOpInfo final {
91 private:
92 
93   friend class SIMemOpAccess;
94 
95   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
96   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
97   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
98   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
99   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
100   bool IsCrossAddressSpaceOrdering = false;
101   bool IsVolatile = false;
102   bool IsNonTemporal = false;
103   bool IsLastUse = false;
104 
105   SIMemOpInfo(
106       AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
107       SIAtomicScope Scope = SIAtomicScope::SYSTEM,
108       SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
109       SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
110       bool IsCrossAddressSpaceOrdering = true,
111       AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
112       bool IsVolatile = false, bool IsNonTemporal = false,
113       bool IsLastUse = false)
114       : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
115         OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
116         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
117         IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
118         IsLastUse(IsLastUse) {
119 
120     if (Ordering == AtomicOrdering::NotAtomic) {
121       assert(Scope == SIAtomicScope::NONE &&
122              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
123              !IsCrossAddressSpaceOrdering &&
124              FailureOrdering == AtomicOrdering::NotAtomic);
125       return;
126     }
127 
128     assert(Scope != SIAtomicScope::NONE &&
129            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130                SIAtomicAddrSpace::NONE &&
131            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
132                SIAtomicAddrSpace::NONE);
133 
134     // There is also no cross address space ordering if the ordering
135     // address space is the same as the instruction address space and
136     // only contains a single address space.
137     if ((OrderingAddrSpace == InstrAddrSpace) &&
138         isPowerOf2_32(uint32_t(InstrAddrSpace)))
139       this->IsCrossAddressSpaceOrdering = false;
140 
141     // Limit the scope to the maximum supported by the instruction's address
142     // spaces.
143     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
144         SIAtomicAddrSpace::NONE) {
145       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
146     } else if ((InstrAddrSpace &
147                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
148                SIAtomicAddrSpace::NONE) {
149       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
150     } else if ((InstrAddrSpace &
151                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
152                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
153       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
154     }
155   }
156 
157 public:
158   /// \returns Atomic synchronization scope of the machine instruction used to
159   /// create this SIMemOpInfo.
160   SIAtomicScope getScope() const {
161     return Scope;
162   }
163 
164   /// \returns Ordering constraint of the machine instruction used to
165   /// create this SIMemOpInfo.
166   AtomicOrdering getOrdering() const {
167     return Ordering;
168   }
169 
170   /// \returns Failure ordering constraint of the machine instruction used to
171   /// create this SIMemOpInfo.
172   AtomicOrdering getFailureOrdering() const {
173     return FailureOrdering;
174   }
175 
176   /// \returns The address spaces be accessed by the machine
177   /// instruction used to create this SIMemOpInfo.
178   SIAtomicAddrSpace getInstrAddrSpace() const {
179     return InstrAddrSpace;
180   }
181 
182   /// \returns The address spaces that must be ordered by the machine
183   /// instruction used to create this SIMemOpInfo.
184   SIAtomicAddrSpace getOrderingAddrSpace() const {
185     return OrderingAddrSpace;
186   }
187 
188   /// \returns Return true iff memory ordering of operations on
189   /// different address spaces is required.
190   bool getIsCrossAddressSpaceOrdering() const {
191     return IsCrossAddressSpaceOrdering;
192   }
193 
194   /// \returns True if memory access of the machine instruction used to
195   /// create this SIMemOpInfo is volatile, false otherwise.
196   bool isVolatile() const {
197     return IsVolatile;
198   }
199 
200   /// \returns True if memory access of the machine instruction used to
201   /// create this SIMemOpInfo is nontemporal, false otherwise.
202   bool isNonTemporal() const {
203     return IsNonTemporal;
204   }
205 
206   /// \returns True if memory access of the machine instruction used to
207   /// create this SIMemOpInfo is last use, false otherwise.
208   bool isLastUse() const { return IsLastUse; }
209 
210   /// \returns True if ordering constraint of the machine instruction used to
211   /// create this SIMemOpInfo is unordered or higher, false otherwise.
212   bool isAtomic() const {
213     return Ordering != AtomicOrdering::NotAtomic;
214   }
215 
216 };
217 
218 class SIMemOpAccess final {
219 private:
220   const AMDGPUMachineModuleInfo *MMI = nullptr;
221 
222   /// Reports unsupported message \p Msg for \p MI to LLVM context.
223   void reportUnsupported(const MachineBasicBlock::iterator &MI,
224                          const char *Msg) const;
225 
226   /// Inspects the target synchronization scope \p SSID and determines
227   /// the SI atomic scope it corresponds to, the address spaces it
228   /// covers, and whether the memory ordering applies between address
229   /// spaces.
230   std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
231   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
232 
233   /// \return Return a bit set of the address spaces accessed by \p AS.
234   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
235 
236   /// \returns Info constructed from \p MI, which has at least machine memory
237   /// operand.
238   std::optional<SIMemOpInfo>
239   constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
240 
241 public:
242   /// Construct class to support accessing the machine memory operands
243   /// of instructions in the machine function \p MF.
244   SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);
245 
246   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
247   std::optional<SIMemOpInfo>
248   getLoadInfo(const MachineBasicBlock::iterator &MI) const;
249 
250   /// \returns Store info if \p MI is a store operation, "std::nullopt"
251   /// otherwise.
252   std::optional<SIMemOpInfo>
253   getStoreInfo(const MachineBasicBlock::iterator &MI) const;
254 
255   /// \returns Atomic fence info if \p MI is an atomic fence operation,
256   /// "std::nullopt" otherwise.
257   std::optional<SIMemOpInfo>
258   getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
259 
260   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
261   /// rmw operation, "std::nullopt" otherwise.
262   std::optional<SIMemOpInfo>
263   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
264 };
265 
266 class SICacheControl {
267 protected:
268 
269   /// AMDGPU subtarget info.
270   const GCNSubtarget &ST;
271 
272   /// Instruction info.
273   const SIInstrInfo *TII = nullptr;
274 
275   IsaVersion IV;
276 
277   /// Whether to insert cache invalidating instructions.
278   bool InsertCacheInv;
279 
280   SICacheControl(const GCNSubtarget &ST);
281 
282   /// Sets named bit \p BitName to "true" if present in instruction \p MI.
283   /// \returns Returns true if \p MI is modified, false otherwise.
284   bool enableNamedBit(const MachineBasicBlock::iterator MI,
285                       AMDGPU::CPol::CPol Bit) const;
286 
287 public:
288 
289   /// Create a cache control for the subtarget \p ST.
290   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
291 
292   /// Update \p MI memory load instruction to bypass any caches up to
293   /// the \p Scope memory scope for address spaces \p
294   /// AddrSpace. Return true iff the instruction was modified.
295   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
296                                      SIAtomicScope Scope,
297                                      SIAtomicAddrSpace AddrSpace) const = 0;
298 
299   /// Update \p MI memory store instruction to bypass any caches up to
300   /// the \p Scope memory scope for address spaces \p
301   /// AddrSpace. Return true iff the instruction was modified.
302   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
303                                       SIAtomicScope Scope,
304                                       SIAtomicAddrSpace AddrSpace) const = 0;
305 
306   /// Update \p MI memory read-modify-write instruction to bypass any caches up
307   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
308   /// iff the instruction was modified.
309   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
310                                     SIAtomicScope Scope,
311                                     SIAtomicAddrSpace AddrSpace) const = 0;
312 
313   /// Update \p MI memory instruction of kind \p Op associated with address
314   /// spaces \p AddrSpace to indicate it is volatile and/or
315   /// nontemporal/last-use. Return true iff the instruction was modified.
316   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
317                                               SIAtomicAddrSpace AddrSpace,
318                                               SIMemOp Op, bool IsVolatile,
319                                               bool IsNonTemporal,
320                                               bool IsLastUse = false) const = 0;
321 
322   virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
323     return false;
324   };
325 
326   /// Inserts any necessary instructions at position \p Pos relative
327   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
328   /// \p Op associated with address spaces \p AddrSpace have completed. Used
329   /// between memory instructions to enforce the order they become visible as
330   /// observed by other memory instructions executing in memory scope \p Scope.
331   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
332   /// address spaces. Returns true iff any instructions inserted.
333   virtual bool insertWait(MachineBasicBlock::iterator &MI,
334                           SIAtomicScope Scope,
335                           SIAtomicAddrSpace AddrSpace,
336                           SIMemOp Op,
337                           bool IsCrossAddrSpaceOrdering,
338                           Position Pos) const = 0;
339 
340   /// Inserts any necessary instructions at position \p Pos relative to
341   /// instruction \p MI to ensure any subsequent memory instructions of this
342   /// thread with address spaces \p AddrSpace will observe the previous memory
343   /// operations by any thread for memory scopes up to memory scope \p Scope .
344   /// Returns true iff any instructions inserted.
345   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
346                              SIAtomicScope Scope,
347                              SIAtomicAddrSpace AddrSpace,
348                              Position Pos) const = 0;
349 
350   /// Inserts any necessary instructions at position \p Pos relative to
351   /// instruction \p MI to ensure previous memory instructions by this thread
352   /// with address spaces \p AddrSpace have completed and can be observed by
353   /// subsequent memory instructions by any thread executing in memory scope \p
354   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
355   /// between address spaces. Returns true iff any instructions inserted.
356   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
357                              SIAtomicScope Scope,
358                              SIAtomicAddrSpace AddrSpace,
359                              bool IsCrossAddrSpaceOrdering,
360                              Position Pos) const = 0;
361 
362   /// Virtual destructor to allow derivations to be deleted.
363   virtual ~SICacheControl() = default;
364 
365   virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
366                                    MachineBasicBlock::iterator &MI) const {
367     return false;
368   }
369 };
370 
371 class SIGfx6CacheControl : public SICacheControl {
372 protected:
373 
374   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
375   /// is modified, false otherwise.
376   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
377     return enableNamedBit(MI, AMDGPU::CPol::GLC);
378   }
379 
380   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
381   /// is modified, false otherwise.
382   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
383     return enableNamedBit(MI, AMDGPU::CPol::SLC);
384   }
385 
386 public:
387 
388   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
389 
390   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
391                              SIAtomicScope Scope,
392                              SIAtomicAddrSpace AddrSpace) const override;
393 
394   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
395                               SIAtomicScope Scope,
396                               SIAtomicAddrSpace AddrSpace) const override;
397 
398   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
399                             SIAtomicScope Scope,
400                             SIAtomicAddrSpace AddrSpace) const override;
401 
402   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
403                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
404                                       bool IsVolatile, bool IsNonTemporal,
405                                       bool IsLastUse) const override;
406 
407   bool insertWait(MachineBasicBlock::iterator &MI,
408                   SIAtomicScope Scope,
409                   SIAtomicAddrSpace AddrSpace,
410                   SIMemOp Op,
411                   bool IsCrossAddrSpaceOrdering,
412                   Position Pos) const override;
413 
414   bool insertAcquire(MachineBasicBlock::iterator &MI,
415                      SIAtomicScope Scope,
416                      SIAtomicAddrSpace AddrSpace,
417                      Position Pos) const override;
418 
419   bool insertRelease(MachineBasicBlock::iterator &MI,
420                      SIAtomicScope Scope,
421                      SIAtomicAddrSpace AddrSpace,
422                      bool IsCrossAddrSpaceOrdering,
423                      Position Pos) const override;
424 };
425 
426 class SIGfx7CacheControl : public SIGfx6CacheControl {
427 public:
428 
429   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
430 
431   bool insertAcquire(MachineBasicBlock::iterator &MI,
432                      SIAtomicScope Scope,
433                      SIAtomicAddrSpace AddrSpace,
434                      Position Pos) const override;
435 
436 };
437 
438 class SIGfx90ACacheControl : public SIGfx7CacheControl {
439 public:
440 
441   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
442 
443   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
444                              SIAtomicScope Scope,
445                              SIAtomicAddrSpace AddrSpace) const override;
446 
447   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
448                               SIAtomicScope Scope,
449                               SIAtomicAddrSpace AddrSpace) const override;
450 
451   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
452                             SIAtomicScope Scope,
453                             SIAtomicAddrSpace AddrSpace) const override;
454 
455   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
456                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
457                                       bool IsVolatile, bool IsNonTemporal,
458                                       bool IsLastUse) const override;
459 
460   bool insertWait(MachineBasicBlock::iterator &MI,
461                   SIAtomicScope Scope,
462                   SIAtomicAddrSpace AddrSpace,
463                   SIMemOp Op,
464                   bool IsCrossAddrSpaceOrdering,
465                   Position Pos) const override;
466 
467   bool insertAcquire(MachineBasicBlock::iterator &MI,
468                      SIAtomicScope Scope,
469                      SIAtomicAddrSpace AddrSpace,
470                      Position Pos) const override;
471 
472   bool insertRelease(MachineBasicBlock::iterator &MI,
473                      SIAtomicScope Scope,
474                      SIAtomicAddrSpace AddrSpace,
475                      bool IsCrossAddrSpaceOrdering,
476                      Position Pos) const override;
477 };
478 
479 class SIGfx940CacheControl : public SIGfx90ACacheControl {
480 protected:
481 
482   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
483   /// is modified, false otherwise.
484   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
485     return enableNamedBit(MI, AMDGPU::CPol::SC0);
486   }
487 
488   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
489   /// is modified, false otherwise.
490   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
491     return enableNamedBit(MI, AMDGPU::CPol::SC1);
492   }
493 
494   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
495   /// is modified, false otherwise.
496   bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
497     return enableNamedBit(MI, AMDGPU::CPol::NT);
498   }
499 
500 public:
501 
502   SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
503 
504   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
505                              SIAtomicScope Scope,
506                              SIAtomicAddrSpace AddrSpace) const override;
507 
508   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
509                               SIAtomicScope Scope,
510                               SIAtomicAddrSpace AddrSpace) const override;
511 
512   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
513                             SIAtomicScope Scope,
514                             SIAtomicAddrSpace AddrSpace) const override;
515 
516   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
517                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
518                                       bool IsVolatile, bool IsNonTemporal,
519                                       bool IsLastUse) const override;
520 
521   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
522                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
523 
524   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
525                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
526                      Position Pos) const override;
527 
528   bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
529                            MachineBasicBlock::iterator &MI) const override {
530     bool Changed = false;
531     if (ST.hasForceStoreSC0SC1() &&
532         (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
533                                     SIAtomicAddrSpace::GLOBAL |
534                                     SIAtomicAddrSpace::OTHER)) !=
535          SIAtomicAddrSpace::NONE) {
536       Changed |= enableSC0Bit(MI);
537       Changed |= enableSC1Bit(MI);
538     }
539     return Changed;
540   }
541 };
542 
543 class SIGfx10CacheControl : public SIGfx7CacheControl {
544 protected:
545 
546   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
547   /// is modified, false otherwise.
548   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
549     return enableNamedBit(MI, AMDGPU::CPol::DLC);
550   }
551 
552 public:
553 
554   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
555 
556   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
557                              SIAtomicScope Scope,
558                              SIAtomicAddrSpace AddrSpace) const override;
559 
560   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
561                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
562                                       bool IsVolatile, bool IsNonTemporal,
563                                       bool IsLastUse) const override;
564 
565   bool insertWait(MachineBasicBlock::iterator &MI,
566                   SIAtomicScope Scope,
567                   SIAtomicAddrSpace AddrSpace,
568                   SIMemOp Op,
569                   bool IsCrossAddrSpaceOrdering,
570                   Position Pos) const override;
571 
572   bool insertAcquire(MachineBasicBlock::iterator &MI,
573                      SIAtomicScope Scope,
574                      SIAtomicAddrSpace AddrSpace,
575                      Position Pos) const override;
576 };
577 
578 class SIGfx11CacheControl : public SIGfx10CacheControl {
579 public:
580   SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
581 
582   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
583                              SIAtomicScope Scope,
584                              SIAtomicAddrSpace AddrSpace) const override;
585 
586   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
587                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
588                                       bool IsVolatile, bool IsNonTemporal,
589                                       bool IsLastUse) const override;
590 };
591 
592 class SIGfx12CacheControl : public SIGfx11CacheControl {
593 protected:
594   // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
595   // \returns Returns true if \p MI is modified, false otherwise.
596   bool setTH(const MachineBasicBlock::iterator MI,
597              AMDGPU::CPol::CPol Value) const;
598   // Sets Scope policy to \p Value if CPol operand is present in instruction \p
599   // MI. \returns Returns true if \p MI is modified, false otherwise.
600   bool setScope(const MachineBasicBlock::iterator MI,
601                 AMDGPU::CPol::CPol Value) const;
602 
603   // Stores with system scope (SCOPE_SYS) need to wait for:
604   // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
605   // - non-returning-atomics       - wait for STORECNT==0
606   //   TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
607   //   since it does not distinguish atomics-with-return from regular stores.
608   // There is no need to wait if memory is cached (mtype != UC).
609   bool
610   insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
611 
612   bool setAtomicScope(const MachineBasicBlock::iterator &MI,
613                       SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
614 
615 public:
616   SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
617 
618   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
619                   SIAtomicAddrSpace AddrSpace, SIMemOp Op,
620                   bool IsCrossAddrSpaceOrdering, Position Pos) const override;
621 
622   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
623                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
624 
625   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
626                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
627                                       bool IsVolatile, bool IsNonTemporal,
628                                       bool IsLastUse) const override;
629 
630   bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
631 
632   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
633                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
634                      Position Pos) const override;
635 
636   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
637                              SIAtomicScope Scope,
638                              SIAtomicAddrSpace AddrSpace) const override {
639     return setAtomicScope(MI, Scope, AddrSpace);
640   }
641 
642   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
643                               SIAtomicScope Scope,
644                               SIAtomicAddrSpace AddrSpace) const override {
645     return setAtomicScope(MI, Scope, AddrSpace);
646   }
647 
648   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
649                             SIAtomicScope Scope,
650                             SIAtomicAddrSpace AddrSpace) const override {
651     return setAtomicScope(MI, Scope, AddrSpace);
652   }
653 };
654 
655 class SIMemoryLegalizer final : public MachineFunctionPass {
656 private:
657 
658   /// Cache Control.
659   std::unique_ptr<SICacheControl> CC = nullptr;
660 
661   /// List of atomic pseudo instructions.
662   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
663 
664   /// Return true iff instruction \p MI is a atomic instruction that
665   /// returns a result.
666   bool isAtomicRet(const MachineInstr &MI) const {
667     return SIInstrInfo::isAtomicRet(MI);
668   }
669 
670   /// Removes all processed atomic pseudo instructions from the current
671   /// function. Returns true if current function is modified, false otherwise.
672   bool removeAtomicPseudoMIs();
673 
674   /// Expands load operation \p MI. Returns true if instructions are
675   /// added/deleted or \p MI is modified, false otherwise.
676   bool expandLoad(const SIMemOpInfo &MOI,
677                   MachineBasicBlock::iterator &MI);
678   /// Expands store operation \p MI. Returns true if instructions are
679   /// added/deleted or \p MI is modified, false otherwise.
680   bool expandStore(const SIMemOpInfo &MOI,
681                    MachineBasicBlock::iterator &MI);
682   /// Expands atomic fence operation \p MI. Returns true if
683   /// instructions are added/deleted or \p MI is modified, false otherwise.
684   bool expandAtomicFence(const SIMemOpInfo &MOI,
685                          MachineBasicBlock::iterator &MI);
686   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
687   /// instructions are added/deleted or \p MI is modified, false otherwise.
688   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
689                                 MachineBasicBlock::iterator &MI);
690 
691 public:
692   static char ID;
693 
694   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
695 
696   void getAnalysisUsage(AnalysisUsage &AU) const override {
697     AU.setPreservesCFG();
698     MachineFunctionPass::getAnalysisUsage(AU);
699   }
700 
701   StringRef getPassName() const override {
702     return PASS_NAME;
703   }
704 
705   bool runOnMachineFunction(MachineFunction &MF) override;
706 };
707 
708 static const StringMap<SIAtomicAddrSpace> ASNames = {{
709     {"global", SIAtomicAddrSpace::GLOBAL},
710     {"local", SIAtomicAddrSpace::LDS},
711 }};
712 
713 void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
714   const MachineFunction *MF = MI.getMF();
715   const Function &Fn = MF->getFunction();
716   SmallString<128> Str;
717   raw_svector_ostream OS(Str);
718   OS << "unknown address space '" << AS << "'; expected one of ";
719   ListSeparator LS;
720   for (const auto &[Name, Val] : ASNames)
721     OS << LS << '\'' << Name << '\'';
722   DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning);
723   Fn.getContext().diagnose(BadTag);
724 }
725 
726 /// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
727 /// If this tag isn't present, or if it has no meaningful values, returns \p
728 /// Default. Otherwise returns all the address spaces concerned by the MMRA.
729 static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
730                                                SIAtomicAddrSpace Default) {
731   static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
732 
733   auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
734   if (!MMRA)
735     return Default;
736 
737   SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
738   for (const auto &[Prefix, Suffix] : MMRA) {
739     if (Prefix != FenceASPrefix)
740       continue;
741 
742     if (auto It = ASNames.find(Suffix); It != ASNames.end())
743       Result |= It->second;
744     else
745       diagnoseUnknownMMRAASName(MI, Suffix);
746   }
747 
748   return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
749 }
750 
751 } // end anonymous namespace
752 
753 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
754                                       const char *Msg) const {
755   const Function &Func = MI->getParent()->getParent()->getFunction();
756   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
757   Func.getContext().diagnose(Diag);
758 }
759 
760 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
761 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
762                                SIAtomicAddrSpace InstrAddrSpace) const {
763   if (SSID == SyncScope::System)
764     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
765   if (SSID == MMI->getAgentSSID())
766     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
767   if (SSID == MMI->getWorkgroupSSID())
768     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
769                       true);
770   if (SSID == MMI->getWavefrontSSID())
771     return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
772                       true);
773   if (SSID == SyncScope::SingleThread)
774     return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
775                       true);
776   if (SSID == MMI->getSystemOneAddressSpaceSSID())
777     return std::tuple(SIAtomicScope::SYSTEM,
778                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
779   if (SSID == MMI->getAgentOneAddressSpaceSSID())
780     return std::tuple(SIAtomicScope::AGENT,
781                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
782   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
783     return std::tuple(SIAtomicScope::WORKGROUP,
784                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
785   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
786     return std::tuple(SIAtomicScope::WAVEFRONT,
787                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
788   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
789     return std::tuple(SIAtomicScope::SINGLETHREAD,
790                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
791   return std::nullopt;
792 }
793 
794 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
795   if (AS == AMDGPUAS::FLAT_ADDRESS)
796     return SIAtomicAddrSpace::FLAT;
797   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
798     return SIAtomicAddrSpace::GLOBAL;
799   if (AS == AMDGPUAS::LOCAL_ADDRESS)
800     return SIAtomicAddrSpace::LDS;
801   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
802     return SIAtomicAddrSpace::SCRATCH;
803   if (AS == AMDGPUAS::REGION_ADDRESS)
804     return SIAtomicAddrSpace::GDS;
805 
806   return SIAtomicAddrSpace::OTHER;
807 }
808 
809 SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
810     : MMI(&MMI_) {}
811 
812 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
813     const MachineBasicBlock::iterator &MI) const {
814   assert(MI->getNumMemOperands() > 0);
815 
816   SyncScope::ID SSID = SyncScope::SingleThread;
817   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
818   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
819   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
820   bool IsNonTemporal = true;
821   bool IsVolatile = false;
822   bool IsLastUse = false;
823 
824   // Validator should check whether or not MMOs cover the entire set of
825   // locations accessed by the memory instruction.
826   for (const auto &MMO : MI->memoperands()) {
827     IsNonTemporal &= MMO->isNonTemporal();
828     IsVolatile |= MMO->isVolatile();
829     IsLastUse |= MMO->getFlags() & MOLastUse;
830     InstrAddrSpace |=
831       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
832     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
833     if (OpOrdering != AtomicOrdering::NotAtomic) {
834       const auto &IsSyncScopeInclusion =
835           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
836       if (!IsSyncScopeInclusion) {
837         reportUnsupported(MI,
838           "Unsupported non-inclusive atomic synchronization scope");
839         return std::nullopt;
840       }
841 
842       SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
843       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
844       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
845              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
846       FailureOrdering =
847           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
848     }
849   }
850 
851   SIAtomicScope Scope = SIAtomicScope::NONE;
852   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
853   bool IsCrossAddressSpaceOrdering = false;
854   if (Ordering != AtomicOrdering::NotAtomic) {
855     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
856     if (!ScopeOrNone) {
857       reportUnsupported(MI, "Unsupported atomic synchronization scope");
858       return std::nullopt;
859     }
860     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
861         *ScopeOrNone;
862     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
863         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
864         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
865       reportUnsupported(MI, "Unsupported atomic address space");
866       return std::nullopt;
867     }
868   }
869   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
870                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
871                      IsNonTemporal, IsLastUse);
872 }
873 
874 std::optional<SIMemOpInfo>
875 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
876   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
877 
878   if (!(MI->mayLoad() && !MI->mayStore()))
879     return std::nullopt;
880 
881   // Be conservative if there are no memory operands.
882   if (MI->getNumMemOperands() == 0)
883     return SIMemOpInfo();
884 
885   return constructFromMIWithMMO(MI);
886 }
887 
888 std::optional<SIMemOpInfo>
889 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
890   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
891 
892   if (!(!MI->mayLoad() && MI->mayStore()))
893     return std::nullopt;
894 
895   // Be conservative if there are no memory operands.
896   if (MI->getNumMemOperands() == 0)
897     return SIMemOpInfo();
898 
899   return constructFromMIWithMMO(MI);
900 }
901 
902 std::optional<SIMemOpInfo>
903 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
904   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
905 
906   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
907     return std::nullopt;
908 
909   AtomicOrdering Ordering =
910     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
911 
912   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
913   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
914   if (!ScopeOrNone) {
915     reportUnsupported(MI, "Unsupported atomic synchronization scope");
916     return std::nullopt;
917   }
918 
919   SIAtomicScope Scope = SIAtomicScope::NONE;
920   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
921   bool IsCrossAddressSpaceOrdering = false;
922   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
923       *ScopeOrNone;
924 
925   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
926       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
927     reportUnsupported(MI, "Unsupported atomic address space");
928     return std::nullopt;
929   }
930 
931   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
932                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
933 }
934 
935 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
936     const MachineBasicBlock::iterator &MI) const {
937   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
938 
939   if (!(MI->mayLoad() && MI->mayStore()))
940     return std::nullopt;
941 
942   // Be conservative if there are no memory operands.
943   if (MI->getNumMemOperands() == 0)
944     return SIMemOpInfo();
945 
946   return constructFromMIWithMMO(MI);
947 }
948 
949 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
950   TII = ST.getInstrInfo();
951   IV = getIsaVersion(ST.getCPU());
952   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
953 }
954 
955 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
956                                     AMDGPU::CPol::CPol Bit) const {
957   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
958   if (!CPol)
959     return false;
960 
961   CPol->setImm(CPol->getImm() | Bit);
962   return true;
963 }
964 
965 /* static */
966 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
967   GCNSubtarget::Generation Generation = ST.getGeneration();
968   if (ST.hasGFX940Insts())
969     return std::make_unique<SIGfx940CacheControl>(ST);
970   if (ST.hasGFX90AInsts())
971     return std::make_unique<SIGfx90ACacheControl>(ST);
972   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
973     return std::make_unique<SIGfx6CacheControl>(ST);
974   if (Generation < AMDGPUSubtarget::GFX10)
975     return std::make_unique<SIGfx7CacheControl>(ST);
976   if (Generation < AMDGPUSubtarget::GFX11)
977     return std::make_unique<SIGfx10CacheControl>(ST);
978   if (Generation < AMDGPUSubtarget::GFX12)
979     return std::make_unique<SIGfx11CacheControl>(ST);
980   return std::make_unique<SIGfx12CacheControl>(ST);
981 }
982 
983 bool SIGfx6CacheControl::enableLoadCacheBypass(
984     const MachineBasicBlock::iterator &MI,
985     SIAtomicScope Scope,
986     SIAtomicAddrSpace AddrSpace) const {
987   assert(MI->mayLoad() && !MI->mayStore());
988   bool Changed = false;
989 
990   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
991     switch (Scope) {
992     case SIAtomicScope::SYSTEM:
993     case SIAtomicScope::AGENT:
994       // Set L1 cache policy to MISS_EVICT.
995       // Note: there is no L2 cache bypass policy at the ISA level.
996       Changed |= enableGLCBit(MI);
997       break;
998     case SIAtomicScope::WORKGROUP:
999     case SIAtomicScope::WAVEFRONT:
1000     case SIAtomicScope::SINGLETHREAD:
1001       // No cache to bypass.
1002       break;
1003     default:
1004       llvm_unreachable("Unsupported synchronization scope");
1005     }
1006   }
1007 
1008   /// The scratch address space does not need the global memory caches
1009   /// to be bypassed as all memory operations by the same thread are
1010   /// sequentially consistent, and no other thread can access scratch
1011   /// memory.
1012 
1013   /// Other address spaces do not have a cache.
1014 
1015   return Changed;
1016 }
1017 
1018 bool SIGfx6CacheControl::enableStoreCacheBypass(
1019     const MachineBasicBlock::iterator &MI,
1020     SIAtomicScope Scope,
1021     SIAtomicAddrSpace AddrSpace) const {
1022   assert(!MI->mayLoad() && MI->mayStore());
1023   bool Changed = false;
1024 
1025   /// The L1 cache is write through so does not need to be bypassed. There is no
1026   /// bypass control for the L2 cache at the isa level.
1027 
1028   return Changed;
1029 }
1030 
1031 bool SIGfx6CacheControl::enableRMWCacheBypass(
1032     const MachineBasicBlock::iterator &MI,
1033     SIAtomicScope Scope,
1034     SIAtomicAddrSpace AddrSpace) const {
1035   assert(MI->mayLoad() && MI->mayStore());
1036   bool Changed = false;
1037 
1038   /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
1039   /// bypassed, and the GLC bit is instead used to indicate if they are
1040   /// return or no-return.
1041   /// Note: there is no L2 cache coherent bypass control at the ISA level.
1042 
1043   return Changed;
1044 }
1045 
1046 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1047     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1048     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1049   // Only handle load and store, not atomic read-modify-write insructions. The
1050   // latter use glc to indicate if the atomic returns a result and so must not
1051   // be used for cache control.
1052   assert(MI->mayLoad() ^ MI->mayStore());
1053 
1054   // Only update load and store, not LLVM IR atomic read-modify-write
1055   // instructions. The latter are always marked as volatile so cannot sensibly
1056   // handle it as do not want to pessimize all atomics. Also they do not support
1057   // the nontemporal attribute.
1058   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1059 
1060   bool Changed = false;
1061 
1062   if (IsVolatile) {
1063     // Set L1 cache policy to be MISS_EVICT for load instructions
1064     // and MISS_LRU for store instructions.
1065     // Note: there is no L2 cache bypass policy at the ISA level.
1066     if (Op == SIMemOp::LOAD)
1067       Changed |= enableGLCBit(MI);
1068 
1069     // Ensure operation has completed at system scope to cause all volatile
1070     // operations to be visible outside the program in a global order. Do not
1071     // request cross address space as only the global address space can be
1072     // observable outside the program, so no need to cause a waitcnt for LDS
1073     // address space operations.
1074     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1075                           Position::AFTER);
1076 
1077     return Changed;
1078   }
1079 
1080   if (IsNonTemporal) {
1081     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1082     // for both loads and stores, and the L2 cache policy to STREAM.
1083     Changed |= enableGLCBit(MI);
1084     Changed |= enableSLCBit(MI);
1085     return Changed;
1086   }
1087 
1088   return Changed;
1089 }
1090 
1091 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1092                                     SIAtomicScope Scope,
1093                                     SIAtomicAddrSpace AddrSpace,
1094                                     SIMemOp Op,
1095                                     bool IsCrossAddrSpaceOrdering,
1096                                     Position Pos) const {
1097   bool Changed = false;
1098 
1099   MachineBasicBlock &MBB = *MI->getParent();
1100   DebugLoc DL = MI->getDebugLoc();
1101 
1102   if (Pos == Position::AFTER)
1103     ++MI;
1104 
1105   bool VMCnt = false;
1106   bool LGKMCnt = false;
1107 
1108   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1109       SIAtomicAddrSpace::NONE) {
1110     switch (Scope) {
1111     case SIAtomicScope::SYSTEM:
1112     case SIAtomicScope::AGENT:
1113       VMCnt |= true;
1114       break;
1115     case SIAtomicScope::WORKGROUP:
1116     case SIAtomicScope::WAVEFRONT:
1117     case SIAtomicScope::SINGLETHREAD:
1118       // The L1 cache keeps all memory operations in order for
1119       // wavefronts in the same work-group.
1120       break;
1121     default:
1122       llvm_unreachable("Unsupported synchronization scope");
1123     }
1124   }
1125 
1126   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1127     switch (Scope) {
1128     case SIAtomicScope::SYSTEM:
1129     case SIAtomicScope::AGENT:
1130     case SIAtomicScope::WORKGROUP:
1131       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1132       // not needed as LDS operations for all waves are executed in a total
1133       // global ordering as observed by all waves. Required if also
1134       // synchronizing with global/GDS memory as LDS operations could be
1135       // reordered with respect to later global/GDS memory operations of the
1136       // same wave.
1137       LGKMCnt |= IsCrossAddrSpaceOrdering;
1138       break;
1139     case SIAtomicScope::WAVEFRONT:
1140     case SIAtomicScope::SINGLETHREAD:
1141       // The LDS keeps all memory operations in order for
1142       // the same wavefront.
1143       break;
1144     default:
1145       llvm_unreachable("Unsupported synchronization scope");
1146     }
1147   }
1148 
1149   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1150     switch (Scope) {
1151     case SIAtomicScope::SYSTEM:
1152     case SIAtomicScope::AGENT:
1153       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1154       // is not needed as GDS operations for all waves are executed in a total
1155       // global ordering as observed by all waves. Required if also
1156       // synchronizing with global/LDS memory as GDS operations could be
1157       // reordered with respect to later global/LDS memory operations of the
1158       // same wave.
1159       LGKMCnt |= IsCrossAddrSpaceOrdering;
1160       break;
1161     case SIAtomicScope::WORKGROUP:
1162     case SIAtomicScope::WAVEFRONT:
1163     case SIAtomicScope::SINGLETHREAD:
1164       // The GDS keeps all memory operations in order for
1165       // the same work-group.
1166       break;
1167     default:
1168       llvm_unreachable("Unsupported synchronization scope");
1169     }
1170   }
1171 
1172   if (VMCnt || LGKMCnt) {
1173     unsigned WaitCntImmediate =
1174       AMDGPU::encodeWaitcnt(IV,
1175                             VMCnt ? 0 : getVmcntBitMask(IV),
1176                             getExpcntBitMask(IV),
1177                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1178     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1179         .addImm(WaitCntImmediate);
1180     Changed = true;
1181   }
1182 
1183   if (Pos == Position::AFTER)
1184     --MI;
1185 
1186   return Changed;
1187 }
1188 
1189 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1190                                        SIAtomicScope Scope,
1191                                        SIAtomicAddrSpace AddrSpace,
1192                                        Position Pos) const {
1193   if (!InsertCacheInv)
1194     return false;
1195 
1196   bool Changed = false;
1197 
1198   MachineBasicBlock &MBB = *MI->getParent();
1199   DebugLoc DL = MI->getDebugLoc();
1200 
1201   if (Pos == Position::AFTER)
1202     ++MI;
1203 
1204   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1205     switch (Scope) {
1206     case SIAtomicScope::SYSTEM:
1207     case SIAtomicScope::AGENT:
1208       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1209       Changed = true;
1210       break;
1211     case SIAtomicScope::WORKGROUP:
1212     case SIAtomicScope::WAVEFRONT:
1213     case SIAtomicScope::SINGLETHREAD:
1214       // No cache to invalidate.
1215       break;
1216     default:
1217       llvm_unreachable("Unsupported synchronization scope");
1218     }
1219   }
1220 
1221   /// The scratch address space does not need the global memory cache
1222   /// to be flushed as all memory operations by the same thread are
1223   /// sequentially consistent, and no other thread can access scratch
1224   /// memory.
1225 
1226   /// Other address spaces do not have a cache.
1227 
1228   if (Pos == Position::AFTER)
1229     --MI;
1230 
1231   return Changed;
1232 }
1233 
1234 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1235                                        SIAtomicScope Scope,
1236                                        SIAtomicAddrSpace AddrSpace,
1237                                        bool IsCrossAddrSpaceOrdering,
1238                                        Position Pos) const {
1239   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1240                     IsCrossAddrSpaceOrdering, Pos);
1241 }
1242 
1243 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1244                                        SIAtomicScope Scope,
1245                                        SIAtomicAddrSpace AddrSpace,
1246                                        Position Pos) const {
1247   if (!InsertCacheInv)
1248     return false;
1249 
1250   bool Changed = false;
1251 
1252   MachineBasicBlock &MBB = *MI->getParent();
1253   DebugLoc DL = MI->getDebugLoc();
1254 
1255   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1256 
1257   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1258                                     ? AMDGPU::BUFFER_WBINVL1
1259                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1260 
1261   if (Pos == Position::AFTER)
1262     ++MI;
1263 
1264   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1265     switch (Scope) {
1266     case SIAtomicScope::SYSTEM:
1267     case SIAtomicScope::AGENT:
1268       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1269       Changed = true;
1270       break;
1271     case SIAtomicScope::WORKGROUP:
1272     case SIAtomicScope::WAVEFRONT:
1273     case SIAtomicScope::SINGLETHREAD:
1274       // No cache to invalidate.
1275       break;
1276     default:
1277       llvm_unreachable("Unsupported synchronization scope");
1278     }
1279   }
1280 
1281   /// The scratch address space does not need the global memory cache
1282   /// to be flushed as all memory operations by the same thread are
1283   /// sequentially consistent, and no other thread can access scratch
1284   /// memory.
1285 
1286   /// Other address spaces do not have a cache.
1287 
1288   if (Pos == Position::AFTER)
1289     --MI;
1290 
1291   return Changed;
1292 }
1293 
1294 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1295     const MachineBasicBlock::iterator &MI,
1296     SIAtomicScope Scope,
1297     SIAtomicAddrSpace AddrSpace) const {
1298   assert(MI->mayLoad() && !MI->mayStore());
1299   bool Changed = false;
1300 
1301   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1302     switch (Scope) {
1303     case SIAtomicScope::SYSTEM:
1304     case SIAtomicScope::AGENT:
1305       // Set the L1 cache policy to MISS_LRU.
1306       // Note: there is no L2 cache bypass policy at the ISA level.
1307       Changed |= enableGLCBit(MI);
1308       break;
1309     case SIAtomicScope::WORKGROUP:
1310       // In threadgroup split mode the waves of a work-group can be executing on
1311       // different CUs. Therefore need to bypass the L1 which is per CU.
1312       // Otherwise in non-threadgroup split mode all waves of a work-group are
1313       // on the same CU, and so the L1 does not need to be bypassed.
1314       if (ST.isTgSplitEnabled())
1315         Changed |= enableGLCBit(MI);
1316       break;
1317     case SIAtomicScope::WAVEFRONT:
1318     case SIAtomicScope::SINGLETHREAD:
1319       // No cache to bypass.
1320       break;
1321     default:
1322       llvm_unreachable("Unsupported synchronization scope");
1323     }
1324   }
1325 
1326   /// The scratch address space does not need the global memory caches
1327   /// to be bypassed as all memory operations by the same thread are
1328   /// sequentially consistent, and no other thread can access scratch
1329   /// memory.
1330 
1331   /// Other address spaces do not have a cache.
1332 
1333   return Changed;
1334 }
1335 
1336 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1337     const MachineBasicBlock::iterator &MI,
1338     SIAtomicScope Scope,
1339     SIAtomicAddrSpace AddrSpace) const {
1340   assert(!MI->mayLoad() && MI->mayStore());
1341   bool Changed = false;
1342 
1343   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1344     switch (Scope) {
1345     case SIAtomicScope::SYSTEM:
1346     case SIAtomicScope::AGENT:
1347       /// Do not set glc for store atomic operations as they implicitly write
1348       /// through the L1 cache.
1349       break;
1350     case SIAtomicScope::WORKGROUP:
1351     case SIAtomicScope::WAVEFRONT:
1352     case SIAtomicScope::SINGLETHREAD:
1353       // No cache to bypass. Store atomics implicitly write through the L1
1354       // cache.
1355       break;
1356     default:
1357       llvm_unreachable("Unsupported synchronization scope");
1358     }
1359   }
1360 
1361   /// The scratch address space does not need the global memory caches
1362   /// to be bypassed as all memory operations by the same thread are
1363   /// sequentially consistent, and no other thread can access scratch
1364   /// memory.
1365 
1366   /// Other address spaces do not have a cache.
1367 
1368   return Changed;
1369 }
1370 
1371 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1372     const MachineBasicBlock::iterator &MI,
1373     SIAtomicScope Scope,
1374     SIAtomicAddrSpace AddrSpace) const {
1375   assert(MI->mayLoad() && MI->mayStore());
1376   bool Changed = false;
1377 
1378   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1379     switch (Scope) {
1380     case SIAtomicScope::SYSTEM:
1381     case SIAtomicScope::AGENT:
1382       /// Do not set glc for RMW atomic operations as they implicitly bypass
1383       /// the L1 cache, and the glc bit is instead used to indicate if they are
1384       /// return or no-return.
1385       break;
1386     case SIAtomicScope::WORKGROUP:
1387     case SIAtomicScope::WAVEFRONT:
1388     case SIAtomicScope::SINGLETHREAD:
1389       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1390       break;
1391     default:
1392       llvm_unreachable("Unsupported synchronization scope");
1393     }
1394   }
1395 
1396   return Changed;
1397 }
1398 
1399 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1400     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1401     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1402   // Only handle load and store, not atomic read-modify-write insructions. The
1403   // latter use glc to indicate if the atomic returns a result and so must not
1404   // be used for cache control.
1405   assert(MI->mayLoad() ^ MI->mayStore());
1406 
1407   // Only update load and store, not LLVM IR atomic read-modify-write
1408   // instructions. The latter are always marked as volatile so cannot sensibly
1409   // handle it as do not want to pessimize all atomics. Also they do not support
1410   // the nontemporal attribute.
1411   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1412 
1413   bool Changed = false;
1414 
1415   if (IsVolatile) {
1416     // Set L1 cache policy to be MISS_EVICT for load instructions
1417     // and MISS_LRU for store instructions.
1418     // Note: there is no L2 cache bypass policy at the ISA level.
1419     if (Op == SIMemOp::LOAD)
1420       Changed |= enableGLCBit(MI);
1421 
1422     // Ensure operation has completed at system scope to cause all volatile
1423     // operations to be visible outside the program in a global order. Do not
1424     // request cross address space as only the global address space can be
1425     // observable outside the program, so no need to cause a waitcnt for LDS
1426     // address space operations.
1427     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1428                           Position::AFTER);
1429 
1430     return Changed;
1431   }
1432 
1433   if (IsNonTemporal) {
1434     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1435     // for both loads and stores, and the L2 cache policy to STREAM.
1436     Changed |= enableGLCBit(MI);
1437     Changed |= enableSLCBit(MI);
1438     return Changed;
1439   }
1440 
1441   return Changed;
1442 }
1443 
1444 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1445                                       SIAtomicScope Scope,
1446                                       SIAtomicAddrSpace AddrSpace,
1447                                       SIMemOp Op,
1448                                       bool IsCrossAddrSpaceOrdering,
1449                                       Position Pos) const {
1450   if (ST.isTgSplitEnabled()) {
1451     // In threadgroup split mode the waves of a work-group can be executing on
1452     // different CUs. Therefore need to wait for global or GDS memory operations
1453     // to complete to ensure they are visible to waves in the other CUs.
1454     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1455     // the same CU, so no need to wait for global memory as all waves in the
1456     // work-group access the same the L1, nor wait for GDS as access are ordered
1457     // on a CU.
1458     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1459                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1460         (Scope == SIAtomicScope::WORKGROUP)) {
1461       // Same as GFX7 using agent scope.
1462       Scope = SIAtomicScope::AGENT;
1463     }
1464     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1465     // LDS memory operations.
1466     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1467   }
1468   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1469                                         IsCrossAddrSpaceOrdering, Pos);
1470 }
1471 
1472 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1473                                          SIAtomicScope Scope,
1474                                          SIAtomicAddrSpace AddrSpace,
1475                                          Position Pos) const {
1476   if (!InsertCacheInv)
1477     return false;
1478 
1479   bool Changed = false;
1480 
1481   MachineBasicBlock &MBB = *MI->getParent();
1482   DebugLoc DL = MI->getDebugLoc();
1483 
1484   if (Pos == Position::AFTER)
1485     ++MI;
1486 
1487   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1488     switch (Scope) {
1489     case SIAtomicScope::SYSTEM:
1490       // Ensures that following loads will not see stale remote VMEM data or
1491       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1492       // CC will never be stale due to the local memory probes.
1493       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1494       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1495       // hardware does not reorder memory operations by the same wave with
1496       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1497       // remove any cache lines of earlier writes by the same wave and ensures
1498       // later reads by the same wave will refetch the cache lines.
1499       Changed = true;
1500       break;
1501     case SIAtomicScope::AGENT:
1502       // Same as GFX7.
1503       break;
1504     case SIAtomicScope::WORKGROUP:
1505       // In threadgroup split mode the waves of a work-group can be executing on
1506       // different CUs. Therefore need to invalidate the L1 which is per CU.
1507       // Otherwise in non-threadgroup split mode all waves of a work-group are
1508       // on the same CU, and so the L1 does not need to be invalidated.
1509       if (ST.isTgSplitEnabled()) {
1510         // Same as GFX7 using agent scope.
1511         Scope = SIAtomicScope::AGENT;
1512       }
1513       break;
1514     case SIAtomicScope::WAVEFRONT:
1515     case SIAtomicScope::SINGLETHREAD:
1516       // Same as GFX7.
1517       break;
1518     default:
1519       llvm_unreachable("Unsupported synchronization scope");
1520     }
1521   }
1522 
1523   /// The scratch address space does not need the global memory cache
1524   /// to be flushed as all memory operations by the same thread are
1525   /// sequentially consistent, and no other thread can access scratch
1526   /// memory.
1527 
1528   /// Other address spaces do not have a cache.
1529 
1530   if (Pos == Position::AFTER)
1531     --MI;
1532 
1533   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1534 
1535   return Changed;
1536 }
1537 
1538 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1539                                          SIAtomicScope Scope,
1540                                          SIAtomicAddrSpace AddrSpace,
1541                                          bool IsCrossAddrSpaceOrdering,
1542                                          Position Pos) const {
1543   bool Changed = false;
1544 
1545   MachineBasicBlock &MBB = *MI->getParent();
1546   const DebugLoc &DL = MI->getDebugLoc();
1547 
1548   if (Pos == Position::AFTER)
1549     ++MI;
1550 
1551   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1552     switch (Scope) {
1553     case SIAtomicScope::SYSTEM:
1554       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1555       // hardware does not reorder memory operations by the same wave with
1556       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1557       // to initiate writeback of any dirty cache lines of earlier writes by the
1558       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1559       // writeback has completed.
1560       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1561         // Set SC bits to indicate system scope.
1562         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1563       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1564       // vmcnt(0)" needed by the "BUFFER_WBL2".
1565       Changed = true;
1566       break;
1567     case SIAtomicScope::AGENT:
1568     case SIAtomicScope::WORKGROUP:
1569     case SIAtomicScope::WAVEFRONT:
1570     case SIAtomicScope::SINGLETHREAD:
1571       // Same as GFX7.
1572       break;
1573     default:
1574       llvm_unreachable("Unsupported synchronization scope");
1575     }
1576   }
1577 
1578   if (Pos == Position::AFTER)
1579     --MI;
1580 
1581   Changed |=
1582       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1583                                         IsCrossAddrSpaceOrdering, Pos);
1584 
1585   return Changed;
1586 }
1587 
1588 bool SIGfx940CacheControl::enableLoadCacheBypass(
1589     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1590     SIAtomicAddrSpace AddrSpace) const {
1591   assert(MI->mayLoad() && !MI->mayStore());
1592   bool Changed = false;
1593 
1594   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1595     switch (Scope) {
1596     case SIAtomicScope::SYSTEM:
1597       // Set SC bits to indicate system scope.
1598       Changed |= enableSC0Bit(MI);
1599       Changed |= enableSC1Bit(MI);
1600       break;
1601     case SIAtomicScope::AGENT:
1602       // Set SC bits to indicate agent scope.
1603       Changed |= enableSC1Bit(MI);
1604       break;
1605     case SIAtomicScope::WORKGROUP:
1606       // In threadgroup split mode the waves of a work-group can be executing on
1607       // different CUs. Therefore need to bypass the L1 which is per CU.
1608       // Otherwise in non-threadgroup split mode all waves of a work-group are
1609       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1610       // bits to indicate work-group scope will do this automatically.
1611       Changed |= enableSC0Bit(MI);
1612       break;
1613     case SIAtomicScope::WAVEFRONT:
1614     case SIAtomicScope::SINGLETHREAD:
1615       // Leave SC bits unset to indicate wavefront scope.
1616       break;
1617     default:
1618       llvm_unreachable("Unsupported synchronization scope");
1619     }
1620   }
1621 
1622   /// The scratch address space does not need the global memory caches
1623   /// to be bypassed as all memory operations by the same thread are
1624   /// sequentially consistent, and no other thread can access scratch
1625   /// memory.
1626 
1627   /// Other address spaces do not have a cache.
1628 
1629   return Changed;
1630 }
1631 
1632 bool SIGfx940CacheControl::enableStoreCacheBypass(
1633     const MachineBasicBlock::iterator &MI,
1634     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1635   assert(!MI->mayLoad() && MI->mayStore());
1636   bool Changed = false;
1637 
1638   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1639     switch (Scope) {
1640     case SIAtomicScope::SYSTEM:
1641       // Set SC bits to indicate system scope.
1642       Changed |= enableSC0Bit(MI);
1643       Changed |= enableSC1Bit(MI);
1644       break;
1645     case SIAtomicScope::AGENT:
1646       // Set SC bits to indicate agent scope.
1647       Changed |= enableSC1Bit(MI);
1648       break;
1649     case SIAtomicScope::WORKGROUP:
1650       // Set SC bits to indicate workgroup scope.
1651       Changed |= enableSC0Bit(MI);
1652       break;
1653     case SIAtomicScope::WAVEFRONT:
1654     case SIAtomicScope::SINGLETHREAD:
1655       // Leave SC bits unset to indicate wavefront scope.
1656       break;
1657     default:
1658       llvm_unreachable("Unsupported synchronization scope");
1659     }
1660   }
1661 
1662   /// The scratch address space does not need the global memory caches
1663   /// to be bypassed as all memory operations by the same thread are
1664   /// sequentially consistent, and no other thread can access scratch
1665   /// memory.
1666 
1667   /// Other address spaces do not have a cache.
1668 
1669   return Changed;
1670 }
1671 
1672 bool SIGfx940CacheControl::enableRMWCacheBypass(
1673     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1674     SIAtomicAddrSpace AddrSpace) const {
1675   assert(MI->mayLoad() && MI->mayStore());
1676   bool Changed = false;
1677 
1678   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1679     switch (Scope) {
1680     case SIAtomicScope::SYSTEM:
1681       // Set SC1 bit to indicate system scope.
1682       Changed |= enableSC1Bit(MI);
1683       break;
1684     case SIAtomicScope::AGENT:
1685     case SIAtomicScope::WORKGROUP:
1686     case SIAtomicScope::WAVEFRONT:
1687     case SIAtomicScope::SINGLETHREAD:
1688       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1689       // to indicate system or agent scope. The SC0 bit is used to indicate if
1690       // they are return or no-return. Leave SC1 bit unset to indicate agent
1691       // scope.
1692       break;
1693     default:
1694       llvm_unreachable("Unsupported synchronization scope");
1695     }
1696   }
1697 
1698   return Changed;
1699 }
1700 
1701 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1702     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1703     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1704   // Only handle load and store, not atomic read-modify-write insructions. The
1705   // latter use glc to indicate if the atomic returns a result and so must not
1706   // be used for cache control.
1707   assert(MI->mayLoad() ^ MI->mayStore());
1708 
1709   // Only update load and store, not LLVM IR atomic read-modify-write
1710   // instructions. The latter are always marked as volatile so cannot sensibly
1711   // handle it as do not want to pessimize all atomics. Also they do not support
1712   // the nontemporal attribute.
1713   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1714 
1715   bool Changed = false;
1716 
1717   if (IsVolatile) {
1718     // Set SC bits to indicate system scope.
1719     Changed |= enableSC0Bit(MI);
1720     Changed |= enableSC1Bit(MI);
1721 
1722     // Ensure operation has completed at system scope to cause all volatile
1723     // operations to be visible outside the program in a global order. Do not
1724     // request cross address space as only the global address space can be
1725     // observable outside the program, so no need to cause a waitcnt for LDS
1726     // address space operations.
1727     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1728                           Position::AFTER);
1729 
1730     return Changed;
1731   }
1732 
1733   if (IsNonTemporal) {
1734     Changed |= enableNTBit(MI);
1735     return Changed;
1736   }
1737 
1738   return Changed;
1739 }
1740 
1741 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1742                                          SIAtomicScope Scope,
1743                                          SIAtomicAddrSpace AddrSpace,
1744                                          Position Pos) const {
1745   if (!InsertCacheInv)
1746     return false;
1747 
1748   bool Changed = false;
1749 
1750   MachineBasicBlock &MBB = *MI->getParent();
1751   DebugLoc DL = MI->getDebugLoc();
1752 
1753   if (Pos == Position::AFTER)
1754     ++MI;
1755 
1756   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1757     switch (Scope) {
1758     case SIAtomicScope::SYSTEM:
1759       // Ensures that following loads will not see stale remote VMEM data or
1760       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1761       // CC will never be stale due to the local memory probes.
1762       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1763           // Set SC bits to indicate system scope.
1764           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1765       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1766       // hardware does not reorder memory operations by the same wave with
1767       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1768       // remove any cache lines of earlier writes by the same wave and ensures
1769       // later reads by the same wave will refetch the cache lines.
1770       Changed = true;
1771       break;
1772     case SIAtomicScope::AGENT:
1773       // Ensures that following loads will not see stale remote date or local
1774       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1775       // due to the memory probes.
1776       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1777           // Set SC bits to indicate agent scope.
1778           .addImm(AMDGPU::CPol::SC1);
1779       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1780       // does not reorder memory operations with respect to preceeding buffer
1781       // invalidate. The invalidate is guaranteed to remove any cache lines of
1782       // earlier writes and ensures later writes will refetch the cache lines.
1783       Changed = true;
1784       break;
1785     case SIAtomicScope::WORKGROUP:
1786       // In threadgroup split mode the waves of a work-group can be executing on
1787       // different CUs. Therefore need to invalidate the L1 which is per CU.
1788       // Otherwise in non-threadgroup split mode all waves of a work-group are
1789       // on the same CU, and so the L1 does not need to be invalidated.
1790       if (ST.isTgSplitEnabled()) {
1791         // Ensures L1 is invalidated if in threadgroup split mode. In
1792         // non-threadgroup split mode it is a NOP, but no point generating it in
1793         // that case if know not in that mode.
1794         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1795             // Set SC bits to indicate work-group scope.
1796             .addImm(AMDGPU::CPol::SC0);
1797         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1798         // does not reorder memory operations with respect to preceeding buffer
1799         // invalidate. The invalidate is guaranteed to remove any cache lines of
1800         // earlier writes and ensures later writes will refetch the cache lines.
1801         Changed = true;
1802       }
1803       break;
1804     case SIAtomicScope::WAVEFRONT:
1805     case SIAtomicScope::SINGLETHREAD:
1806       // Could generate "BUFFER_INV" but it would do nothing as there are no
1807       // caches to invalidate.
1808       break;
1809     default:
1810       llvm_unreachable("Unsupported synchronization scope");
1811     }
1812   }
1813 
1814   /// The scratch address space does not need the global memory cache
1815   /// to be flushed as all memory operations by the same thread are
1816   /// sequentially consistent, and no other thread can access scratch
1817   /// memory.
1818 
1819   /// Other address spaces do not have a cache.
1820 
1821   if (Pos == Position::AFTER)
1822     --MI;
1823 
1824   return Changed;
1825 }
1826 
1827 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1828                                          SIAtomicScope Scope,
1829                                          SIAtomicAddrSpace AddrSpace,
1830                                          bool IsCrossAddrSpaceOrdering,
1831                                          Position Pos) const {
1832   bool Changed = false;
1833 
1834   MachineBasicBlock &MBB = *MI->getParent();
1835   DebugLoc DL = MI->getDebugLoc();
1836 
1837   if (Pos == Position::AFTER)
1838     ++MI;
1839 
1840   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1841     switch (Scope) {
1842     case SIAtomicScope::SYSTEM:
1843       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1844       // hardware does not reorder memory operations by the same wave with
1845       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1846       // to initiate writeback of any dirty cache lines of earlier writes by the
1847       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1848       // writeback has completed.
1849       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1850           // Set SC bits to indicate system scope.
1851           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1852       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1853       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1854       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1855       Changed = true;
1856       break;
1857     case SIAtomicScope::AGENT:
1858       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1859           // Set SC bits to indicate agent scope.
1860           .addImm(AMDGPU::CPol::SC1);
1861 
1862       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1863       // SIAtomicScope::AGENT, the following insertWait will generate the
1864       // required "S_WAITCNT vmcnt(0)".
1865       Changed = true;
1866       break;
1867     case SIAtomicScope::WORKGROUP:
1868     case SIAtomicScope::WAVEFRONT:
1869     case SIAtomicScope::SINGLETHREAD:
1870       // Do not generate "BUFFER_WBL2" as there are no caches it would
1871       // writeback, and would require an otherwise unnecessary
1872       // "S_WAITCNT vmcnt(0)".
1873       break;
1874     default:
1875       llvm_unreachable("Unsupported synchronization scope");
1876     }
1877   }
1878 
1879   if (Pos == Position::AFTER)
1880     --MI;
1881 
1882   // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1883   // S_WAITCNT needed.
1884   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1885                         IsCrossAddrSpaceOrdering, Pos);
1886 
1887   return Changed;
1888 }
1889 
1890 bool SIGfx10CacheControl::enableLoadCacheBypass(
1891     const MachineBasicBlock::iterator &MI,
1892     SIAtomicScope Scope,
1893     SIAtomicAddrSpace AddrSpace) const {
1894   assert(MI->mayLoad() && !MI->mayStore());
1895   bool Changed = false;
1896 
1897   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1898     switch (Scope) {
1899     case SIAtomicScope::SYSTEM:
1900     case SIAtomicScope::AGENT:
1901       // Set the L0 and L1 cache policies to MISS_EVICT.
1902       // Note: there is no L2 cache coherent bypass control at the ISA level.
1903       Changed |= enableGLCBit(MI);
1904       Changed |= enableDLCBit(MI);
1905       break;
1906     case SIAtomicScope::WORKGROUP:
1907       // In WGP mode the waves of a work-group can be executing on either CU of
1908       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1909       // CU mode all waves of a work-group are on the same CU, and so the L0
1910       // does not need to be bypassed.
1911       if (!ST.isCuModeEnabled())
1912         Changed |= enableGLCBit(MI);
1913       break;
1914     case SIAtomicScope::WAVEFRONT:
1915     case SIAtomicScope::SINGLETHREAD:
1916       // No cache to bypass.
1917       break;
1918     default:
1919       llvm_unreachable("Unsupported synchronization scope");
1920     }
1921   }
1922 
1923   /// The scratch address space does not need the global memory caches
1924   /// to be bypassed as all memory operations by the same thread are
1925   /// sequentially consistent, and no other thread can access scratch
1926   /// memory.
1927 
1928   /// Other address spaces do not have a cache.
1929 
1930   return Changed;
1931 }
1932 
1933 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1934     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1935     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1936 
1937   // Only handle load and store, not atomic read-modify-write insructions. The
1938   // latter use glc to indicate if the atomic returns a result and so must not
1939   // be used for cache control.
1940   assert(MI->mayLoad() ^ MI->mayStore());
1941 
1942   // Only update load and store, not LLVM IR atomic read-modify-write
1943   // instructions. The latter are always marked as volatile so cannot sensibly
1944   // handle it as do not want to pessimize all atomics. Also they do not support
1945   // the nontemporal attribute.
1946   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1947 
1948   bool Changed = false;
1949 
1950   if (IsVolatile) {
1951     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1952     // and MISS_LRU for store instructions.
1953     // Note: there is no L2 cache coherent bypass control at the ISA level.
1954     if (Op == SIMemOp::LOAD) {
1955       Changed |= enableGLCBit(MI);
1956       Changed |= enableDLCBit(MI);
1957     }
1958 
1959     // Ensure operation has completed at system scope to cause all volatile
1960     // operations to be visible outside the program in a global order. Do not
1961     // request cross address space as only the global address space can be
1962     // observable outside the program, so no need to cause a waitcnt for LDS
1963     // address space operations.
1964     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1965                           Position::AFTER);
1966     return Changed;
1967   }
1968 
1969   if (IsNonTemporal) {
1970     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1971     // and L2 cache policy to STREAM.
1972     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1973     // to MISS_EVICT and the L2 cache policy to STREAM.
1974     if (Op == SIMemOp::STORE)
1975       Changed |= enableGLCBit(MI);
1976     Changed |= enableSLCBit(MI);
1977 
1978     return Changed;
1979   }
1980 
1981   return Changed;
1982 }
1983 
1984 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1985                                      SIAtomicScope Scope,
1986                                      SIAtomicAddrSpace AddrSpace,
1987                                      SIMemOp Op,
1988                                      bool IsCrossAddrSpaceOrdering,
1989                                      Position Pos) const {
1990   bool Changed = false;
1991 
1992   MachineBasicBlock &MBB = *MI->getParent();
1993   DebugLoc DL = MI->getDebugLoc();
1994 
1995   if (Pos == Position::AFTER)
1996     ++MI;
1997 
1998   bool VMCnt = false;
1999   bool VSCnt = false;
2000   bool LGKMCnt = false;
2001 
2002   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2003       SIAtomicAddrSpace::NONE) {
2004     switch (Scope) {
2005     case SIAtomicScope::SYSTEM:
2006     case SIAtomicScope::AGENT:
2007       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2008         VMCnt |= true;
2009       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2010         VSCnt |= true;
2011       break;
2012     case SIAtomicScope::WORKGROUP:
2013       // In WGP mode the waves of a work-group can be executing on either CU of
2014       // the WGP. Therefore need to wait for operations to complete to ensure
2015       // they are visible to waves in the other CU as the L0 is per CU.
2016       // Otherwise in CU mode and all waves of a work-group are on the same CU
2017       // which shares the same L0.
2018       if (!ST.isCuModeEnabled()) {
2019         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2020           VMCnt |= true;
2021         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2022           VSCnt |= true;
2023       }
2024       break;
2025     case SIAtomicScope::WAVEFRONT:
2026     case SIAtomicScope::SINGLETHREAD:
2027       // The L0 cache keeps all memory operations in order for
2028       // work-items in the same wavefront.
2029       break;
2030     default:
2031       llvm_unreachable("Unsupported synchronization scope");
2032     }
2033   }
2034 
2035   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2036     switch (Scope) {
2037     case SIAtomicScope::SYSTEM:
2038     case SIAtomicScope::AGENT:
2039     case SIAtomicScope::WORKGROUP:
2040       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2041       // not needed as LDS operations for all waves are executed in a total
2042       // global ordering as observed by all waves. Required if also
2043       // synchronizing with global/GDS memory as LDS operations could be
2044       // reordered with respect to later global/GDS memory operations of the
2045       // same wave.
2046       LGKMCnt |= IsCrossAddrSpaceOrdering;
2047       break;
2048     case SIAtomicScope::WAVEFRONT:
2049     case SIAtomicScope::SINGLETHREAD:
2050       // The LDS keeps all memory operations in order for
2051       // the same wavefront.
2052       break;
2053     default:
2054       llvm_unreachable("Unsupported synchronization scope");
2055     }
2056   }
2057 
2058   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2059     switch (Scope) {
2060     case SIAtomicScope::SYSTEM:
2061     case SIAtomicScope::AGENT:
2062       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
2063       // is not needed as GDS operations for all waves are executed in a total
2064       // global ordering as observed by all waves. Required if also
2065       // synchronizing with global/LDS memory as GDS operations could be
2066       // reordered with respect to later global/LDS memory operations of the
2067       // same wave.
2068       LGKMCnt |= IsCrossAddrSpaceOrdering;
2069       break;
2070     case SIAtomicScope::WORKGROUP:
2071     case SIAtomicScope::WAVEFRONT:
2072     case SIAtomicScope::SINGLETHREAD:
2073       // The GDS keeps all memory operations in order for
2074       // the same work-group.
2075       break;
2076     default:
2077       llvm_unreachable("Unsupported synchronization scope");
2078     }
2079   }
2080 
2081   if (VMCnt || LGKMCnt) {
2082     unsigned WaitCntImmediate =
2083       AMDGPU::encodeWaitcnt(IV,
2084                             VMCnt ? 0 : getVmcntBitMask(IV),
2085                             getExpcntBitMask(IV),
2086                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2087     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2088         .addImm(WaitCntImmediate);
2089     Changed = true;
2090   }
2091 
2092   if (VSCnt) {
2093     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2094         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2095         .addImm(0);
2096     Changed = true;
2097   }
2098 
2099   if (Pos == Position::AFTER)
2100     --MI;
2101 
2102   return Changed;
2103 }
2104 
2105 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2106                                         SIAtomicScope Scope,
2107                                         SIAtomicAddrSpace AddrSpace,
2108                                         Position Pos) const {
2109   if (!InsertCacheInv)
2110     return false;
2111 
2112   bool Changed = false;
2113 
2114   MachineBasicBlock &MBB = *MI->getParent();
2115   DebugLoc DL = MI->getDebugLoc();
2116 
2117   if (Pos == Position::AFTER)
2118     ++MI;
2119 
2120   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2121     switch (Scope) {
2122     case SIAtomicScope::SYSTEM:
2123     case SIAtomicScope::AGENT:
2124       // The order of invalidates matter here. We must invalidate "outer in"
2125       // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2126       // invalidated.
2127       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2128       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2129       Changed = true;
2130       break;
2131     case SIAtomicScope::WORKGROUP:
2132       // In WGP mode the waves of a work-group can be executing on either CU of
2133       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2134       // in CU mode and all waves of a work-group are on the same CU, and so the
2135       // L0 does not need to be invalidated.
2136       if (!ST.isCuModeEnabled()) {
2137         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2138         Changed = true;
2139       }
2140       break;
2141     case SIAtomicScope::WAVEFRONT:
2142     case SIAtomicScope::SINGLETHREAD:
2143       // No cache to invalidate.
2144       break;
2145     default:
2146       llvm_unreachable("Unsupported synchronization scope");
2147     }
2148   }
2149 
2150   /// The scratch address space does not need the global memory cache
2151   /// to be flushed as all memory operations by the same thread are
2152   /// sequentially consistent, and no other thread can access scratch
2153   /// memory.
2154 
2155   /// Other address spaces do not have a cache.
2156 
2157   if (Pos == Position::AFTER)
2158     --MI;
2159 
2160   return Changed;
2161 }
2162 
2163 bool SIGfx11CacheControl::enableLoadCacheBypass(
2164     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2165     SIAtomicAddrSpace AddrSpace) const {
2166   assert(MI->mayLoad() && !MI->mayStore());
2167   bool Changed = false;
2168 
2169   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2170     switch (Scope) {
2171     case SIAtomicScope::SYSTEM:
2172     case SIAtomicScope::AGENT:
2173       // Set the L0 and L1 cache policies to MISS_EVICT.
2174       // Note: there is no L2 cache coherent bypass control at the ISA level.
2175       Changed |= enableGLCBit(MI);
2176       break;
2177     case SIAtomicScope::WORKGROUP:
2178       // In WGP mode the waves of a work-group can be executing on either CU of
2179       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2180       // CU mode all waves of a work-group are on the same CU, and so the L0
2181       // does not need to be bypassed.
2182       if (!ST.isCuModeEnabled())
2183         Changed |= enableGLCBit(MI);
2184       break;
2185     case SIAtomicScope::WAVEFRONT:
2186     case SIAtomicScope::SINGLETHREAD:
2187       // No cache to bypass.
2188       break;
2189     default:
2190       llvm_unreachable("Unsupported synchronization scope");
2191     }
2192   }
2193 
2194   /// The scratch address space does not need the global memory caches
2195   /// to be bypassed as all memory operations by the same thread are
2196   /// sequentially consistent, and no other thread can access scratch
2197   /// memory.
2198 
2199   /// Other address spaces do not have a cache.
2200 
2201   return Changed;
2202 }
2203 
2204 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2205     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2206     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2207 
2208   // Only handle load and store, not atomic read-modify-write insructions. The
2209   // latter use glc to indicate if the atomic returns a result and so must not
2210   // be used for cache control.
2211   assert(MI->mayLoad() ^ MI->mayStore());
2212 
2213   // Only update load and store, not LLVM IR atomic read-modify-write
2214   // instructions. The latter are always marked as volatile so cannot sensibly
2215   // handle it as do not want to pessimize all atomics. Also they do not support
2216   // the nontemporal attribute.
2217   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2218 
2219   bool Changed = false;
2220 
2221   if (IsVolatile) {
2222     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2223     // and MISS_LRU for store instructions.
2224     // Note: there is no L2 cache coherent bypass control at the ISA level.
2225     if (Op == SIMemOp::LOAD)
2226       Changed |= enableGLCBit(MI);
2227 
2228     // Set MALL NOALLOC for load and store instructions.
2229     Changed |= enableDLCBit(MI);
2230 
2231     // Ensure operation has completed at system scope to cause all volatile
2232     // operations to be visible outside the program in a global order. Do not
2233     // request cross address space as only the global address space can be
2234     // observable outside the program, so no need to cause a waitcnt for LDS
2235     // address space operations.
2236     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2237                           Position::AFTER);
2238     return Changed;
2239   }
2240 
2241   if (IsNonTemporal) {
2242     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2243     // and L2 cache policy to STREAM.
2244     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2245     // to MISS_EVICT and the L2 cache policy to STREAM.
2246     if (Op == SIMemOp::STORE)
2247       Changed |= enableGLCBit(MI);
2248     Changed |= enableSLCBit(MI);
2249 
2250     // Set MALL NOALLOC for load and store instructions.
2251     Changed |= enableDLCBit(MI);
2252     return Changed;
2253   }
2254 
2255   return Changed;
2256 }
2257 
2258 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2259                                 AMDGPU::CPol::CPol Value) const {
2260   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2261   if (!CPol)
2262     return false;
2263 
2264   uint64_t NewTH = Value & AMDGPU::CPol::TH;
2265   if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2266     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2267     return true;
2268   }
2269 
2270   return false;
2271 }
2272 
2273 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2274                                    AMDGPU::CPol::CPol Value) const {
2275   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2276   if (!CPol)
2277     return false;
2278 
2279   uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2280   if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2281     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2282     return true;
2283   }
2284 
2285   return false;
2286 }
2287 
2288 bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2289     const MachineBasicBlock::iterator MI) const {
2290   // TODO: implement flag for frontend to give us a hint not to insert waits.
2291 
2292   MachineBasicBlock &MBB = *MI->getParent();
2293   const DebugLoc &DL = MI->getDebugLoc();
2294 
2295   BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2296   BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2297   BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2298   BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2299   BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2300 
2301   return true;
2302 }
2303 
2304 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2305                                      SIAtomicScope Scope,
2306                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2307                                      bool IsCrossAddrSpaceOrdering,
2308                                      Position Pos) const {
2309   bool Changed = false;
2310 
2311   MachineBasicBlock &MBB = *MI->getParent();
2312   DebugLoc DL = MI->getDebugLoc();
2313 
2314   bool LOADCnt = false;
2315   bool DSCnt = false;
2316   bool STORECnt = false;
2317 
2318   if (Pos == Position::AFTER)
2319     ++MI;
2320 
2321   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2322       SIAtomicAddrSpace::NONE) {
2323     switch (Scope) {
2324     case SIAtomicScope::SYSTEM:
2325     case SIAtomicScope::AGENT:
2326       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2327         LOADCnt |= true;
2328       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2329         STORECnt |= true;
2330       break;
2331     case SIAtomicScope::WORKGROUP:
2332       // In WGP mode the waves of a work-group can be executing on either CU of
2333       // the WGP. Therefore need to wait for operations to complete to ensure
2334       // they are visible to waves in the other CU as the L0 is per CU.
2335       // Otherwise in CU mode and all waves of a work-group are on the same CU
2336       // which shares the same L0.
2337       if (!ST.isCuModeEnabled()) {
2338         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2339           LOADCnt |= true;
2340         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2341           STORECnt |= true;
2342       }
2343       break;
2344     case SIAtomicScope::WAVEFRONT:
2345     case SIAtomicScope::SINGLETHREAD:
2346       // The L0 cache keeps all memory operations in order for
2347       // work-items in the same wavefront.
2348       break;
2349     default:
2350       llvm_unreachable("Unsupported synchronization scope");
2351     }
2352   }
2353 
2354   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2355     switch (Scope) {
2356     case SIAtomicScope::SYSTEM:
2357     case SIAtomicScope::AGENT:
2358     case SIAtomicScope::WORKGROUP:
2359       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2360       // not needed as LDS operations for all waves are executed in a total
2361       // global ordering as observed by all waves. Required if also
2362       // synchronizing with global/GDS memory as LDS operations could be
2363       // reordered with respect to later global/GDS memory operations of the
2364       // same wave.
2365       DSCnt |= IsCrossAddrSpaceOrdering;
2366       break;
2367     case SIAtomicScope::WAVEFRONT:
2368     case SIAtomicScope::SINGLETHREAD:
2369       // The LDS keeps all memory operations in order for
2370       // the same wavefront.
2371       break;
2372     default:
2373       llvm_unreachable("Unsupported synchronization scope");
2374     }
2375   }
2376 
2377   if (LOADCnt) {
2378     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2379     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2380     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2381     Changed = true;
2382   }
2383 
2384   if (STORECnt) {
2385     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2386     Changed = true;
2387   }
2388 
2389   if (DSCnt) {
2390     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2391     Changed = true;
2392   }
2393 
2394   if (Pos == Position::AFTER)
2395     --MI;
2396 
2397   return Changed;
2398 }
2399 
2400 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2401                                         SIAtomicScope Scope,
2402                                         SIAtomicAddrSpace AddrSpace,
2403                                         Position Pos) const {
2404   if (!InsertCacheInv)
2405     return false;
2406 
2407   MachineBasicBlock &MBB = *MI->getParent();
2408   DebugLoc DL = MI->getDebugLoc();
2409 
2410   /// The scratch address space does not need the global memory cache
2411   /// to be flushed as all memory operations by the same thread are
2412   /// sequentially consistent, and no other thread can access scratch
2413   /// memory.
2414 
2415   /// Other address spaces do not have a cache.
2416   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2417     return false;
2418 
2419   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2420   switch (Scope) {
2421   case SIAtomicScope::SYSTEM:
2422     ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2423     break;
2424   case SIAtomicScope::AGENT:
2425     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2426     break;
2427   case SIAtomicScope::WORKGROUP:
2428     // In WGP mode the waves of a work-group can be executing on either CU of
2429     // the WGP. Therefore we need to invalidate the L0 which is per CU.
2430     // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2431     // the L0 does not need to be invalidated.
2432     if (ST.isCuModeEnabled())
2433       return false;
2434 
2435     ScopeImm = AMDGPU::CPol::SCOPE_SE;
2436     break;
2437   case SIAtomicScope::WAVEFRONT:
2438   case SIAtomicScope::SINGLETHREAD:
2439     // No cache to invalidate.
2440     return false;
2441   default:
2442     llvm_unreachable("Unsupported synchronization scope");
2443   }
2444 
2445   if (Pos == Position::AFTER)
2446     ++MI;
2447 
2448   BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2449 
2450   if (Pos == Position::AFTER)
2451     --MI;
2452 
2453   return true;
2454 }
2455 
2456 bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2457                                         SIAtomicScope Scope,
2458                                         SIAtomicAddrSpace AddrSpace,
2459                                         bool IsCrossAddrSpaceOrdering,
2460                                         Position Pos) const {
2461   MachineBasicBlock &MBB = *MI->getParent();
2462   DebugLoc DL = MI->getDebugLoc();
2463 
2464   // The scratch address space does not need the global memory cache
2465   // writeback as all memory operations by the same thread are
2466   // sequentially consistent, and no other thread can access scratch
2467   // memory.
2468 
2469   // Other address spaces do not have a cache.
2470   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2471     return false;
2472 
2473   if (Pos == Position::AFTER)
2474     ++MI;
2475 
2476   // GLOBAL_WB is always needed, even for write-through caches, as it
2477   // additionally ensures all operations have reached the desired cache level.
2478   bool SkipWB = false;
2479   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2480   switch (Scope) {
2481   case SIAtomicScope::SYSTEM:
2482     ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2483     break;
2484   case SIAtomicScope::AGENT:
2485     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2486     break;
2487   case SIAtomicScope::WORKGROUP:
2488     // In WGP mode the waves of a work-group can be executing on either CU of
2489     // the WGP. Therefore we need to ensure all operations have reached L1,
2490     // hence the SCOPE_SE WB.
2491     // For CU mode, we need operations to reach L0, so the wait is enough -
2492     // there are no ways for an operation to report completion without reaching
2493     // at least L0.
2494     if (ST.isCuModeEnabled())
2495       SkipWB = true;
2496     else
2497       ScopeImm = AMDGPU::CPol::SCOPE_SE;
2498     break;
2499   case SIAtomicScope::WAVEFRONT:
2500   case SIAtomicScope::SINGLETHREAD:
2501     // No cache to invalidate.
2502     return false;
2503   default:
2504     llvm_unreachable("Unsupported synchronization scope");
2505   }
2506 
2507   if (!SkipWB)
2508     BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm);
2509 
2510   if (Pos == Position::AFTER)
2511     --MI;
2512 
2513   // We always have to wait for previous memory operations (load/store) to
2514   // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2515   // we of course need to wait for that as well.
2516   insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2517              IsCrossAddrSpaceOrdering, Pos);
2518 
2519   return true;
2520 }
2521 
2522 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2523     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2524     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2525 
2526   // Only handle load and store, not atomic read-modify-write instructions.
2527   assert(MI->mayLoad() ^ MI->mayStore());
2528 
2529   // Only update load and store, not LLVM IR atomic read-modify-write
2530   // instructions. The latter are always marked as volatile so cannot sensibly
2531   // handle it as do not want to pessimize all atomics. Also they do not support
2532   // the nontemporal attribute.
2533   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2534 
2535   bool Changed = false;
2536 
2537   if (IsLastUse) {
2538     // Set last-use hint.
2539     Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2540   } else if (IsNonTemporal) {
2541     // Set non-temporal hint for all cache levels.
2542     Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2543   }
2544 
2545   if (IsVolatile) {
2546     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2547 
2548     if (Op == SIMemOp::STORE)
2549       Changed |= insertWaitsBeforeSystemScopeStore(MI);
2550 
2551     // Ensure operation has completed at system scope to cause all volatile
2552     // operations to be visible outside the program in a global order. Do not
2553     // request cross address space as only the global address space can be
2554     // observable outside the program, so no need to cause a waitcnt for LDS
2555     // address space operations.
2556     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2557                           Position::AFTER);
2558   }
2559 
2560   return Changed;
2561 }
2562 
2563 bool SIGfx12CacheControl::expandSystemScopeStore(
2564     MachineBasicBlock::iterator &MI) const {
2565   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2566   if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2567     return insertWaitsBeforeSystemScopeStore(MI);
2568 
2569   return false;
2570 }
2571 
2572 bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2573                                          SIAtomicScope Scope,
2574                                          SIAtomicAddrSpace AddrSpace) const {
2575   bool Changed = false;
2576 
2577   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2578     switch (Scope) {
2579     case SIAtomicScope::SYSTEM:
2580       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2581       break;
2582     case SIAtomicScope::AGENT:
2583       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2584       break;
2585     case SIAtomicScope::WORKGROUP:
2586       // In workgroup mode, SCOPE_SE is needed as waves can executes on
2587       // different CUs that access different L0s.
2588       if (!ST.isCuModeEnabled())
2589         Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2590       break;
2591     case SIAtomicScope::WAVEFRONT:
2592     case SIAtomicScope::SINGLETHREAD:
2593       // No cache to bypass.
2594       break;
2595     default:
2596       llvm_unreachable("Unsupported synchronization scope");
2597     }
2598   }
2599 
2600   // The scratch address space does not need the global memory caches
2601   // to be bypassed as all memory operations by the same thread are
2602   // sequentially consistent, and no other thread can access scratch
2603   // memory.
2604 
2605   // Other address spaces do not have a cache.
2606 
2607   return Changed;
2608 }
2609 
2610 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2611   if (AtomicPseudoMIs.empty())
2612     return false;
2613 
2614   for (auto &MI : AtomicPseudoMIs)
2615     MI->eraseFromParent();
2616 
2617   AtomicPseudoMIs.clear();
2618   return true;
2619 }
2620 
2621 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2622                                    MachineBasicBlock::iterator &MI) {
2623   assert(MI->mayLoad() && !MI->mayStore());
2624 
2625   bool Changed = false;
2626 
2627   if (MOI.isAtomic()) {
2628     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2629         MOI.getOrdering() == AtomicOrdering::Acquire ||
2630         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2631       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2632                                            MOI.getOrderingAddrSpace());
2633     }
2634 
2635     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2636       Changed |= CC->insertWait(MI, MOI.getScope(),
2637                                 MOI.getOrderingAddrSpace(),
2638                                 SIMemOp::LOAD | SIMemOp::STORE,
2639                                 MOI.getIsCrossAddressSpaceOrdering(),
2640                                 Position::BEFORE);
2641 
2642     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2643         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2644       Changed |= CC->insertWait(MI, MOI.getScope(),
2645                                 MOI.getInstrAddrSpace(),
2646                                 SIMemOp::LOAD,
2647                                 MOI.getIsCrossAddressSpaceOrdering(),
2648                                 Position::AFTER);
2649       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2650                                    MOI.getOrderingAddrSpace(),
2651                                    Position::AFTER);
2652     }
2653 
2654     return Changed;
2655   }
2656 
2657   // Atomic instructions already bypass caches to the scope specified by the
2658   // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2659   // instructions need additional treatment.
2660   Changed |= CC->enableVolatileAndOrNonTemporal(
2661       MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2662       MOI.isNonTemporal(), MOI.isLastUse());
2663 
2664   return Changed;
2665 }
2666 
2667 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2668                                     MachineBasicBlock::iterator &MI) {
2669   assert(!MI->mayLoad() && MI->mayStore());
2670 
2671   bool Changed = false;
2672 
2673   if (MOI.isAtomic()) {
2674     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2675         MOI.getOrdering() == AtomicOrdering::Release ||
2676         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2677       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2678                                             MOI.getOrderingAddrSpace());
2679     }
2680 
2681     if (MOI.getOrdering() == AtomicOrdering::Release ||
2682         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2683       Changed |= CC->insertRelease(MI, MOI.getScope(),
2684                                    MOI.getOrderingAddrSpace(),
2685                                    MOI.getIsCrossAddressSpaceOrdering(),
2686                                    Position::BEFORE);
2687 
2688     return Changed;
2689   }
2690 
2691   // Atomic instructions already bypass caches to the scope specified by the
2692   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2693   // need additional treatment.
2694   Changed |= CC->enableVolatileAndOrNonTemporal(
2695       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2696       MOI.isNonTemporal());
2697 
2698   // GFX12 specific, scope(desired coherence domain in cache hierarchy) is
2699   // instruction field, do not confuse it with atomic scope.
2700   Changed |= CC->expandSystemScopeStore(MI);
2701   return Changed;
2702 }
2703 
2704 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2705                                           MachineBasicBlock::iterator &MI) {
2706   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2707 
2708   AtomicPseudoMIs.push_back(MI);
2709   bool Changed = false;
2710 
2711   // Refine fenced address space based on MMRAs.
2712   //
2713   // TODO: Should we support this MMRA on other atomic operations?
2714   auto OrderingAddrSpace =
2715       getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
2716 
2717   if (MOI.isAtomic()) {
2718     if (MOI.getOrdering() == AtomicOrdering::Acquire)
2719       Changed |= CC->insertWait(
2720           MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2721           MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE);
2722 
2723     if (MOI.getOrdering() == AtomicOrdering::Release ||
2724         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2725         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2726       /// TODO: This relies on a barrier always generating a waitcnt
2727       /// for LDS to ensure it is not reordered with the completion of
2728       /// the proceeding LDS operations. If barrier had a memory
2729       /// ordering and memory scope, then library does not need to
2730       /// generate a fence. Could add support in this file for
2731       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2732       /// adding S_WAITCNT before a S_BARRIER.
2733       Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2734                                    MOI.getIsCrossAddressSpaceOrdering(),
2735                                    Position::BEFORE);
2736 
2737     // TODO: If both release and invalidate are happening they could be combined
2738     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2739     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2740     // track cache invalidate and write back instructions.
2741 
2742     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2743         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2744         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2745       Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2746                                    Position::BEFORE);
2747 
2748     return Changed;
2749   }
2750 
2751   return Changed;
2752 }
2753 
2754 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2755   MachineBasicBlock::iterator &MI) {
2756   assert(MI->mayLoad() && MI->mayStore());
2757 
2758   bool Changed = false;
2759 
2760   if (MOI.isAtomic()) {
2761     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2762         MOI.getOrdering() == AtomicOrdering::Acquire ||
2763         MOI.getOrdering() == AtomicOrdering::Release ||
2764         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2765         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2766       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2767                                           MOI.getInstrAddrSpace());
2768     }
2769 
2770     if (MOI.getOrdering() == AtomicOrdering::Release ||
2771         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2772         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2773         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2774       Changed |= CC->insertRelease(MI, MOI.getScope(),
2775                                    MOI.getOrderingAddrSpace(),
2776                                    MOI.getIsCrossAddressSpaceOrdering(),
2777                                    Position::BEFORE);
2778 
2779     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2780         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2781         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2782         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2783         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2784       Changed |= CC->insertWait(MI, MOI.getScope(),
2785                                 MOI.getInstrAddrSpace(),
2786                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
2787                                                    SIMemOp::STORE,
2788                                 MOI.getIsCrossAddressSpaceOrdering(),
2789                                 Position::AFTER);
2790       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2791                                    MOI.getOrderingAddrSpace(),
2792                                    Position::AFTER);
2793     }
2794 
2795     return Changed;
2796   }
2797 
2798   return Changed;
2799 }
2800 
2801 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2802   bool Changed = false;
2803 
2804   const MachineModuleInfo &MMI =
2805       getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2806 
2807   SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
2808   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2809 
2810   for (auto &MBB : MF) {
2811     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2812 
2813       // Unbundle instructions after the post-RA scheduler.
2814       if (MI->isBundle() && MI->mayLoadOrStore()) {
2815         MachineBasicBlock::instr_iterator II(MI->getIterator());
2816         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2817              I != E && I->isBundledWithPred(); ++I) {
2818           I->unbundleFromPred();
2819           for (MachineOperand &MO : I->operands())
2820             if (MO.isReg())
2821               MO.setIsInternalRead(false);
2822         }
2823 
2824         MI->eraseFromParent();
2825         MI = II->getIterator();
2826       }
2827 
2828       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2829         continue;
2830 
2831       if (const auto &MOI = MOA.getLoadInfo(MI))
2832         Changed |= expandLoad(*MOI, MI);
2833       else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2834         Changed |= expandStore(*MOI, MI);
2835         Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2836       } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2837         Changed |= expandAtomicFence(*MOI, MI);
2838       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2839         Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2840     }
2841   }
2842 
2843   Changed |= removeAtomicPseudoMIs();
2844   return Changed;
2845 }
2846 
2847 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2848 
2849 char SIMemoryLegalizer::ID = 0;
2850 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2851 
2852 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2853   return new SIMemoryLegalizer();
2854 }
2855