xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (revision 53120fbb68952b7d620c2c0e1cf05c5017fc1b27)
1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/Support/AtomicOrdering.h"
25 #include "llvm/TargetParser/TargetParser.h"
26 
27 using namespace llvm;
28 using namespace llvm::AMDGPU;
29 
30 #define DEBUG_TYPE "si-memory-legalizer"
31 #define PASS_NAME "SI Memory Legalizer"
32 
33 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
34     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35     cl::desc("Use this to skip inserting cache invalidating instructions."));
36 
37 namespace {
38 
39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40 
41 /// Memory operation flags. Can be ORed together.
42 enum class SIMemOp {
43   NONE = 0u,
44   LOAD = 1u << 0,
45   STORE = 1u << 1,
46   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47 };
48 
/// Where a newly created instruction is placed relative to an already
/// existing instruction.
enum class Position {
  BEFORE = 0, ///< The new instruction goes ahead of the existing one.
  AFTER = 1   ///< The new instruction goes behind the existing one.
};
55 
/// Atomic synchronization scopes supported by the AMDGPU target.
///
/// The declaration order is significant: the SIMemOpInfo constructor clamps a
/// scope with std::min, so every enumerator must compare greater than the
/// ones listed before it.
enum class SIAtomicScope {
  NONE = 0,     ///< No scope; paired with non-atomic orderings.
  SINGLETHREAD, ///< Single-thread scope.
  WAVEFRONT,    ///< Wavefront scope.
  WORKGROUP,    ///< Workgroup scope.
  AGENT,        ///< Agent scope.
  SYSTEM        ///< System scope.
};
65 
66 /// The distinct address spaces supported by the AMDGPU target for
67 /// atomic memory operation. Can be ORed together.
68 enum class SIAtomicAddrSpace {
69   NONE = 0u,
70   GLOBAL = 1u << 0,
71   LDS = 1u << 1,
72   SCRATCH = 1u << 2,
73   GDS = 1u << 3,
74   OTHER = 1u << 4,
75 
76   /// The address spaces that can be accessed by a FLAT instruction.
77   FLAT = GLOBAL | LDS | SCRATCH,
78 
79   /// The address spaces that support atomic instructions.
80   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81 
82   /// All address spaces.
83   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84 
85   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86 };
87 
88 class SIMemOpInfo final {
89 private:
90 
91   friend class SIMemOpAccess;
92 
93   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98   bool IsCrossAddressSpaceOrdering = false;
99   bool IsVolatile = false;
100   bool IsNonTemporal = false;
101 
102   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
103               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106               bool IsCrossAddressSpaceOrdering = true,
107               AtomicOrdering FailureOrdering =
108                 AtomicOrdering::SequentiallyConsistent,
109               bool IsVolatile = false,
110               bool IsNonTemporal = false)
111     : Ordering(Ordering), FailureOrdering(FailureOrdering),
112       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113       InstrAddrSpace(InstrAddrSpace),
114       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115       IsVolatile(IsVolatile),
116       IsNonTemporal(IsNonTemporal) {
117 
118     if (Ordering == AtomicOrdering::NotAtomic) {
119       assert(Scope == SIAtomicScope::NONE &&
120              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121              !IsCrossAddressSpaceOrdering &&
122              FailureOrdering == AtomicOrdering::NotAtomic);
123       return;
124     }
125 
126     assert(Scope != SIAtomicScope::NONE &&
127            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128                SIAtomicAddrSpace::NONE &&
129            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130                SIAtomicAddrSpace::NONE);
131 
132     // There is also no cross address space ordering if the ordering
133     // address space is the same as the instruction address space and
134     // only contains a single address space.
135     if ((OrderingAddrSpace == InstrAddrSpace) &&
136         isPowerOf2_32(uint32_t(InstrAddrSpace)))
137       this->IsCrossAddressSpaceOrdering = false;
138 
139     // Limit the scope to the maximum supported by the instruction's address
140     // spaces.
141     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142         SIAtomicAddrSpace::NONE) {
143       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144     } else if ((InstrAddrSpace &
145                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146                SIAtomicAddrSpace::NONE) {
147       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148     } else if ((InstrAddrSpace &
149                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152     }
153   }
154 
155 public:
156   /// \returns Atomic synchronization scope of the machine instruction used to
157   /// create this SIMemOpInfo.
158   SIAtomicScope getScope() const {
159     return Scope;
160   }
161 
162   /// \returns Ordering constraint of the machine instruction used to
163   /// create this SIMemOpInfo.
164   AtomicOrdering getOrdering() const {
165     return Ordering;
166   }
167 
168   /// \returns Failure ordering constraint of the machine instruction used to
169   /// create this SIMemOpInfo.
170   AtomicOrdering getFailureOrdering() const {
171     return FailureOrdering;
172   }
173 
174   /// \returns The address spaces be accessed by the machine
175   /// instruction used to create this SIMemOpInfo.
176   SIAtomicAddrSpace getInstrAddrSpace() const {
177     return InstrAddrSpace;
178   }
179 
180   /// \returns The address spaces that must be ordered by the machine
181   /// instruction used to create this SIMemOpInfo.
182   SIAtomicAddrSpace getOrderingAddrSpace() const {
183     return OrderingAddrSpace;
184   }
185 
186   /// \returns Return true iff memory ordering of operations on
187   /// different address spaces is required.
188   bool getIsCrossAddressSpaceOrdering() const {
189     return IsCrossAddressSpaceOrdering;
190   }
191 
192   /// \returns True if memory access of the machine instruction used to
193   /// create this SIMemOpInfo is volatile, false otherwise.
194   bool isVolatile() const {
195     return IsVolatile;
196   }
197 
198   /// \returns True if memory access of the machine instruction used to
199   /// create this SIMemOpInfo is nontemporal, false otherwise.
200   bool isNonTemporal() const {
201     return IsNonTemporal;
202   }
203 
204   /// \returns True if ordering constraint of the machine instruction used to
205   /// create this SIMemOpInfo is unordered or higher, false otherwise.
206   bool isAtomic() const {
207     return Ordering != AtomicOrdering::NotAtomic;
208   }
209 
210 };
211 
212 class SIMemOpAccess final {
213 private:
214   AMDGPUMachineModuleInfo *MMI = nullptr;
215 
216   /// Reports unsupported message \p Msg for \p MI to LLVM context.
217   void reportUnsupported(const MachineBasicBlock::iterator &MI,
218                          const char *Msg) const;
219 
220   /// Inspects the target synchronization scope \p SSID and determines
221   /// the SI atomic scope it corresponds to, the address spaces it
222   /// covers, and whether the memory ordering applies between address
223   /// spaces.
224   std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226 
227   /// \return Return a bit set of the address spaces accessed by \p AS.
228   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229 
230   /// \returns Info constructed from \p MI, which has at least machine memory
231   /// operand.
232   std::optional<SIMemOpInfo>
233   constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
234 
235 public:
236   /// Construct class to support accessing the machine memory operands
237   /// of instructions in the machine function \p MF.
238   SIMemOpAccess(MachineFunction &MF);
239 
240   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
241   std::optional<SIMemOpInfo>
242   getLoadInfo(const MachineBasicBlock::iterator &MI) const;
243 
244   /// \returns Store info if \p MI is a store operation, "std::nullopt"
245   /// otherwise.
246   std::optional<SIMemOpInfo>
247   getStoreInfo(const MachineBasicBlock::iterator &MI) const;
248 
249   /// \returns Atomic fence info if \p MI is an atomic fence operation,
250   /// "std::nullopt" otherwise.
251   std::optional<SIMemOpInfo>
252   getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
253 
254   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
255   /// rmw operation, "std::nullopt" otherwise.
256   std::optional<SIMemOpInfo>
257   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
258 };
259 
260 class SICacheControl {
261 protected:
262 
263   /// AMDGPU subtarget info.
264   const GCNSubtarget &ST;
265 
266   /// Instruction info.
267   const SIInstrInfo *TII = nullptr;
268 
269   IsaVersion IV;
270 
271   /// Whether to insert cache invalidating instructions.
272   bool InsertCacheInv;
273 
274   SICacheControl(const GCNSubtarget &ST);
275 
276   /// Sets named bit \p BitName to "true" if present in instruction \p MI.
277   /// \returns Returns true if \p MI is modified, false otherwise.
278   bool enableNamedBit(const MachineBasicBlock::iterator MI,
279                       AMDGPU::CPol::CPol Bit) const;
280 
281 public:
282 
283   /// Create a cache control for the subtarget \p ST.
284   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
285 
286   /// Update \p MI memory load instruction to bypass any caches up to
287   /// the \p Scope memory scope for address spaces \p
288   /// AddrSpace. Return true iff the instruction was modified.
289   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
290                                      SIAtomicScope Scope,
291                                      SIAtomicAddrSpace AddrSpace) const = 0;
292 
293   /// Update \p MI memory store instruction to bypass any caches up to
294   /// the \p Scope memory scope for address spaces \p
295   /// AddrSpace. Return true iff the instruction was modified.
296   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
297                                       SIAtomicScope Scope,
298                                       SIAtomicAddrSpace AddrSpace) const = 0;
299 
300   /// Update \p MI memory read-modify-write instruction to bypass any caches up
301   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
302   /// iff the instruction was modified.
303   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
304                                     SIAtomicScope Scope,
305                                     SIAtomicAddrSpace AddrSpace) const = 0;
306 
307   /// Update \p MI memory instruction of kind \p Op associated with address
308   /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
309   /// true iff the instruction was modified.
310   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
311                                               SIAtomicAddrSpace AddrSpace,
312                                               SIMemOp Op, bool IsVolatile,
313                                               bool IsNonTemporal) const = 0;
314 
315   /// Inserts any necessary instructions at position \p Pos relative
316   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
317   /// \p Op associated with address spaces \p AddrSpace have completed. Used
318   /// between memory instructions to enforce the order they become visible as
319   /// observed by other memory instructions executing in memory scope \p Scope.
320   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
321   /// address spaces. Returns true iff any instructions inserted.
322   virtual bool insertWait(MachineBasicBlock::iterator &MI,
323                           SIAtomicScope Scope,
324                           SIAtomicAddrSpace AddrSpace,
325                           SIMemOp Op,
326                           bool IsCrossAddrSpaceOrdering,
327                           Position Pos) const = 0;
328 
329   /// Inserts any necessary instructions at position \p Pos relative to
330   /// instruction \p MI to ensure any subsequent memory instructions of this
331   /// thread with address spaces \p AddrSpace will observe the previous memory
332   /// operations by any thread for memory scopes up to memory scope \p Scope .
333   /// Returns true iff any instructions inserted.
334   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
335                              SIAtomicScope Scope,
336                              SIAtomicAddrSpace AddrSpace,
337                              Position Pos) const = 0;
338 
339   /// Inserts any necessary instructions at position \p Pos relative to
340   /// instruction \p MI to ensure previous memory instructions by this thread
341   /// with address spaces \p AddrSpace have completed and can be observed by
342   /// subsequent memory instructions by any thread executing in memory scope \p
343   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
344   /// between address spaces. Returns true iff any instructions inserted.
345   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
346                              SIAtomicScope Scope,
347                              SIAtomicAddrSpace AddrSpace,
348                              bool IsCrossAddrSpaceOrdering,
349                              Position Pos) const = 0;
350 
351   /// Virtual destructor to allow derivations to be deleted.
352   virtual ~SICacheControl() = default;
353 
354   virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
355                                    MachineBasicBlock::iterator &MI) const {
356     return false;
357   }
358 };
359 
360 class SIGfx6CacheControl : public SICacheControl {
361 protected:
362 
363   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
364   /// is modified, false otherwise.
365   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
366     return enableNamedBit(MI, AMDGPU::CPol::GLC);
367   }
368 
369   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
370   /// is modified, false otherwise.
371   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
372     return enableNamedBit(MI, AMDGPU::CPol::SLC);
373   }
374 
375 public:
376 
377   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
378 
379   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
380                              SIAtomicScope Scope,
381                              SIAtomicAddrSpace AddrSpace) const override;
382 
383   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
384                               SIAtomicScope Scope,
385                               SIAtomicAddrSpace AddrSpace) const override;
386 
387   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
388                             SIAtomicScope Scope,
389                             SIAtomicAddrSpace AddrSpace) const override;
390 
391   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
392                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
393                                       bool IsVolatile,
394                                       bool IsNonTemporal) const override;
395 
396   bool insertWait(MachineBasicBlock::iterator &MI,
397                   SIAtomicScope Scope,
398                   SIAtomicAddrSpace AddrSpace,
399                   SIMemOp Op,
400                   bool IsCrossAddrSpaceOrdering,
401                   Position Pos) const override;
402 
403   bool insertAcquire(MachineBasicBlock::iterator &MI,
404                      SIAtomicScope Scope,
405                      SIAtomicAddrSpace AddrSpace,
406                      Position Pos) const override;
407 
408   bool insertRelease(MachineBasicBlock::iterator &MI,
409                      SIAtomicScope Scope,
410                      SIAtomicAddrSpace AddrSpace,
411                      bool IsCrossAddrSpaceOrdering,
412                      Position Pos) const override;
413 };
414 
415 class SIGfx7CacheControl : public SIGfx6CacheControl {
416 public:
417 
418   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
419 
420   bool insertAcquire(MachineBasicBlock::iterator &MI,
421                      SIAtomicScope Scope,
422                      SIAtomicAddrSpace AddrSpace,
423                      Position Pos) const override;
424 
425 };
426 
427 class SIGfx90ACacheControl : public SIGfx7CacheControl {
428 public:
429 
430   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
431 
432   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
433                              SIAtomicScope Scope,
434                              SIAtomicAddrSpace AddrSpace) const override;
435 
436   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
437                               SIAtomicScope Scope,
438                               SIAtomicAddrSpace AddrSpace) const override;
439 
440   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
441                             SIAtomicScope Scope,
442                             SIAtomicAddrSpace AddrSpace) const override;
443 
444   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
445                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
446                                       bool IsVolatile,
447                                       bool IsNonTemporal) const override;
448 
449   bool insertWait(MachineBasicBlock::iterator &MI,
450                   SIAtomicScope Scope,
451                   SIAtomicAddrSpace AddrSpace,
452                   SIMemOp Op,
453                   bool IsCrossAddrSpaceOrdering,
454                   Position Pos) const override;
455 
456   bool insertAcquire(MachineBasicBlock::iterator &MI,
457                      SIAtomicScope Scope,
458                      SIAtomicAddrSpace AddrSpace,
459                      Position Pos) const override;
460 
461   bool insertRelease(MachineBasicBlock::iterator &MI,
462                      SIAtomicScope Scope,
463                      SIAtomicAddrSpace AddrSpace,
464                      bool IsCrossAddrSpaceOrdering,
465                      Position Pos) const override;
466 };
467 
468 class SIGfx940CacheControl : public SIGfx90ACacheControl {
469 protected:
470 
471   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
472   /// is modified, false otherwise.
473   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
474     return enableNamedBit(MI, AMDGPU::CPol::SC0);
475   }
476 
477   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
478   /// is modified, false otherwise.
479   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
480     return enableNamedBit(MI, AMDGPU::CPol::SC1);
481   }
482 
483   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
484   /// is modified, false otherwise.
485   bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
486     return enableNamedBit(MI, AMDGPU::CPol::NT);
487   }
488 
489 public:
490 
491   SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
492 
493   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
494                              SIAtomicScope Scope,
495                              SIAtomicAddrSpace AddrSpace) const override;
496 
497   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
498                               SIAtomicScope Scope,
499                               SIAtomicAddrSpace AddrSpace) const override;
500 
501   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
502                             SIAtomicScope Scope,
503                             SIAtomicAddrSpace AddrSpace) const override;
504 
505   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
506                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
507                                       bool IsVolatile,
508                                       bool IsNonTemporal) const override;
509 
510   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
511                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
512 
513   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
514                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
515                      Position Pos) const override;
516 
517   bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
518                            MachineBasicBlock::iterator &MI) const override {
519     bool Changed = false;
520     if (ST.hasForceStoreSC0SC1() &&
521         (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
522                                     SIAtomicAddrSpace::GLOBAL |
523                                     SIAtomicAddrSpace::OTHER)) !=
524          SIAtomicAddrSpace::NONE) {
525       Changed |= enableSC0Bit(MI);
526       Changed |= enableSC1Bit(MI);
527     }
528     return Changed;
529   }
530 };
531 
532 class SIGfx10CacheControl : public SIGfx7CacheControl {
533 protected:
534 
535   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
536   /// is modified, false otherwise.
537   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
538     return enableNamedBit(MI, AMDGPU::CPol::DLC);
539   }
540 
541 public:
542 
543   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
544 
545   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
546                              SIAtomicScope Scope,
547                              SIAtomicAddrSpace AddrSpace) const override;
548 
549   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
550                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
551                                       bool IsVolatile,
552                                       bool IsNonTemporal) const override;
553 
554   bool insertWait(MachineBasicBlock::iterator &MI,
555                   SIAtomicScope Scope,
556                   SIAtomicAddrSpace AddrSpace,
557                   SIMemOp Op,
558                   bool IsCrossAddrSpaceOrdering,
559                   Position Pos) const override;
560 
561   bool insertAcquire(MachineBasicBlock::iterator &MI,
562                      SIAtomicScope Scope,
563                      SIAtomicAddrSpace AddrSpace,
564                      Position Pos) const override;
565 };
566 
567 class SIGfx11CacheControl : public SIGfx10CacheControl {
568 public:
569   SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
570 
571   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
572                              SIAtomicScope Scope,
573                              SIAtomicAddrSpace AddrSpace) const override;
574 
575   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
576                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
577                                       bool IsVolatile,
578                                       bool IsNonTemporal) const override;
579 };
580 
581 class SIGfx12CacheControl : public SIGfx11CacheControl {
582 protected:
583   // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
584   // \returns Returns true if \p MI is modified, false otherwise.
585   bool setTH(const MachineBasicBlock::iterator MI,
586              AMDGPU::CPol::CPol Value) const;
587   // Sets Scope policy to \p Value if CPol operand is present in instruction \p
588   // MI. \returns Returns true if \p MI is modified, false otherwise.
589   bool setScope(const MachineBasicBlock::iterator MI,
590                 AMDGPU::CPol::CPol Value) const;
591 
592 public:
593   SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
594 
595   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
596                   SIAtomicAddrSpace AddrSpace, SIMemOp Op,
597                   bool IsCrossAddrSpaceOrdering, Position Pos) const override;
598 
599   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
600                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
601 
602   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
603                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
604                                       bool IsVolatile,
605                                       bool IsNonTemporal) const override;
606 };
607 
608 class SIMemoryLegalizer final : public MachineFunctionPass {
609 private:
610 
611   /// Cache Control.
612   std::unique_ptr<SICacheControl> CC = nullptr;
613 
614   /// List of atomic pseudo instructions.
615   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
616 
617   /// Return true iff instruction \p MI is a atomic instruction that
618   /// returns a result.
619   bool isAtomicRet(const MachineInstr &MI) const {
620     return SIInstrInfo::isAtomicRet(MI);
621   }
622 
623   /// Removes all processed atomic pseudo instructions from the current
624   /// function. Returns true if current function is modified, false otherwise.
625   bool removeAtomicPseudoMIs();
626 
627   /// Expands load operation \p MI. Returns true if instructions are
628   /// added/deleted or \p MI is modified, false otherwise.
629   bool expandLoad(const SIMemOpInfo &MOI,
630                   MachineBasicBlock::iterator &MI);
631   /// Expands store operation \p MI. Returns true if instructions are
632   /// added/deleted or \p MI is modified, false otherwise.
633   bool expandStore(const SIMemOpInfo &MOI,
634                    MachineBasicBlock::iterator &MI);
635   /// Expands atomic fence operation \p MI. Returns true if
636   /// instructions are added/deleted or \p MI is modified, false otherwise.
637   bool expandAtomicFence(const SIMemOpInfo &MOI,
638                          MachineBasicBlock::iterator &MI);
639   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
640   /// instructions are added/deleted or \p MI is modified, false otherwise.
641   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
642                                 MachineBasicBlock::iterator &MI);
643 
644 public:
645   static char ID;
646 
647   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
648 
649   void getAnalysisUsage(AnalysisUsage &AU) const override {
650     AU.setPreservesCFG();
651     MachineFunctionPass::getAnalysisUsage(AU);
652   }
653 
654   StringRef getPassName() const override {
655     return PASS_NAME;
656   }
657 
658   bool runOnMachineFunction(MachineFunction &MF) override;
659 };
660 
661 } // end namespace anonymous
662 
663 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
664                                       const char *Msg) const {
665   const Function &Func = MI->getParent()->getParent()->getFunction();
666   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
667   Func.getContext().diagnose(Diag);
668 }
669 
670 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
671 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
672                                SIAtomicAddrSpace InstrAddrSpace) const {
673   if (SSID == SyncScope::System)
674     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
675   if (SSID == MMI->getAgentSSID())
676     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
677   if (SSID == MMI->getWorkgroupSSID())
678     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
679                       true);
680   if (SSID == MMI->getWavefrontSSID())
681     return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
682                       true);
683   if (SSID == SyncScope::SingleThread)
684     return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
685                       true);
686   if (SSID == MMI->getSystemOneAddressSpaceSSID())
687     return std::tuple(SIAtomicScope::SYSTEM,
688                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
689   if (SSID == MMI->getAgentOneAddressSpaceSSID())
690     return std::tuple(SIAtomicScope::AGENT,
691                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
692   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
693     return std::tuple(SIAtomicScope::WORKGROUP,
694                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
695   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
696     return std::tuple(SIAtomicScope::WAVEFRONT,
697                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
698   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
699     return std::tuple(SIAtomicScope::SINGLETHREAD,
700                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
701   return std::nullopt;
702 }
703 
704 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
705   if (AS == AMDGPUAS::FLAT_ADDRESS)
706     return SIAtomicAddrSpace::FLAT;
707   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
708     return SIAtomicAddrSpace::GLOBAL;
709   if (AS == AMDGPUAS::LOCAL_ADDRESS)
710     return SIAtomicAddrSpace::LDS;
711   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
712     return SIAtomicAddrSpace::SCRATCH;
713   if (AS == AMDGPUAS::REGION_ADDRESS)
714     return SIAtomicAddrSpace::GDS;
715 
716   return SIAtomicAddrSpace::OTHER;
717 }
718 
719 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
720   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
721 }
722 
723 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
724     const MachineBasicBlock::iterator &MI) const {
725   assert(MI->getNumMemOperands() > 0);
726 
727   SyncScope::ID SSID = SyncScope::SingleThread;
728   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
729   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
730   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
731   bool IsNonTemporal = true;
732   bool IsVolatile = false;
733 
734   // Validator should check whether or not MMOs cover the entire set of
735   // locations accessed by the memory instruction.
736   for (const auto &MMO : MI->memoperands()) {
737     IsNonTemporal &= MMO->isNonTemporal();
738     IsVolatile |= MMO->isVolatile();
739     InstrAddrSpace |=
740       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
741     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
742     if (OpOrdering != AtomicOrdering::NotAtomic) {
743       const auto &IsSyncScopeInclusion =
744           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
745       if (!IsSyncScopeInclusion) {
746         reportUnsupported(MI,
747           "Unsupported non-inclusive atomic synchronization scope");
748         return std::nullopt;
749       }
750 
751       SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
752       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
753       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
754              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
755       FailureOrdering =
756           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
757     }
758   }
759 
760   SIAtomicScope Scope = SIAtomicScope::NONE;
761   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
762   bool IsCrossAddressSpaceOrdering = false;
763   if (Ordering != AtomicOrdering::NotAtomic) {
764     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
765     if (!ScopeOrNone) {
766       reportUnsupported(MI, "Unsupported atomic synchronization scope");
767       return std::nullopt;
768     }
769     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
770         *ScopeOrNone;
771     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
772         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
773         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
774       reportUnsupported(MI, "Unsupported atomic address space");
775       return std::nullopt;
776     }
777   }
778   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
779                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
780                      IsNonTemporal);
781 }
782 
783 std::optional<SIMemOpInfo>
784 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
785   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
786 
787   if (!(MI->mayLoad() && !MI->mayStore()))
788     return std::nullopt;
789 
790   // Be conservative if there are no memory operands.
791   if (MI->getNumMemOperands() == 0)
792     return SIMemOpInfo();
793 
794   return constructFromMIWithMMO(MI);
795 }
796 
797 std::optional<SIMemOpInfo>
798 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
799   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
800 
801   if (!(!MI->mayLoad() && MI->mayStore()))
802     return std::nullopt;
803 
804   // Be conservative if there are no memory operands.
805   if (MI->getNumMemOperands() == 0)
806     return SIMemOpInfo();
807 
808   return constructFromMIWithMMO(MI);
809 }
810 
811 std::optional<SIMemOpInfo>
812 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
813   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
814 
815   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
816     return std::nullopt;
817 
818   AtomicOrdering Ordering =
819     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
820 
821   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
822   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
823   if (!ScopeOrNone) {
824     reportUnsupported(MI, "Unsupported atomic synchronization scope");
825     return std::nullopt;
826   }
827 
828   SIAtomicScope Scope = SIAtomicScope::NONE;
829   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
830   bool IsCrossAddressSpaceOrdering = false;
831   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
832       *ScopeOrNone;
833 
834   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
835       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
836     reportUnsupported(MI, "Unsupported atomic address space");
837     return std::nullopt;
838   }
839 
840   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
841                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
842 }
843 
844 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
845     const MachineBasicBlock::iterator &MI) const {
846   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
847 
848   if (!(MI->mayLoad() && MI->mayStore()))
849     return std::nullopt;
850 
851   // Be conservative if there are no memory operands.
852   if (MI->getNumMemOperands() == 0)
853     return SIMemOpInfo();
854 
855   return constructFromMIWithMMO(MI);
856 }
857 
858 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
859   TII = ST.getInstrInfo();
860   IV = getIsaVersion(ST.getCPU());
861   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
862 }
863 
864 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
865                                     AMDGPU::CPol::CPol Bit) const {
866   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
867   if (!CPol)
868     return false;
869 
870   CPol->setImm(CPol->getImm() | Bit);
871   return true;
872 }
873 
874 /* static */
875 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
876   GCNSubtarget::Generation Generation = ST.getGeneration();
877   if (ST.hasGFX940Insts())
878     return std::make_unique<SIGfx940CacheControl>(ST);
879   if (ST.hasGFX90AInsts())
880     return std::make_unique<SIGfx90ACacheControl>(ST);
881   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
882     return std::make_unique<SIGfx6CacheControl>(ST);
883   if (Generation < AMDGPUSubtarget::GFX10)
884     return std::make_unique<SIGfx7CacheControl>(ST);
885   if (Generation < AMDGPUSubtarget::GFX11)
886     return std::make_unique<SIGfx10CacheControl>(ST);
887   if (Generation < AMDGPUSubtarget::GFX12)
888     return std::make_unique<SIGfx11CacheControl>(ST);
889   return std::make_unique<SIGfx12CacheControl>(ST);
890 }
891 
892 bool SIGfx6CacheControl::enableLoadCacheBypass(
893     const MachineBasicBlock::iterator &MI,
894     SIAtomicScope Scope,
895     SIAtomicAddrSpace AddrSpace) const {
896   assert(MI->mayLoad() && !MI->mayStore());
897   bool Changed = false;
898 
899   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
900     switch (Scope) {
901     case SIAtomicScope::SYSTEM:
902     case SIAtomicScope::AGENT:
903       // Set L1 cache policy to MISS_EVICT.
904       // Note: there is no L2 cache bypass policy at the ISA level.
905       Changed |= enableGLCBit(MI);
906       break;
907     case SIAtomicScope::WORKGROUP:
908     case SIAtomicScope::WAVEFRONT:
909     case SIAtomicScope::SINGLETHREAD:
910       // No cache to bypass.
911       break;
912     default:
913       llvm_unreachable("Unsupported synchronization scope");
914     }
915   }
916 
917   /// The scratch address space does not need the global memory caches
918   /// to be bypassed as all memory operations by the same thread are
919   /// sequentially consistent, and no other thread can access scratch
920   /// memory.
921 
922   /// Other address spaces do not have a cache.
923 
924   return Changed;
925 }
926 
927 bool SIGfx6CacheControl::enableStoreCacheBypass(
928     const MachineBasicBlock::iterator &MI,
929     SIAtomicScope Scope,
930     SIAtomicAddrSpace AddrSpace) const {
931   assert(!MI->mayLoad() && MI->mayStore());
932   bool Changed = false;
933 
934   /// The L1 cache is write through so does not need to be bypassed. There is no
935   /// bypass control for the L2 cache at the isa level.
936 
937   return Changed;
938 }
939 
940 bool SIGfx6CacheControl::enableRMWCacheBypass(
941     const MachineBasicBlock::iterator &MI,
942     SIAtomicScope Scope,
943     SIAtomicAddrSpace AddrSpace) const {
944   assert(MI->mayLoad() && MI->mayStore());
945   bool Changed = false;
946 
947   /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
948   /// bypassed, and the GLC bit is instead used to indicate if they are
949   /// return or no-return.
950   /// Note: there is no L2 cache coherent bypass control at the ISA level.
951 
952   return Changed;
953 }
954 
955 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
956     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
957     bool IsVolatile, bool IsNonTemporal) const {
958   // Only handle load and store, not atomic read-modify-write insructions. The
959   // latter use glc to indicate if the atomic returns a result and so must not
960   // be used for cache control.
961   assert(MI->mayLoad() ^ MI->mayStore());
962 
963   // Only update load and store, not LLVM IR atomic read-modify-write
964   // instructions. The latter are always marked as volatile so cannot sensibly
965   // handle it as do not want to pessimize all atomics. Also they do not support
966   // the nontemporal attribute.
967   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
968 
969   bool Changed = false;
970 
971   if (IsVolatile) {
972     // Set L1 cache policy to be MISS_EVICT for load instructions
973     // and MISS_LRU for store instructions.
974     // Note: there is no L2 cache bypass policy at the ISA level.
975     if (Op == SIMemOp::LOAD)
976       Changed |= enableGLCBit(MI);
977 
978     // Ensure operation has completed at system scope to cause all volatile
979     // operations to be visible outside the program in a global order. Do not
980     // request cross address space as only the global address space can be
981     // observable outside the program, so no need to cause a waitcnt for LDS
982     // address space operations.
983     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
984                           Position::AFTER);
985 
986     return Changed;
987   }
988 
989   if (IsNonTemporal) {
990     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
991     // for both loads and stores, and the L2 cache policy to STREAM.
992     Changed |= enableGLCBit(MI);
993     Changed |= enableSLCBit(MI);
994     return Changed;
995   }
996 
997   return Changed;
998 }
999 
1000 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1001                                     SIAtomicScope Scope,
1002                                     SIAtomicAddrSpace AddrSpace,
1003                                     SIMemOp Op,
1004                                     bool IsCrossAddrSpaceOrdering,
1005                                     Position Pos) const {
1006   bool Changed = false;
1007 
1008   MachineBasicBlock &MBB = *MI->getParent();
1009   DebugLoc DL = MI->getDebugLoc();
1010 
1011   if (Pos == Position::AFTER)
1012     ++MI;
1013 
1014   bool VMCnt = false;
1015   bool LGKMCnt = false;
1016 
1017   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1018       SIAtomicAddrSpace::NONE) {
1019     switch (Scope) {
1020     case SIAtomicScope::SYSTEM:
1021     case SIAtomicScope::AGENT:
1022       VMCnt |= true;
1023       break;
1024     case SIAtomicScope::WORKGROUP:
1025     case SIAtomicScope::WAVEFRONT:
1026     case SIAtomicScope::SINGLETHREAD:
1027       // The L1 cache keeps all memory operations in order for
1028       // wavefronts in the same work-group.
1029       break;
1030     default:
1031       llvm_unreachable("Unsupported synchronization scope");
1032     }
1033   }
1034 
1035   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1036     switch (Scope) {
1037     case SIAtomicScope::SYSTEM:
1038     case SIAtomicScope::AGENT:
1039     case SIAtomicScope::WORKGROUP:
1040       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1041       // not needed as LDS operations for all waves are executed in a total
1042       // global ordering as observed by all waves. Required if also
1043       // synchronizing with global/GDS memory as LDS operations could be
1044       // reordered with respect to later global/GDS memory operations of the
1045       // same wave.
1046       LGKMCnt |= IsCrossAddrSpaceOrdering;
1047       break;
1048     case SIAtomicScope::WAVEFRONT:
1049     case SIAtomicScope::SINGLETHREAD:
1050       // The LDS keeps all memory operations in order for
1051       // the same wavefront.
1052       break;
1053     default:
1054       llvm_unreachable("Unsupported synchronization scope");
1055     }
1056   }
1057 
1058   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1059     switch (Scope) {
1060     case SIAtomicScope::SYSTEM:
1061     case SIAtomicScope::AGENT:
1062       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1063       // is not needed as GDS operations for all waves are executed in a total
1064       // global ordering as observed by all waves. Required if also
1065       // synchronizing with global/LDS memory as GDS operations could be
1066       // reordered with respect to later global/LDS memory operations of the
1067       // same wave.
1068       LGKMCnt |= IsCrossAddrSpaceOrdering;
1069       break;
1070     case SIAtomicScope::WORKGROUP:
1071     case SIAtomicScope::WAVEFRONT:
1072     case SIAtomicScope::SINGLETHREAD:
1073       // The GDS keeps all memory operations in order for
1074       // the same work-group.
1075       break;
1076     default:
1077       llvm_unreachable("Unsupported synchronization scope");
1078     }
1079   }
1080 
1081   if (VMCnt || LGKMCnt) {
1082     unsigned WaitCntImmediate =
1083       AMDGPU::encodeWaitcnt(IV,
1084                             VMCnt ? 0 : getVmcntBitMask(IV),
1085                             getExpcntBitMask(IV),
1086                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1087     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1088         .addImm(WaitCntImmediate);
1089     Changed = true;
1090   }
1091 
1092   if (Pos == Position::AFTER)
1093     --MI;
1094 
1095   return Changed;
1096 }
1097 
1098 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1099                                        SIAtomicScope Scope,
1100                                        SIAtomicAddrSpace AddrSpace,
1101                                        Position Pos) const {
1102   if (!InsertCacheInv)
1103     return false;
1104 
1105   bool Changed = false;
1106 
1107   MachineBasicBlock &MBB = *MI->getParent();
1108   DebugLoc DL = MI->getDebugLoc();
1109 
1110   if (Pos == Position::AFTER)
1111     ++MI;
1112 
1113   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1114     switch (Scope) {
1115     case SIAtomicScope::SYSTEM:
1116     case SIAtomicScope::AGENT:
1117       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1118       Changed = true;
1119       break;
1120     case SIAtomicScope::WORKGROUP:
1121     case SIAtomicScope::WAVEFRONT:
1122     case SIAtomicScope::SINGLETHREAD:
1123       // No cache to invalidate.
1124       break;
1125     default:
1126       llvm_unreachable("Unsupported synchronization scope");
1127     }
1128   }
1129 
1130   /// The scratch address space does not need the global memory cache
1131   /// to be flushed as all memory operations by the same thread are
1132   /// sequentially consistent, and no other thread can access scratch
1133   /// memory.
1134 
1135   /// Other address spaces do not have a cache.
1136 
1137   if (Pos == Position::AFTER)
1138     --MI;
1139 
1140   return Changed;
1141 }
1142 
1143 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1144                                        SIAtomicScope Scope,
1145                                        SIAtomicAddrSpace AddrSpace,
1146                                        bool IsCrossAddrSpaceOrdering,
1147                                        Position Pos) const {
1148   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1149                     IsCrossAddrSpaceOrdering, Pos);
1150 }
1151 
1152 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1153                                        SIAtomicScope Scope,
1154                                        SIAtomicAddrSpace AddrSpace,
1155                                        Position Pos) const {
1156   if (!InsertCacheInv)
1157     return false;
1158 
1159   bool Changed = false;
1160 
1161   MachineBasicBlock &MBB = *MI->getParent();
1162   DebugLoc DL = MI->getDebugLoc();
1163 
1164   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1165 
1166   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1167                                     ? AMDGPU::BUFFER_WBINVL1
1168                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1169 
1170   if (Pos == Position::AFTER)
1171     ++MI;
1172 
1173   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1174     switch (Scope) {
1175     case SIAtomicScope::SYSTEM:
1176     case SIAtomicScope::AGENT:
1177       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1178       Changed = true;
1179       break;
1180     case SIAtomicScope::WORKGROUP:
1181     case SIAtomicScope::WAVEFRONT:
1182     case SIAtomicScope::SINGLETHREAD:
1183       // No cache to invalidate.
1184       break;
1185     default:
1186       llvm_unreachable("Unsupported synchronization scope");
1187     }
1188   }
1189 
1190   /// The scratch address space does not need the global memory cache
1191   /// to be flushed as all memory operations by the same thread are
1192   /// sequentially consistent, and no other thread can access scratch
1193   /// memory.
1194 
1195   /// Other address spaces do not have a cache.
1196 
1197   if (Pos == Position::AFTER)
1198     --MI;
1199 
1200   return Changed;
1201 }
1202 
1203 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1204     const MachineBasicBlock::iterator &MI,
1205     SIAtomicScope Scope,
1206     SIAtomicAddrSpace AddrSpace) const {
1207   assert(MI->mayLoad() && !MI->mayStore());
1208   bool Changed = false;
1209 
1210   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1211     switch (Scope) {
1212     case SIAtomicScope::SYSTEM:
1213     case SIAtomicScope::AGENT:
1214       // Set the L1 cache policy to MISS_LRU.
1215       // Note: there is no L2 cache bypass policy at the ISA level.
1216       Changed |= enableGLCBit(MI);
1217       break;
1218     case SIAtomicScope::WORKGROUP:
1219       // In threadgroup split mode the waves of a work-group can be executing on
1220       // different CUs. Therefore need to bypass the L1 which is per CU.
1221       // Otherwise in non-threadgroup split mode all waves of a work-group are
1222       // on the same CU, and so the L1 does not need to be bypassed.
1223       if (ST.isTgSplitEnabled())
1224         Changed |= enableGLCBit(MI);
1225       break;
1226     case SIAtomicScope::WAVEFRONT:
1227     case SIAtomicScope::SINGLETHREAD:
1228       // No cache to bypass.
1229       break;
1230     default:
1231       llvm_unreachable("Unsupported synchronization scope");
1232     }
1233   }
1234 
1235   /// The scratch address space does not need the global memory caches
1236   /// to be bypassed as all memory operations by the same thread are
1237   /// sequentially consistent, and no other thread can access scratch
1238   /// memory.
1239 
1240   /// Other address spaces do not have a cache.
1241 
1242   return Changed;
1243 }
1244 
1245 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1246     const MachineBasicBlock::iterator &MI,
1247     SIAtomicScope Scope,
1248     SIAtomicAddrSpace AddrSpace) const {
1249   assert(!MI->mayLoad() && MI->mayStore());
1250   bool Changed = false;
1251 
1252   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1253     switch (Scope) {
1254     case SIAtomicScope::SYSTEM:
1255     case SIAtomicScope::AGENT:
1256       /// Do not set glc for store atomic operations as they implicitly write
1257       /// through the L1 cache.
1258       break;
1259     case SIAtomicScope::WORKGROUP:
1260     case SIAtomicScope::WAVEFRONT:
1261     case SIAtomicScope::SINGLETHREAD:
1262       // No cache to bypass. Store atomics implicitly write through the L1
1263       // cache.
1264       break;
1265     default:
1266       llvm_unreachable("Unsupported synchronization scope");
1267     }
1268   }
1269 
1270   /// The scratch address space does not need the global memory caches
1271   /// to be bypassed as all memory operations by the same thread are
1272   /// sequentially consistent, and no other thread can access scratch
1273   /// memory.
1274 
1275   /// Other address spaces do not have a cache.
1276 
1277   return Changed;
1278 }
1279 
1280 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1281     const MachineBasicBlock::iterator &MI,
1282     SIAtomicScope Scope,
1283     SIAtomicAddrSpace AddrSpace) const {
1284   assert(MI->mayLoad() && MI->mayStore());
1285   bool Changed = false;
1286 
1287   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1288     switch (Scope) {
1289     case SIAtomicScope::SYSTEM:
1290     case SIAtomicScope::AGENT:
1291       /// Do not set glc for RMW atomic operations as they implicitly bypass
1292       /// the L1 cache, and the glc bit is instead used to indicate if they are
1293       /// return or no-return.
1294       break;
1295     case SIAtomicScope::WORKGROUP:
1296     case SIAtomicScope::WAVEFRONT:
1297     case SIAtomicScope::SINGLETHREAD:
1298       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1299       break;
1300     default:
1301       llvm_unreachable("Unsupported synchronization scope");
1302     }
1303   }
1304 
1305   return Changed;
1306 }
1307 
1308 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1309     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1310     bool IsVolatile, bool IsNonTemporal) const {
1311   // Only handle load and store, not atomic read-modify-write insructions. The
1312   // latter use glc to indicate if the atomic returns a result and so must not
1313   // be used for cache control.
1314   assert(MI->mayLoad() ^ MI->mayStore());
1315 
1316   // Only update load and store, not LLVM IR atomic read-modify-write
1317   // instructions. The latter are always marked as volatile so cannot sensibly
1318   // handle it as do not want to pessimize all atomics. Also they do not support
1319   // the nontemporal attribute.
1320   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1321 
1322   bool Changed = false;
1323 
1324   if (IsVolatile) {
1325     // Set L1 cache policy to be MISS_EVICT for load instructions
1326     // and MISS_LRU for store instructions.
1327     // Note: there is no L2 cache bypass policy at the ISA level.
1328     if (Op == SIMemOp::LOAD)
1329       Changed |= enableGLCBit(MI);
1330 
1331     // Ensure operation has completed at system scope to cause all volatile
1332     // operations to be visible outside the program in a global order. Do not
1333     // request cross address space as only the global address space can be
1334     // observable outside the program, so no need to cause a waitcnt for LDS
1335     // address space operations.
1336     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1337                           Position::AFTER);
1338 
1339     return Changed;
1340   }
1341 
1342   if (IsNonTemporal) {
1343     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1344     // for both loads and stores, and the L2 cache policy to STREAM.
1345     Changed |= enableGLCBit(MI);
1346     Changed |= enableSLCBit(MI);
1347     return Changed;
1348   }
1349 
1350   return Changed;
1351 }
1352 
1353 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1354                                       SIAtomicScope Scope,
1355                                       SIAtomicAddrSpace AddrSpace,
1356                                       SIMemOp Op,
1357                                       bool IsCrossAddrSpaceOrdering,
1358                                       Position Pos) const {
1359   if (ST.isTgSplitEnabled()) {
1360     // In threadgroup split mode the waves of a work-group can be executing on
1361     // different CUs. Therefore need to wait for global or GDS memory operations
1362     // to complete to ensure they are visible to waves in the other CUs.
1363     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1364     // the same CU, so no need to wait for global memory as all waves in the
1365     // work-group access the same the L1, nor wait for GDS as access are ordered
1366     // on a CU.
1367     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1368                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1369         (Scope == SIAtomicScope::WORKGROUP)) {
1370       // Same as GFX7 using agent scope.
1371       Scope = SIAtomicScope::AGENT;
1372     }
1373     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1374     // LDS memory operations.
1375     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1376   }
1377   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1378                                         IsCrossAddrSpaceOrdering, Pos);
1379 }
1380 
1381 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1382                                          SIAtomicScope Scope,
1383                                          SIAtomicAddrSpace AddrSpace,
1384                                          Position Pos) const {
1385   if (!InsertCacheInv)
1386     return false;
1387 
1388   bool Changed = false;
1389 
1390   MachineBasicBlock &MBB = *MI->getParent();
1391   DebugLoc DL = MI->getDebugLoc();
1392 
1393   if (Pos == Position::AFTER)
1394     ++MI;
1395 
1396   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1397     switch (Scope) {
1398     case SIAtomicScope::SYSTEM:
1399       // Ensures that following loads will not see stale remote VMEM data or
1400       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1401       // CC will never be stale due to the local memory probes.
1402       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1403       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1404       // hardware does not reorder memory operations by the same wave with
1405       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1406       // remove any cache lines of earlier writes by the same wave and ensures
1407       // later reads by the same wave will refetch the cache lines.
1408       Changed = true;
1409       break;
1410     case SIAtomicScope::AGENT:
1411       // Same as GFX7.
1412       break;
1413     case SIAtomicScope::WORKGROUP:
1414       // In threadgroup split mode the waves of a work-group can be executing on
1415       // different CUs. Therefore need to invalidate the L1 which is per CU.
1416       // Otherwise in non-threadgroup split mode all waves of a work-group are
1417       // on the same CU, and so the L1 does not need to be invalidated.
1418       if (ST.isTgSplitEnabled()) {
1419         // Same as GFX7 using agent scope.
1420         Scope = SIAtomicScope::AGENT;
1421       }
1422       break;
1423     case SIAtomicScope::WAVEFRONT:
1424     case SIAtomicScope::SINGLETHREAD:
1425       // Same as GFX7.
1426       break;
1427     default:
1428       llvm_unreachable("Unsupported synchronization scope");
1429     }
1430   }
1431 
1432   /// The scratch address space does not need the global memory cache
1433   /// to be flushed as all memory operations by the same thread are
1434   /// sequentially consistent, and no other thread can access scratch
1435   /// memory.
1436 
1437   /// Other address spaces do not have a cache.
1438 
1439   if (Pos == Position::AFTER)
1440     --MI;
1441 
1442   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1443 
1444   return Changed;
1445 }
1446 
1447 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1448                                          SIAtomicScope Scope,
1449                                          SIAtomicAddrSpace AddrSpace,
1450                                          bool IsCrossAddrSpaceOrdering,
1451                                          Position Pos) const {
1452   bool Changed = false;
1453 
1454   MachineBasicBlock &MBB = *MI->getParent();
1455   const DebugLoc &DL = MI->getDebugLoc();
1456 
1457   if (Pos == Position::AFTER)
1458     ++MI;
1459 
1460   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1461     switch (Scope) {
1462     case SIAtomicScope::SYSTEM:
1463       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1464       // hardware does not reorder memory operations by the same wave with
1465       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1466       // to initiate writeback of any dirty cache lines of earlier writes by the
1467       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1468       // writeback has completed.
1469       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1470         // Set SC bits to indicate system scope.
1471         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1472       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1473       // vmcnt(0)" needed by the "BUFFER_WBL2".
1474       Changed = true;
1475       break;
1476     case SIAtomicScope::AGENT:
1477     case SIAtomicScope::WORKGROUP:
1478     case SIAtomicScope::WAVEFRONT:
1479     case SIAtomicScope::SINGLETHREAD:
1480       // Same as GFX7.
1481       break;
1482     default:
1483       llvm_unreachable("Unsupported synchronization scope");
1484     }
1485   }
1486 
1487   if (Pos == Position::AFTER)
1488     --MI;
1489 
1490   Changed |=
1491       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1492                                         IsCrossAddrSpaceOrdering, Pos);
1493 
1494   return Changed;
1495 }
1496 
1497 bool SIGfx940CacheControl::enableLoadCacheBypass(
1498     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1499     SIAtomicAddrSpace AddrSpace) const {
1500   assert(MI->mayLoad() && !MI->mayStore());
1501   bool Changed = false;
1502 
1503   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1504     switch (Scope) {
1505     case SIAtomicScope::SYSTEM:
1506       // Set SC bits to indicate system scope.
1507       Changed |= enableSC0Bit(MI);
1508       Changed |= enableSC1Bit(MI);
1509       break;
1510     case SIAtomicScope::AGENT:
1511       // Set SC bits to indicate agent scope.
1512       Changed |= enableSC1Bit(MI);
1513       break;
1514     case SIAtomicScope::WORKGROUP:
1515       // In threadgroup split mode the waves of a work-group can be executing on
1516       // different CUs. Therefore need to bypass the L1 which is per CU.
1517       // Otherwise in non-threadgroup split mode all waves of a work-group are
1518       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1519       // bits to indicate work-group scope will do this automatically.
1520       Changed |= enableSC0Bit(MI);
1521       break;
1522     case SIAtomicScope::WAVEFRONT:
1523     case SIAtomicScope::SINGLETHREAD:
1524       // Leave SC bits unset to indicate wavefront scope.
1525       break;
1526     default:
1527       llvm_unreachable("Unsupported synchronization scope");
1528     }
1529   }
1530 
1531   /// The scratch address space does not need the global memory caches
1532   /// to be bypassed as all memory operations by the same thread are
1533   /// sequentially consistent, and no other thread can access scratch
1534   /// memory.
1535 
1536   /// Other address spaces do not have a cache.
1537 
1538   return Changed;
1539 }
1540 
1541 bool SIGfx940CacheControl::enableStoreCacheBypass(
1542     const MachineBasicBlock::iterator &MI,
1543     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1544   assert(!MI->mayLoad() && MI->mayStore());
1545   bool Changed = false;
1546 
1547   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1548     switch (Scope) {
1549     case SIAtomicScope::SYSTEM:
1550       // Set SC bits to indicate system scope.
1551       Changed |= enableSC0Bit(MI);
1552       Changed |= enableSC1Bit(MI);
1553       break;
1554     case SIAtomicScope::AGENT:
1555       // Set SC bits to indicate agent scope.
1556       Changed |= enableSC1Bit(MI);
1557       break;
1558     case SIAtomicScope::WORKGROUP:
1559       // Set SC bits to indicate workgroup scope.
1560       Changed |= enableSC0Bit(MI);
1561       break;
1562     case SIAtomicScope::WAVEFRONT:
1563     case SIAtomicScope::SINGLETHREAD:
1564       // Leave SC bits unset to indicate wavefront scope.
1565       break;
1566     default:
1567       llvm_unreachable("Unsupported synchronization scope");
1568     }
1569   }
1570 
1571   /// The scratch address space does not need the global memory caches
1572   /// to be bypassed as all memory operations by the same thread are
1573   /// sequentially consistent, and no other thread can access scratch
1574   /// memory.
1575 
1576   /// Other address spaces do not have a cache.
1577 
1578   return Changed;
1579 }
1580 
1581 bool SIGfx940CacheControl::enableRMWCacheBypass(
1582     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1583     SIAtomicAddrSpace AddrSpace) const {
1584   assert(MI->mayLoad() && MI->mayStore());
1585   bool Changed = false;
1586 
1587   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1588     switch (Scope) {
1589     case SIAtomicScope::SYSTEM:
1590       // Set SC1 bit to indicate system scope.
1591       Changed |= enableSC1Bit(MI);
1592       break;
1593     case SIAtomicScope::AGENT:
1594     case SIAtomicScope::WORKGROUP:
1595     case SIAtomicScope::WAVEFRONT:
1596     case SIAtomicScope::SINGLETHREAD:
1597       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1598       // to indicate system or agent scope. The SC0 bit is used to indicate if
1599       // they are return or no-return. Leave SC1 bit unset to indicate agent
1600       // scope.
1601       break;
1602     default:
1603       llvm_unreachable("Unsupported synchronization scope");
1604     }
1605   }
1606 
1607   return Changed;
1608 }
1609 
1610 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1611     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1612     bool IsVolatile, bool IsNonTemporal) const {
1613   // Only handle load and store, not atomic read-modify-write insructions. The
1614   // latter use glc to indicate if the atomic returns a result and so must not
1615   // be used for cache control.
1616   assert(MI->mayLoad() ^ MI->mayStore());
1617 
1618   // Only update load and store, not LLVM IR atomic read-modify-write
1619   // instructions. The latter are always marked as volatile so cannot sensibly
1620   // handle it as do not want to pessimize all atomics. Also they do not support
1621   // the nontemporal attribute.
1622   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1623 
1624   bool Changed = false;
1625 
1626   if (IsVolatile) {
1627     // Set SC bits to indicate system scope.
1628     Changed |= enableSC0Bit(MI);
1629     Changed |= enableSC1Bit(MI);
1630 
1631     // Ensure operation has completed at system scope to cause all volatile
1632     // operations to be visible outside the program in a global order. Do not
1633     // request cross address space as only the global address space can be
1634     // observable outside the program, so no need to cause a waitcnt for LDS
1635     // address space operations.
1636     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1637                           Position::AFTER);
1638 
1639     return Changed;
1640   }
1641 
1642   if (IsNonTemporal) {
1643     Changed |= enableNTBit(MI);
1644     return Changed;
1645   }
1646 
1647   return Changed;
1648 }
1649 
1650 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1651                                          SIAtomicScope Scope,
1652                                          SIAtomicAddrSpace AddrSpace,
1653                                          Position Pos) const {
1654   if (!InsertCacheInv)
1655     return false;
1656 
1657   bool Changed = false;
1658 
1659   MachineBasicBlock &MBB = *MI->getParent();
1660   DebugLoc DL = MI->getDebugLoc();
1661 
1662   if (Pos == Position::AFTER)
1663     ++MI;
1664 
1665   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1666     switch (Scope) {
1667     case SIAtomicScope::SYSTEM:
1668       // Ensures that following loads will not see stale remote VMEM data or
1669       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1670       // CC will never be stale due to the local memory probes.
1671       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1672           // Set SC bits to indicate system scope.
1673           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1674       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1675       // hardware does not reorder memory operations by the same wave with
1676       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1677       // remove any cache lines of earlier writes by the same wave and ensures
1678       // later reads by the same wave will refetch the cache lines.
1679       Changed = true;
1680       break;
1681     case SIAtomicScope::AGENT:
1682       // Ensures that following loads will not see stale remote date or local
1683       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1684       // due to the memory probes.
1685       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1686           // Set SC bits to indicate agent scope.
1687           .addImm(AMDGPU::CPol::SC1);
1688       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1689       // does not reorder memory operations with respect to preceeding buffer
1690       // invalidate. The invalidate is guaranteed to remove any cache lines of
1691       // earlier writes and ensures later writes will refetch the cache lines.
1692       Changed = true;
1693       break;
1694     case SIAtomicScope::WORKGROUP:
1695       // In threadgroup split mode the waves of a work-group can be executing on
1696       // different CUs. Therefore need to invalidate the L1 which is per CU.
1697       // Otherwise in non-threadgroup split mode all waves of a work-group are
1698       // on the same CU, and so the L1 does not need to be invalidated.
1699       if (ST.isTgSplitEnabled()) {
1700         // Ensures L1 is invalidated if in threadgroup split mode. In
1701         // non-threadgroup split mode it is a NOP, but no point generating it in
1702         // that case if know not in that mode.
1703         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1704             // Set SC bits to indicate work-group scope.
1705             .addImm(AMDGPU::CPol::SC0);
1706         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1707         // does not reorder memory operations with respect to preceeding buffer
1708         // invalidate. The invalidate is guaranteed to remove any cache lines of
1709         // earlier writes and ensures later writes will refetch the cache lines.
1710         Changed = true;
1711       }
1712       break;
1713     case SIAtomicScope::WAVEFRONT:
1714     case SIAtomicScope::SINGLETHREAD:
1715       // Could generate "BUFFER_INV" but it would do nothing as there are no
1716       // caches to invalidate.
1717       break;
1718     default:
1719       llvm_unreachable("Unsupported synchronization scope");
1720     }
1721   }
1722 
1723   /// The scratch address space does not need the global memory cache
1724   /// to be flushed as all memory operations by the same thread are
1725   /// sequentially consistent, and no other thread can access scratch
1726   /// memory.
1727 
1728   /// Other address spaces do not have a cache.
1729 
1730   if (Pos == Position::AFTER)
1731     --MI;
1732 
1733   return Changed;
1734 }
1735 
1736 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1737                                          SIAtomicScope Scope,
1738                                          SIAtomicAddrSpace AddrSpace,
1739                                          bool IsCrossAddrSpaceOrdering,
1740                                          Position Pos) const {
1741   bool Changed = false;
1742 
1743   MachineBasicBlock &MBB = *MI->getParent();
1744   DebugLoc DL = MI->getDebugLoc();
1745 
1746   if (Pos == Position::AFTER)
1747     ++MI;
1748 
1749   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1750     switch (Scope) {
1751     case SIAtomicScope::SYSTEM:
1752       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1753       // hardware does not reorder memory operations by the same wave with
1754       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1755       // to initiate writeback of any dirty cache lines of earlier writes by the
1756       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1757       // writeback has completed.
1758       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1759           // Set SC bits to indicate system scope.
1760           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1761       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1762       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1763       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1764       Changed = true;
1765       break;
1766     case SIAtomicScope::AGENT:
1767       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1768           // Set SC bits to indicate agent scope.
1769           .addImm(AMDGPU::CPol::SC1);
1770 
1771       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1772       // SIAtomicScope::AGENT, the following insertWait will generate the
1773       // required "S_WAITCNT vmcnt(0)".
1774       Changed = true;
1775       break;
1776     case SIAtomicScope::WORKGROUP:
1777     case SIAtomicScope::WAVEFRONT:
1778     case SIAtomicScope::SINGLETHREAD:
1779       // Do not generate "BUFFER_WBL2" as there are no caches it would
1780       // writeback, and would require an otherwise unnecessary
1781       // "S_WAITCNT vmcnt(0)".
1782       break;
1783     default:
1784       llvm_unreachable("Unsupported synchronization scope");
1785     }
1786   }
1787 
1788   if (Pos == Position::AFTER)
1789     --MI;
1790 
1791   // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1792   // S_WAITCNT needed.
1793   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1794                         IsCrossAddrSpaceOrdering, Pos);
1795 
1796   return Changed;
1797 }
1798 
1799 bool SIGfx10CacheControl::enableLoadCacheBypass(
1800     const MachineBasicBlock::iterator &MI,
1801     SIAtomicScope Scope,
1802     SIAtomicAddrSpace AddrSpace) const {
1803   assert(MI->mayLoad() && !MI->mayStore());
1804   bool Changed = false;
1805 
1806   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1807     switch (Scope) {
1808     case SIAtomicScope::SYSTEM:
1809     case SIAtomicScope::AGENT:
1810       // Set the L0 and L1 cache policies to MISS_EVICT.
1811       // Note: there is no L2 cache coherent bypass control at the ISA level.
1812       Changed |= enableGLCBit(MI);
1813       Changed |= enableDLCBit(MI);
1814       break;
1815     case SIAtomicScope::WORKGROUP:
1816       // In WGP mode the waves of a work-group can be executing on either CU of
1817       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1818       // CU mode all waves of a work-group are on the same CU, and so the L0
1819       // does not need to be bypassed.
1820       if (!ST.isCuModeEnabled())
1821         Changed |= enableGLCBit(MI);
1822       break;
1823     case SIAtomicScope::WAVEFRONT:
1824     case SIAtomicScope::SINGLETHREAD:
1825       // No cache to bypass.
1826       break;
1827     default:
1828       llvm_unreachable("Unsupported synchronization scope");
1829     }
1830   }
1831 
1832   /// The scratch address space does not need the global memory caches
1833   /// to be bypassed as all memory operations by the same thread are
1834   /// sequentially consistent, and no other thread can access scratch
1835   /// memory.
1836 
1837   /// Other address spaces do not have a cache.
1838 
1839   return Changed;
1840 }
1841 
1842 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1843     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1844     bool IsVolatile, bool IsNonTemporal) const {
1845 
1846   // Only handle load and store, not atomic read-modify-write insructions. The
1847   // latter use glc to indicate if the atomic returns a result and so must not
1848   // be used for cache control.
1849   assert(MI->mayLoad() ^ MI->mayStore());
1850 
1851   // Only update load and store, not LLVM IR atomic read-modify-write
1852   // instructions. The latter are always marked as volatile so cannot sensibly
1853   // handle it as do not want to pessimize all atomics. Also they do not support
1854   // the nontemporal attribute.
1855   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1856 
1857   bool Changed = false;
1858 
1859   if (IsVolatile) {
1860     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1861     // and MISS_LRU for store instructions.
1862     // Note: there is no L2 cache coherent bypass control at the ISA level.
1863     if (Op == SIMemOp::LOAD) {
1864       Changed |= enableGLCBit(MI);
1865       Changed |= enableDLCBit(MI);
1866     }
1867 
1868     // Ensure operation has completed at system scope to cause all volatile
1869     // operations to be visible outside the program in a global order. Do not
1870     // request cross address space as only the global address space can be
1871     // observable outside the program, so no need to cause a waitcnt for LDS
1872     // address space operations.
1873     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1874                           Position::AFTER);
1875     return Changed;
1876   }
1877 
1878   if (IsNonTemporal) {
1879     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1880     // and L2 cache policy to STREAM.
1881     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1882     // to MISS_EVICT and the L2 cache policy to STREAM.
1883     if (Op == SIMemOp::STORE)
1884       Changed |= enableGLCBit(MI);
1885     Changed |= enableSLCBit(MI);
1886 
1887     return Changed;
1888   }
1889 
1890   return Changed;
1891 }
1892 
1893 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1894                                      SIAtomicScope Scope,
1895                                      SIAtomicAddrSpace AddrSpace,
1896                                      SIMemOp Op,
1897                                      bool IsCrossAddrSpaceOrdering,
1898                                      Position Pos) const {
1899   bool Changed = false;
1900 
1901   MachineBasicBlock &MBB = *MI->getParent();
1902   DebugLoc DL = MI->getDebugLoc();
1903 
1904   if (Pos == Position::AFTER)
1905     ++MI;
1906 
1907   bool VMCnt = false;
1908   bool VSCnt = false;
1909   bool LGKMCnt = false;
1910 
1911   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1912       SIAtomicAddrSpace::NONE) {
1913     switch (Scope) {
1914     case SIAtomicScope::SYSTEM:
1915     case SIAtomicScope::AGENT:
1916       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1917         VMCnt |= true;
1918       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1919         VSCnt |= true;
1920       break;
1921     case SIAtomicScope::WORKGROUP:
1922       // In WGP mode the waves of a work-group can be executing on either CU of
1923       // the WGP. Therefore need to wait for operations to complete to ensure
1924       // they are visible to waves in the other CU as the L0 is per CU.
1925       // Otherwise in CU mode and all waves of a work-group are on the same CU
1926       // which shares the same L0.
1927       if (!ST.isCuModeEnabled()) {
1928         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1929           VMCnt |= true;
1930         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1931           VSCnt |= true;
1932       }
1933       break;
1934     case SIAtomicScope::WAVEFRONT:
1935     case SIAtomicScope::SINGLETHREAD:
1936       // The L0 cache keeps all memory operations in order for
1937       // work-items in the same wavefront.
1938       break;
1939     default:
1940       llvm_unreachable("Unsupported synchronization scope");
1941     }
1942   }
1943 
1944   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1945     switch (Scope) {
1946     case SIAtomicScope::SYSTEM:
1947     case SIAtomicScope::AGENT:
1948     case SIAtomicScope::WORKGROUP:
1949       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1950       // not needed as LDS operations for all waves are executed in a total
1951       // global ordering as observed by all waves. Required if also
1952       // synchronizing with global/GDS memory as LDS operations could be
1953       // reordered with respect to later global/GDS memory operations of the
1954       // same wave.
1955       LGKMCnt |= IsCrossAddrSpaceOrdering;
1956       break;
1957     case SIAtomicScope::WAVEFRONT:
1958     case SIAtomicScope::SINGLETHREAD:
1959       // The LDS keeps all memory operations in order for
1960       // the same wavefront.
1961       break;
1962     default:
1963       llvm_unreachable("Unsupported synchronization scope");
1964     }
1965   }
1966 
1967   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1968     switch (Scope) {
1969     case SIAtomicScope::SYSTEM:
1970     case SIAtomicScope::AGENT:
1971       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1972       // is not needed as GDS operations for all waves are executed in a total
1973       // global ordering as observed by all waves. Required if also
1974       // synchronizing with global/LDS memory as GDS operations could be
1975       // reordered with respect to later global/LDS memory operations of the
1976       // same wave.
1977       LGKMCnt |= IsCrossAddrSpaceOrdering;
1978       break;
1979     case SIAtomicScope::WORKGROUP:
1980     case SIAtomicScope::WAVEFRONT:
1981     case SIAtomicScope::SINGLETHREAD:
1982       // The GDS keeps all memory operations in order for
1983       // the same work-group.
1984       break;
1985     default:
1986       llvm_unreachable("Unsupported synchronization scope");
1987     }
1988   }
1989 
1990   if (VMCnt || LGKMCnt) {
1991     unsigned WaitCntImmediate =
1992       AMDGPU::encodeWaitcnt(IV,
1993                             VMCnt ? 0 : getVmcntBitMask(IV),
1994                             getExpcntBitMask(IV),
1995                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1996     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1997         .addImm(WaitCntImmediate);
1998     Changed = true;
1999   }
2000 
2001   if (VSCnt) {
2002     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2003         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2004         .addImm(0);
2005     Changed = true;
2006   }
2007 
2008   if (Pos == Position::AFTER)
2009     --MI;
2010 
2011   return Changed;
2012 }
2013 
2014 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2015                                         SIAtomicScope Scope,
2016                                         SIAtomicAddrSpace AddrSpace,
2017                                         Position Pos) const {
2018   if (!InsertCacheInv)
2019     return false;
2020 
2021   bool Changed = false;
2022 
2023   MachineBasicBlock &MBB = *MI->getParent();
2024   DebugLoc DL = MI->getDebugLoc();
2025 
2026   if (Pos == Position::AFTER)
2027     ++MI;
2028 
2029   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2030     switch (Scope) {
2031     case SIAtomicScope::SYSTEM:
2032     case SIAtomicScope::AGENT:
2033       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2034       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2035       Changed = true;
2036       break;
2037     case SIAtomicScope::WORKGROUP:
2038       // In WGP mode the waves of a work-group can be executing on either CU of
2039       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2040       // in CU mode and all waves of a work-group are on the same CU, and so the
2041       // L0 does not need to be invalidated.
2042       if (!ST.isCuModeEnabled()) {
2043         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2044         Changed = true;
2045       }
2046       break;
2047     case SIAtomicScope::WAVEFRONT:
2048     case SIAtomicScope::SINGLETHREAD:
2049       // No cache to invalidate.
2050       break;
2051     default:
2052       llvm_unreachable("Unsupported synchronization scope");
2053     }
2054   }
2055 
2056   /// The scratch address space does not need the global memory cache
2057   /// to be flushed as all memory operations by the same thread are
2058   /// sequentially consistent, and no other thread can access scratch
2059   /// memory.
2060 
2061   /// Other address spaces do not have a cache.
2062 
2063   if (Pos == Position::AFTER)
2064     --MI;
2065 
2066   return Changed;
2067 }
2068 
2069 bool SIGfx11CacheControl::enableLoadCacheBypass(
2070     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2071     SIAtomicAddrSpace AddrSpace) const {
2072   assert(MI->mayLoad() && !MI->mayStore());
2073   bool Changed = false;
2074 
2075   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2076     switch (Scope) {
2077     case SIAtomicScope::SYSTEM:
2078     case SIAtomicScope::AGENT:
2079       // Set the L0 and L1 cache policies to MISS_EVICT.
2080       // Note: there is no L2 cache coherent bypass control at the ISA level.
2081       Changed |= enableGLCBit(MI);
2082       break;
2083     case SIAtomicScope::WORKGROUP:
2084       // In WGP mode the waves of a work-group can be executing on either CU of
2085       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2086       // CU mode all waves of a work-group are on the same CU, and so the L0
2087       // does not need to be bypassed.
2088       if (!ST.isCuModeEnabled())
2089         Changed |= enableGLCBit(MI);
2090       break;
2091     case SIAtomicScope::WAVEFRONT:
2092     case SIAtomicScope::SINGLETHREAD:
2093       // No cache to bypass.
2094       break;
2095     default:
2096       llvm_unreachable("Unsupported synchronization scope");
2097     }
2098   }
2099 
2100   /// The scratch address space does not need the global memory caches
2101   /// to be bypassed as all memory operations by the same thread are
2102   /// sequentially consistent, and no other thread can access scratch
2103   /// memory.
2104 
2105   /// Other address spaces do not have a cache.
2106 
2107   return Changed;
2108 }
2109 
2110 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2111     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2112     bool IsVolatile, bool IsNonTemporal) const {
2113 
2114   // Only handle load and store, not atomic read-modify-write insructions. The
2115   // latter use glc to indicate if the atomic returns a result and so must not
2116   // be used for cache control.
2117   assert(MI->mayLoad() ^ MI->mayStore());
2118 
2119   // Only update load and store, not LLVM IR atomic read-modify-write
2120   // instructions. The latter are always marked as volatile so cannot sensibly
2121   // handle it as do not want to pessimize all atomics. Also they do not support
2122   // the nontemporal attribute.
2123   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2124 
2125   bool Changed = false;
2126 
2127   if (IsVolatile) {
2128     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2129     // and MISS_LRU for store instructions.
2130     // Note: there is no L2 cache coherent bypass control at the ISA level.
2131     if (Op == SIMemOp::LOAD)
2132       Changed |= enableGLCBit(MI);
2133 
2134     // Set MALL NOALLOC for load and store instructions.
2135     Changed |= enableDLCBit(MI);
2136 
2137     // Ensure operation has completed at system scope to cause all volatile
2138     // operations to be visible outside the program in a global order. Do not
2139     // request cross address space as only the global address space can be
2140     // observable outside the program, so no need to cause a waitcnt for LDS
2141     // address space operations.
2142     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2143                           Position::AFTER);
2144     return Changed;
2145   }
2146 
2147   if (IsNonTemporal) {
2148     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2149     // and L2 cache policy to STREAM.
2150     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2151     // to MISS_EVICT and the L2 cache policy to STREAM.
2152     if (Op == SIMemOp::STORE)
2153       Changed |= enableGLCBit(MI);
2154     Changed |= enableSLCBit(MI);
2155 
2156     // Set MALL NOALLOC for load and store instructions.
2157     Changed |= enableDLCBit(MI);
2158     return Changed;
2159   }
2160 
2161   return Changed;
2162 }
2163 
2164 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2165                                 AMDGPU::CPol::CPol Value) const {
2166   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2167   if (!CPol)
2168     return false;
2169 
2170   uint64_t NewTH = Value & AMDGPU::CPol::TH;
2171   if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2172     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2173     return true;
2174   }
2175 
2176   return false;
2177 }
2178 
2179 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2180                                    AMDGPU::CPol::CPol Value) const {
2181   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2182   if (!CPol)
2183     return false;
2184 
2185   uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2186   if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2187     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2188     return true;
2189   }
2190 
2191   return false;
2192 }
2193 
2194 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2195                                      SIAtomicScope Scope,
2196                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2197                                      bool IsCrossAddrSpaceOrdering,
2198                                      Position Pos) const {
2199   bool Changed = false;
2200 
2201   MachineBasicBlock &MBB = *MI->getParent();
2202   DebugLoc DL = MI->getDebugLoc();
2203 
2204   bool LOADCnt = false;
2205   bool DSCnt = false;
2206   bool STORECnt = false;
2207 
2208   if (Pos == Position::AFTER)
2209     ++MI;
2210 
2211   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2212       SIAtomicAddrSpace::NONE) {
2213     switch (Scope) {
2214     case SIAtomicScope::SYSTEM:
2215     case SIAtomicScope::AGENT:
2216       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2217         LOADCnt |= true;
2218       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2219         STORECnt |= true;
2220       break;
2221     case SIAtomicScope::WORKGROUP:
2222       // In WGP mode the waves of a work-group can be executing on either CU of
2223       // the WGP. Therefore need to wait for operations to complete to ensure
2224       // they are visible to waves in the other CU as the L0 is per CU.
2225       // Otherwise in CU mode and all waves of a work-group are on the same CU
2226       // which shares the same L0.
2227       if (!ST.isCuModeEnabled()) {
2228         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2229           LOADCnt |= true;
2230         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2231           STORECnt |= true;
2232       }
2233       break;
2234     case SIAtomicScope::WAVEFRONT:
2235     case SIAtomicScope::SINGLETHREAD:
2236       // The L0 cache keeps all memory operations in order for
2237       // work-items in the same wavefront.
2238       break;
2239     default:
2240       llvm_unreachable("Unsupported synchronization scope");
2241     }
2242   }
2243 
2244   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2245     switch (Scope) {
2246     case SIAtomicScope::SYSTEM:
2247     case SIAtomicScope::AGENT:
2248     case SIAtomicScope::WORKGROUP:
2249       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2250       // not needed as LDS operations for all waves are executed in a total
2251       // global ordering as observed by all waves. Required if also
2252       // synchronizing with global/GDS memory as LDS operations could be
2253       // reordered with respect to later global/GDS memory operations of the
2254       // same wave.
2255       DSCnt |= IsCrossAddrSpaceOrdering;
2256       break;
2257     case SIAtomicScope::WAVEFRONT:
2258     case SIAtomicScope::SINGLETHREAD:
2259       // The LDS keeps all memory operations in order for
2260       // the same wavefront.
2261       break;
2262     default:
2263       llvm_unreachable("Unsupported synchronization scope");
2264     }
2265   }
2266 
2267   if (LOADCnt) {
2268     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2269     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2270     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2271     Changed = true;
2272   }
2273 
2274   if (STORECnt) {
2275     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2276     Changed = true;
2277   }
2278 
2279   if (DSCnt) {
2280     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2281     Changed = true;
2282   }
2283 
2284   if (Pos == Position::AFTER)
2285     --MI;
2286 
2287   return Changed;
2288 }
2289 
2290 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2291                                         SIAtomicScope Scope,
2292                                         SIAtomicAddrSpace AddrSpace,
2293                                         Position Pos) const {
2294   if (!InsertCacheInv)
2295     return false;
2296 
2297   MachineBasicBlock &MBB = *MI->getParent();
2298   DebugLoc DL = MI->getDebugLoc();
2299 
2300   /// The scratch address space does not need the global memory cache
2301   /// to be flushed as all memory operations by the same thread are
2302   /// sequentially consistent, and no other thread can access scratch
2303   /// memory.
2304 
2305   /// Other address spaces do not have a cache.
2306   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2307     return false;
2308 
2309   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2310   switch (Scope) {
2311   case SIAtomicScope::SYSTEM:
2312     ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2313     break;
2314   case SIAtomicScope::AGENT:
2315     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2316     break;
2317   case SIAtomicScope::WORKGROUP:
2318     // In WGP mode the waves of a work-group can be executing on either CU of
2319     // the WGP. Therefore we need to invalidate the L0 which is per CU.
2320     // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2321     // the L0 does not need to be invalidated.
2322     if (ST.isCuModeEnabled())
2323       return false;
2324 
2325     ScopeImm = AMDGPU::CPol::SCOPE_SE;
2326     break;
2327   case SIAtomicScope::WAVEFRONT:
2328   case SIAtomicScope::SINGLETHREAD:
2329     // No cache to invalidate.
2330     return false;
2331   default:
2332     llvm_unreachable("Unsupported synchronization scope");
2333   }
2334 
2335   if (Pos == Position::AFTER)
2336     ++MI;
2337 
2338   BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2339 
2340   if (Pos == Position::AFTER)
2341     --MI;
2342 
2343   return true;
2344 }
2345 
2346 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2347     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2348     bool IsVolatile, bool IsNonTemporal) const {
2349 
2350   // Only handle load and store, not atomic read-modify-write instructions.
2351   assert(MI->mayLoad() ^ MI->mayStore());
2352 
2353   // Only update load and store, not LLVM IR atomic read-modify-write
2354   // instructions. The latter are always marked as volatile so cannot sensibly
2355   // handle it as do not want to pessimize all atomics. Also they do not support
2356   // the nontemporal attribute.
2357   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2358 
2359   bool Changed = false;
2360 
2361   if (IsNonTemporal) {
2362     // Set non-temporal hint for all cache levels.
2363     Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2364   }
2365 
2366   if (IsVolatile) {
2367     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2368 
2369     // Ensure operation has completed at system scope to cause all volatile
2370     // operations to be visible outside the program in a global order. Do not
2371     // request cross address space as only the global address space can be
2372     // observable outside the program, so no need to cause a waitcnt for LDS
2373     // address space operations.
2374     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2375                           Position::AFTER);
2376   }
2377 
2378   return Changed;
2379 }
2380 
2381 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2382   if (AtomicPseudoMIs.empty())
2383     return false;
2384 
2385   for (auto &MI : AtomicPseudoMIs)
2386     MI->eraseFromParent();
2387 
2388   AtomicPseudoMIs.clear();
2389   return true;
2390 }
2391 
2392 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2393                                    MachineBasicBlock::iterator &MI) {
2394   assert(MI->mayLoad() && !MI->mayStore());
2395 
2396   bool Changed = false;
2397 
2398   if (MOI.isAtomic()) {
2399     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2400         MOI.getOrdering() == AtomicOrdering::Acquire ||
2401         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2402       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2403                                            MOI.getOrderingAddrSpace());
2404     }
2405 
2406     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2407       Changed |= CC->insertWait(MI, MOI.getScope(),
2408                                 MOI.getOrderingAddrSpace(),
2409                                 SIMemOp::LOAD | SIMemOp::STORE,
2410                                 MOI.getIsCrossAddressSpaceOrdering(),
2411                                 Position::BEFORE);
2412 
2413     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2414         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2415       Changed |= CC->insertWait(MI, MOI.getScope(),
2416                                 MOI.getInstrAddrSpace(),
2417                                 SIMemOp::LOAD,
2418                                 MOI.getIsCrossAddressSpaceOrdering(),
2419                                 Position::AFTER);
2420       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2421                                    MOI.getOrderingAddrSpace(),
2422                                    Position::AFTER);
2423     }
2424 
2425     return Changed;
2426   }
2427 
2428   // Atomic instructions already bypass caches to the scope specified by the
2429   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2430   // need additional treatment.
2431   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2432                                                 SIMemOp::LOAD, MOI.isVolatile(),
2433                                                 MOI.isNonTemporal());
2434   return Changed;
2435 }
2436 
2437 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2438                                     MachineBasicBlock::iterator &MI) {
2439   assert(!MI->mayLoad() && MI->mayStore());
2440 
2441   bool Changed = false;
2442 
2443   if (MOI.isAtomic()) {
2444     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2445         MOI.getOrdering() == AtomicOrdering::Release ||
2446         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2447       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2448                                             MOI.getOrderingAddrSpace());
2449     }
2450 
2451     if (MOI.getOrdering() == AtomicOrdering::Release ||
2452         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2453       Changed |= CC->insertRelease(MI, MOI.getScope(),
2454                                    MOI.getOrderingAddrSpace(),
2455                                    MOI.getIsCrossAddressSpaceOrdering(),
2456                                    Position::BEFORE);
2457 
2458     return Changed;
2459   }
2460 
2461   // Atomic instructions already bypass caches to the scope specified by the
2462   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2463   // need additional treatment.
2464   Changed |= CC->enableVolatileAndOrNonTemporal(
2465       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2466       MOI.isNonTemporal());
2467   return Changed;
2468 }
2469 
2470 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2471                                           MachineBasicBlock::iterator &MI) {
2472   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2473 
2474   AtomicPseudoMIs.push_back(MI);
2475   bool Changed = false;
2476 
2477   if (MOI.isAtomic()) {
2478     if (MOI.getOrdering() == AtomicOrdering::Acquire)
2479       Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2480                                 SIMemOp::LOAD | SIMemOp::STORE,
2481                                 MOI.getIsCrossAddressSpaceOrdering(),
2482                                 Position::BEFORE);
2483 
2484     if (MOI.getOrdering() == AtomicOrdering::Release ||
2485         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2486         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2487       /// TODO: This relies on a barrier always generating a waitcnt
2488       /// for LDS to ensure it is not reordered with the completion of
2489       /// the proceeding LDS operations. If barrier had a memory
2490       /// ordering and memory scope, then library does not need to
2491       /// generate a fence. Could add support in this file for
2492       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2493       /// adding S_WAITCNT before a S_BARRIER.
2494       Changed |= CC->insertRelease(MI, MOI.getScope(),
2495                                    MOI.getOrderingAddrSpace(),
2496                                    MOI.getIsCrossAddressSpaceOrdering(),
2497                                    Position::BEFORE);
2498 
2499     // TODO: If both release and invalidate are happening they could be combined
2500     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2501     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2502     // track cache invalidate and write back instructions.
2503 
2504     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2505         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2506         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2507       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2508                                    MOI.getOrderingAddrSpace(),
2509                                    Position::BEFORE);
2510 
2511     return Changed;
2512   }
2513 
2514   return Changed;
2515 }
2516 
2517 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2518   MachineBasicBlock::iterator &MI) {
2519   assert(MI->mayLoad() && MI->mayStore());
2520 
2521   bool Changed = false;
2522 
2523   if (MOI.isAtomic()) {
2524     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2525         MOI.getOrdering() == AtomicOrdering::Acquire ||
2526         MOI.getOrdering() == AtomicOrdering::Release ||
2527         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2528         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2529       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2530                                           MOI.getInstrAddrSpace());
2531     }
2532 
2533     if (MOI.getOrdering() == AtomicOrdering::Release ||
2534         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2535         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2536         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2537       Changed |= CC->insertRelease(MI, MOI.getScope(),
2538                                    MOI.getOrderingAddrSpace(),
2539                                    MOI.getIsCrossAddressSpaceOrdering(),
2540                                    Position::BEFORE);
2541 
2542     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2543         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2544         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2545         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2546         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2547       Changed |= CC->insertWait(MI, MOI.getScope(),
2548                                 MOI.getInstrAddrSpace(),
2549                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
2550                                                    SIMemOp::STORE,
2551                                 MOI.getIsCrossAddressSpaceOrdering(),
2552                                 Position::AFTER);
2553       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2554                                    MOI.getOrderingAddrSpace(),
2555                                    Position::AFTER);
2556     }
2557 
2558     return Changed;
2559   }
2560 
2561   return Changed;
2562 }
2563 
2564 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2565   bool Changed = false;
2566 
2567   SIMemOpAccess MOA(MF);
2568   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2569 
2570   for (auto &MBB : MF) {
2571     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2572 
2573       // Unbundle instructions after the post-RA scheduler.
2574       if (MI->isBundle() && MI->mayLoadOrStore()) {
2575         MachineBasicBlock::instr_iterator II(MI->getIterator());
2576         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2577              I != E && I->isBundledWithPred(); ++I) {
2578           I->unbundleFromPred();
2579           for (MachineOperand &MO : I->operands())
2580             if (MO.isReg())
2581               MO.setIsInternalRead(false);
2582         }
2583 
2584         MI->eraseFromParent();
2585         MI = II->getIterator();
2586       }
2587 
2588       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2589         continue;
2590 
2591       if (const auto &MOI = MOA.getLoadInfo(MI))
2592         Changed |= expandLoad(*MOI, MI);
2593       else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2594         Changed |= expandStore(*MOI, MI);
2595         Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2596       } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2597         Changed |= expandAtomicFence(*MOI, MI);
2598       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2599         Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2600     }
2601   }
2602 
2603   Changed |= removeAtomicPseudoMIs();
2604   return Changed;
2605 }
2606 
2607 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2608 
2609 char SIMemoryLegalizer::ID = 0;
2610 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2611 
2612 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2613   return new SIMemoryLegalizer();
2614 }
2615