xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (revision 1db9f3b21e39176dd5b67cf8ac378633b172463e)
1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements the AMDGPU memory model. More information
11 /// can be found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/Support/AtomicOrdering.h"
25 #include "llvm/TargetParser/TargetParser.h"
26 
27 using namespace llvm;
28 using namespace llvm::AMDGPU;
29 
30 #define DEBUG_TYPE "si-memory-legalizer"
31 #define PASS_NAME "SI Memory Legalizer"
32 
33 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
34     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35     cl::desc("Use this to skip inserting cache invalidating instructions."));
36 
37 namespace {
38 
39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40 
41 /// Memory operation flags. Can be ORed together.
42 enum class SIMemOp {
43   NONE = 0u,
44   LOAD = 1u << 0,
45   STORE = 1u << 1,
46   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47 };
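// Illustrative note (not part of the pass logic): LLVM_MARK_AS_BITMASK_ENUM
// lets SIMemOp values be combined and tested as flags, the pattern used
// throughout this file, e.g.:
//
//   SIMemOp Op = SIMemOp::LOAD | SIMemOp::STORE; // order both loads and stores
//   if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) {
//     // the operation kind covers loads
//   }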
48 
49 /// Position to insert a new instruction relative to an existing
50 /// instruction.
51 enum class Position {
52   BEFORE,
53   AFTER
54 };
55 
56 /// The atomic synchronization scopes supported by the AMDGPU target.
57 enum class SIAtomicScope {
58   NONE,
59   SINGLETHREAD,
60   WAVEFRONT,
61   WORKGROUP,
62   AGENT,
63   SYSTEM
64 };
65 
66 /// The distinct address spaces supported by the AMDGPU target for
67 /// atomic memory operations. Can be ORed together.
68 enum class SIAtomicAddrSpace {
69   NONE = 0u,
70   GLOBAL = 1u << 0,
71   LDS = 1u << 1,
72   SCRATCH = 1u << 2,
73   GDS = 1u << 3,
74   OTHER = 1u << 4,
75 
76   /// The address spaces that can be accessed by a FLAT instruction.
77   FLAT = GLOBAL | LDS | SCRATCH,
78 
79   /// The address spaces that support atomic instructions.
80   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81 
82   /// All address spaces.
83   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84 
85   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86 };
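// Illustrative note (not part of the pass logic): membership in an
// address-space set is tested by masking against NONE, e.g.:
//
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::FLAT; // GLOBAL | LDS | SCRATCH
//   bool TouchesLDS = (AS & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE;
//   // TouchesLDS is true; (AS & SIAtomicAddrSpace::GDS) is NONE.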
87 
88 class SIMemOpInfo final {
89 private:
90 
91   friend class SIMemOpAccess;
92 
93   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98   bool IsCrossAddressSpaceOrdering = false;
99   bool IsVolatile = false;
100   bool IsNonTemporal = false;
101 
102   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
103               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106               bool IsCrossAddressSpaceOrdering = true,
107               AtomicOrdering FailureOrdering =
108                 AtomicOrdering::SequentiallyConsistent,
109               bool IsVolatile = false,
110               bool IsNonTemporal = false)
111     : Ordering(Ordering), FailureOrdering(FailureOrdering),
112       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113       InstrAddrSpace(InstrAddrSpace),
114       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115       IsVolatile(IsVolatile),
116       IsNonTemporal(IsNonTemporal) {
117 
118     if (Ordering == AtomicOrdering::NotAtomic) {
119       assert(Scope == SIAtomicScope::NONE &&
120              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121              !IsCrossAddressSpaceOrdering &&
122              FailureOrdering == AtomicOrdering::NotAtomic);
123       return;
124     }
125 
126     assert(Scope != SIAtomicScope::NONE &&
127            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128                SIAtomicAddrSpace::NONE &&
129            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130                SIAtomicAddrSpace::NONE);
131 
132     // There is also no cross address space ordering if the ordering
133     // address space is the same as the instruction address space and
134     // only contains a single address space.
135     if ((OrderingAddrSpace == InstrAddrSpace) &&
136         isPowerOf2_32(uint32_t(InstrAddrSpace)))
137       this->IsCrossAddressSpaceOrdering = false;
138 
139     // Limit the scope to the maximum supported by the instruction's address
140     // spaces.
141     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142         SIAtomicAddrSpace::NONE) {
143       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144     } else if ((InstrAddrSpace &
145                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146                SIAtomicAddrSpace::NONE) {
147       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148     } else if ((InstrAddrSpace &
149                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152     }
153   }
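  // Worked example (illustrative only, values assumed): for an LDS-only atomic
  // requested at agent scope,
  //
  //   SIMemOpInfo Info(AtomicOrdering::SequentiallyConsistent,
  //                    SIAtomicScope::AGENT,
  //                    /*OrderingAddrSpace=*/SIAtomicAddrSpace::LDS,
  //                    /*InstrAddrSpace=*/SIAtomicAddrSpace::LDS);
  //
  // the constructor clamps the scope to WORKGROUP (LDS is only shared within a
  // work-group) and clears IsCrossAddressSpaceOrdering because the ordering
  // address space equals the single instruction address space.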
154 
155 public:
156   /// \returns Atomic synchronization scope of the machine instruction used to
157   /// create this SIMemOpInfo.
158   SIAtomicScope getScope() const {
159     return Scope;
160   }
161 
162   /// \returns Ordering constraint of the machine instruction used to
163   /// create this SIMemOpInfo.
164   AtomicOrdering getOrdering() const {
165     return Ordering;
166   }
167 
168   /// \returns Failure ordering constraint of the machine instruction used to
169   /// create this SIMemOpInfo.
170   AtomicOrdering getFailureOrdering() const {
171     return FailureOrdering;
172   }
173 
174   /// \returns The address spaces accessed by the machine
175   /// instruction used to create this SIMemOpInfo.
176   SIAtomicAddrSpace getInstrAddrSpace() const {
177     return InstrAddrSpace;
178   }
179 
180   /// \returns The address spaces that must be ordered by the machine
181   /// instruction used to create this SIMemOpInfo.
182   SIAtomicAddrSpace getOrderingAddrSpace() const {
183     return OrderingAddrSpace;
184   }
185 
186   /// \returns True iff memory ordering of operations on
187   /// different address spaces is required.
188   bool getIsCrossAddressSpaceOrdering() const {
189     return IsCrossAddressSpaceOrdering;
190   }
191 
192   /// \returns True if memory access of the machine instruction used to
193   /// create this SIMemOpInfo is volatile, false otherwise.
194   bool isVolatile() const {
195     return IsVolatile;
196   }
197 
198   /// \returns True if memory access of the machine instruction used to
199   /// create this SIMemOpInfo is nontemporal, false otherwise.
200   bool isNonTemporal() const {
201     return IsNonTemporal;
202   }
203 
204   /// \returns True if ordering constraint of the machine instruction used to
205   /// create this SIMemOpInfo is unordered or higher, false otherwise.
206   bool isAtomic() const {
207     return Ordering != AtomicOrdering::NotAtomic;
208   }
209 
210 };
211 
212 class SIMemOpAccess final {
213 private:
214   AMDGPUMachineModuleInfo *MMI = nullptr;
215 
216   /// Reports unsupported message \p Msg for \p MI to LLVM context.
217   void reportUnsupported(const MachineBasicBlock::iterator &MI,
218                          const char *Msg) const;
219 
220   /// Inspects the target synchronization scope \p SSID and determines
221   /// the SI atomic scope it corresponds to, the address spaces it
222   /// covers, and whether the memory ordering applies between address
223   /// spaces.
224   std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226 
227   /// \returns The set of SI atomic address spaces corresponding to \p AS.
228   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229 
230   /// \returns Info constructed from \p MI, which has at least one machine memory
231   /// operand.
232   std::optional<SIMemOpInfo>
233   constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
234 
235 public:
236   /// Construct class to support accessing the machine memory operands
237   /// of instructions in the machine function \p MF.
238   SIMemOpAccess(MachineFunction &MF);
239 
240   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
241   std::optional<SIMemOpInfo>
242   getLoadInfo(const MachineBasicBlock::iterator &MI) const;
243 
244   /// \returns Store info if \p MI is a store operation, "std::nullopt"
245   /// otherwise.
246   std::optional<SIMemOpInfo>
247   getStoreInfo(const MachineBasicBlock::iterator &MI) const;
248 
249   /// \returns Atomic fence info if \p MI is an atomic fence operation,
250   /// "std::nullopt" otherwise.
251   std::optional<SIMemOpInfo>
252   getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
253 
254   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
255   /// rmw operation, "std::nullopt" otherwise.
256   std::optional<SIMemOpInfo>
257   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
258 };
259 
260 class SICacheControl {
261 protected:
262 
263   /// AMDGPU subtarget info.
264   const GCNSubtarget &ST;
265 
266   /// Instruction info.
267   const SIInstrInfo *TII = nullptr;
268 
269   IsaVersion IV;
270 
271   /// Whether to insert cache invalidating instructions.
272   bool InsertCacheInv;
273 
274   SICacheControl(const GCNSubtarget &ST);
275 
276   /// Sets named bit \p Bit to "true" if present in instruction \p MI.
277   /// \returns True if \p MI is modified, false otherwise.
278   bool enableNamedBit(const MachineBasicBlock::iterator MI,
279                       AMDGPU::CPol::CPol Bit) const;
280 
281 public:
282 
283   /// Create a cache control for the subtarget \p ST.
284   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
285 
286   /// Update \p MI memory load instruction to bypass any caches up to
287   /// the \p Scope memory scope for address spaces \p AddrSpace.
288   /// Return true iff the instruction was modified.
289   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
290                                      SIAtomicScope Scope,
291                                      SIAtomicAddrSpace AddrSpace) const = 0;
292 
293   /// Update \p MI memory store instruction to bypass any caches up to
294   /// the \p Scope memory scope for address spaces \p AddrSpace.
295   /// Return true iff the instruction was modified.
296   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
297                                       SIAtomicScope Scope,
298                                       SIAtomicAddrSpace AddrSpace) const = 0;
299 
300   /// Update \p MI memory read-modify-write instruction to bypass any caches up
301   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
302   /// iff the instruction was modified.
303   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
304                                     SIAtomicScope Scope,
305                                     SIAtomicAddrSpace AddrSpace) const = 0;
306 
307   /// Update \p MI memory instruction of kind \p Op associated with address
308   /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
309   /// true iff the instruction was modified.
310   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
311                                               SIAtomicAddrSpace AddrSpace,
312                                               SIMemOp Op, bool IsVolatile,
313                                               bool IsNonTemporal) const = 0;
314 
315   /// Inserts any necessary instructions at position \p Pos relative
316   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
317   /// \p Op associated with address spaces \p AddrSpace have completed. Used
318   /// between memory instructions to enforce the order they become visible as
319   /// observed by other memory instructions executing in memory scope \p Scope.
320   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
321   /// address spaces. Returns true iff any instructions are inserted.
322   virtual bool insertWait(MachineBasicBlock::iterator &MI,
323                           SIAtomicScope Scope,
324                           SIAtomicAddrSpace AddrSpace,
325                           SIMemOp Op,
326                           bool IsCrossAddrSpaceOrdering,
327                           Position Pos) const = 0;
328 
329   /// Inserts any necessary instructions at position \p Pos relative to
330   /// instruction \p MI to ensure any subsequent memory instructions of this
331   /// thread with address spaces \p AddrSpace will observe the previous memory
332   /// operations by any thread for memory scopes up to memory scope \p Scope .
333   /// Returns true iff any instructions are inserted.
334   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
335                              SIAtomicScope Scope,
336                              SIAtomicAddrSpace AddrSpace,
337                              Position Pos) const = 0;
338 
339   /// Inserts any necessary instructions at position \p Pos relative to
340   /// instruction \p MI to ensure previous memory instructions by this thread
341   /// with address spaces \p AddrSpace have completed and can be observed by
342   /// subsequent memory instructions by any thread executing in memory scope \p
343   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
344   /// between address spaces. Returns true iff any instructions are inserted.
345   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
346                              SIAtomicScope Scope,
347                              SIAtomicAddrSpace AddrSpace,
348                              bool IsCrossAddrSpaceOrdering,
349                              Position Pos) const = 0;
350 
351   /// Virtual destructor to allow derived classes to be deleted.
352   virtual ~SICacheControl() = default;
353 
354   virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
355                                    MachineBasicBlock::iterator &MI) const {
356     return false;
357   }
358 };
359 
360 class SIGfx6CacheControl : public SICacheControl {
361 protected:
362 
363   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
364   /// is modified, false otherwise.
365   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
366     return enableNamedBit(MI, AMDGPU::CPol::GLC);
367   }
368 
369   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
370   /// is modified, false otherwise.
371   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
372     return enableNamedBit(MI, AMDGPU::CPol::SLC);
373   }
374 
375 public:
376 
377   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
378 
379   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
380                              SIAtomicScope Scope,
381                              SIAtomicAddrSpace AddrSpace) const override;
382 
383   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
384                               SIAtomicScope Scope,
385                               SIAtomicAddrSpace AddrSpace) const override;
386 
387   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
388                             SIAtomicScope Scope,
389                             SIAtomicAddrSpace AddrSpace) const override;
390 
391   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
392                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
393                                       bool IsVolatile,
394                                       bool IsNonTemporal) const override;
395 
396   bool insertWait(MachineBasicBlock::iterator &MI,
397                   SIAtomicScope Scope,
398                   SIAtomicAddrSpace AddrSpace,
399                   SIMemOp Op,
400                   bool IsCrossAddrSpaceOrdering,
401                   Position Pos) const override;
402 
403   bool insertAcquire(MachineBasicBlock::iterator &MI,
404                      SIAtomicScope Scope,
405                      SIAtomicAddrSpace AddrSpace,
406                      Position Pos) const override;
407 
408   bool insertRelease(MachineBasicBlock::iterator &MI,
409                      SIAtomicScope Scope,
410                      SIAtomicAddrSpace AddrSpace,
411                      bool IsCrossAddrSpaceOrdering,
412                      Position Pos) const override;
413 };
414 
415 class SIGfx7CacheControl : public SIGfx6CacheControl {
416 public:
417 
418   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
419 
420   bool insertAcquire(MachineBasicBlock::iterator &MI,
421                      SIAtomicScope Scope,
422                      SIAtomicAddrSpace AddrSpace,
423                      Position Pos) const override;
424 
425 };
426 
427 class SIGfx90ACacheControl : public SIGfx7CacheControl {
428 public:
429 
430   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
431 
432   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
433                              SIAtomicScope Scope,
434                              SIAtomicAddrSpace AddrSpace) const override;
435 
436   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
437                               SIAtomicScope Scope,
438                               SIAtomicAddrSpace AddrSpace) const override;
439 
440   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
441                             SIAtomicScope Scope,
442                             SIAtomicAddrSpace AddrSpace) const override;
443 
444   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
445                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
446                                       bool IsVolatile,
447                                       bool IsNonTemporal) const override;
448 
449   bool insertWait(MachineBasicBlock::iterator &MI,
450                   SIAtomicScope Scope,
451                   SIAtomicAddrSpace AddrSpace,
452                   SIMemOp Op,
453                   bool IsCrossAddrSpaceOrdering,
454                   Position Pos) const override;
455 
456   bool insertAcquire(MachineBasicBlock::iterator &MI,
457                      SIAtomicScope Scope,
458                      SIAtomicAddrSpace AddrSpace,
459                      Position Pos) const override;
460 
461   bool insertRelease(MachineBasicBlock::iterator &MI,
462                      SIAtomicScope Scope,
463                      SIAtomicAddrSpace AddrSpace,
464                      bool IsCrossAddrSpaceOrdering,
465                      Position Pos) const override;
466 };
467 
468 class SIGfx940CacheControl : public SIGfx90ACacheControl {
469 protected:
470 
471   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
472   /// is modified, false otherwise.
473   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
474     return enableNamedBit(MI, AMDGPU::CPol::SC0);
475   }
476 
477   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
478   /// is modified, false otherwise.
479   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
480     return enableNamedBit(MI, AMDGPU::CPol::SC1);
481   }
482 
483   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
484   /// is modified, false otherwise.
485   bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
486     return enableNamedBit(MI, AMDGPU::CPol::NT);
487   }
488 
489 public:
490 
491   SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}
492 
493   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
494                              SIAtomicScope Scope,
495                              SIAtomicAddrSpace AddrSpace) const override;
496 
497   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
498                               SIAtomicScope Scope,
499                               SIAtomicAddrSpace AddrSpace) const override;
500 
501   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
502                             SIAtomicScope Scope,
503                             SIAtomicAddrSpace AddrSpace) const override;
504 
505   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
506                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
507                                       bool IsVolatile,
508                                       bool IsNonTemporal) const override;
509 
510   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
511                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
512 
513   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
514                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
515                      Position Pos) const override;
516 
517   bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
518                            MachineBasicBlock::iterator &MI) const override {
519     bool Changed = false;
520     if (ST.hasForceStoreSC0SC1() &&
521         (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
522                                     SIAtomicAddrSpace::GLOBAL |
523                                     SIAtomicAddrSpace::OTHER)) !=
524          SIAtomicAddrSpace::NONE) {
525       Changed |= enableSC0Bit(MI);
526       Changed |= enableSC1Bit(MI);
527     }
528     return Changed;
529   }
530 };
531 
532 class SIGfx10CacheControl : public SIGfx7CacheControl {
533 protected:
534 
535   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
536   /// is modified, false otherwise.
537   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
538     return enableNamedBit(MI, AMDGPU::CPol::DLC);
539   }
540 
541 public:
542 
543   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
544 
545   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
546                              SIAtomicScope Scope,
547                              SIAtomicAddrSpace AddrSpace) const override;
548 
549   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
550                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
551                                       bool IsVolatile,
552                                       bool IsNonTemporal) const override;
553 
554   bool insertWait(MachineBasicBlock::iterator &MI,
555                   SIAtomicScope Scope,
556                   SIAtomicAddrSpace AddrSpace,
557                   SIMemOp Op,
558                   bool IsCrossAddrSpaceOrdering,
559                   Position Pos) const override;
560 
561   bool insertAcquire(MachineBasicBlock::iterator &MI,
562                      SIAtomicScope Scope,
563                      SIAtomicAddrSpace AddrSpace,
564                      Position Pos) const override;
565 };
566 
567 class SIGfx11CacheControl : public SIGfx10CacheControl {
568 public:
569   SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
570 
571   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
572                              SIAtomicScope Scope,
573                              SIAtomicAddrSpace AddrSpace) const override;
574 
575   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
576                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
577                                       bool IsVolatile,
578                                       bool IsNonTemporal) const override;
579 };
580 
581 class SIGfx12CacheControl : public SIGfx11CacheControl {
582 public:
583   SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
584 
585   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
586                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
587 };
588 
589 class SIMemoryLegalizer final : public MachineFunctionPass {
590 private:
591 
592   /// Cache Control.
593   std::unique_ptr<SICacheControl> CC = nullptr;
594 
595   /// List of atomic pseudo instructions.
596   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
597 
598   /// Return true iff instruction \p MI is an atomic instruction that
599   /// returns a result.
600   bool isAtomicRet(const MachineInstr &MI) const {
601     return SIInstrInfo::isAtomicRet(MI);
602   }
603 
604   /// Removes all processed atomic pseudo instructions from the current
605   /// function. Returns true if the current function is modified, false otherwise.
606   bool removeAtomicPseudoMIs();
607 
608   /// Expands load operation \p MI. Returns true if instructions are
609   /// added/deleted or \p MI is modified, false otherwise.
610   bool expandLoad(const SIMemOpInfo &MOI,
611                   MachineBasicBlock::iterator &MI);
612   /// Expands store operation \p MI. Returns true if instructions are
613   /// added/deleted or \p MI is modified, false otherwise.
614   bool expandStore(const SIMemOpInfo &MOI,
615                    MachineBasicBlock::iterator &MI);
616   /// Expands atomic fence operation \p MI. Returns true if
617   /// instructions are added/deleted or \p MI is modified, false otherwise.
618   bool expandAtomicFence(const SIMemOpInfo &MOI,
619                          MachineBasicBlock::iterator &MI);
620   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
621   /// instructions are added/deleted or \p MI is modified, false otherwise.
622   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
623                                 MachineBasicBlock::iterator &MI);
624 
625 public:
626   static char ID;
627 
628   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
629 
630   void getAnalysisUsage(AnalysisUsage &AU) const override {
631     AU.setPreservesCFG();
632     MachineFunctionPass::getAnalysisUsage(AU);
633   }
634 
635   StringRef getPassName() const override {
636     return PASS_NAME;
637   }
638 
639   bool runOnMachineFunction(MachineFunction &MF) override;
640 };
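// Illustrative sketch (assumed shape of the driver loop, which is not shown in
// this excerpt): runOnMachineFunction queries SIMemOpAccess for each
// instruction and dispatches to the matching expansion, roughly:
//
//   if (const auto &MOI = MOA.getLoadInfo(MI))
//     Changed |= expandLoad(*MOI, MI);
//   else if (const auto &MOI = MOA.getStoreInfo(MI))
//     Changed |= expandStore(*MOI, MI);
//   else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
//     Changed |= expandAtomicFence(*MOI, MI);
//   else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
//     Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);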
641 
642 } // end anonymous namespace
643 
644 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
645                                       const char *Msg) const {
646   const Function &Func = MI->getParent()->getParent()->getFunction();
647   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
648   Func.getContext().diagnose(Diag);
649 }
650 
651 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
652 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
653                                SIAtomicAddrSpace InstrAddrSpace) const {
654   if (SSID == SyncScope::System)
655     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
656   if (SSID == MMI->getAgentSSID())
657     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
658   if (SSID == MMI->getWorkgroupSSID())
659     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
660                       true);
661   if (SSID == MMI->getWavefrontSSID())
662     return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
663                       true);
664   if (SSID == SyncScope::SingleThread)
665     return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
666                       true);
667   if (SSID == MMI->getSystemOneAddressSpaceSSID())
668     return std::tuple(SIAtomicScope::SYSTEM,
669                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
670   if (SSID == MMI->getAgentOneAddressSpaceSSID())
671     return std::tuple(SIAtomicScope::AGENT,
672                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
673   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
674     return std::tuple(SIAtomicScope::WORKGROUP,
675                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
676   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
677     return std::tuple(SIAtomicScope::WAVEFRONT,
678                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
679   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
680     return std::tuple(SIAtomicScope::SINGLETHREAD,
681                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
682   return std::nullopt;
683 }
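// Example (illustrative only): the one-address-space scopes restrict ordering
// to the address spaces the instruction itself accesses and report no
// cross-address-space ordering, e.g.
//
//   toSIAtomicScope(MMI->getAgentOneAddressSpaceSSID(), SIAtomicAddrSpace::GLOBAL)
//     == std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::GLOBAL, false)
//
// whereas the plain "agent" scope yields (AGENT, ATOMIC, true).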
684 
685 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
686   if (AS == AMDGPUAS::FLAT_ADDRESS)
687     return SIAtomicAddrSpace::FLAT;
688   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
689     return SIAtomicAddrSpace::GLOBAL;
690   if (AS == AMDGPUAS::LOCAL_ADDRESS)
691     return SIAtomicAddrSpace::LDS;
692   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
693     return SIAtomicAddrSpace::SCRATCH;
694   if (AS == AMDGPUAS::REGION_ADDRESS)
695     return SIAtomicAddrSpace::GDS;
696 
697   return SIAtomicAddrSpace::OTHER;
698 }
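// Example (illustrative only): a flat pointer covers the whole FLAT set, while
// an address space with no atomic-relevant mapping falls through to OTHER:
//
//   toSIAtomicAddrSpace(AMDGPUAS::FLAT_ADDRESS) == SIAtomicAddrSpace::FLAT
//   toSIAtomicAddrSpace(AMDGPUAS::CONSTANT_ADDRESS) == SIAtomicAddrSpace::OTHER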
699 
700 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
701   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
702 }
703 
704 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
705     const MachineBasicBlock::iterator &MI) const {
706   assert(MI->getNumMemOperands() > 0);
707 
708   SyncScope::ID SSID = SyncScope::SingleThread;
709   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
710   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
711   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
712   bool IsNonTemporal = true;
713   bool IsVolatile = false;
714 
715   // Validator should check whether or not MMOs cover the entire set of
716   // locations accessed by the memory instruction.
717   for (const auto &MMO : MI->memoperands()) {
718     IsNonTemporal &= MMO->isNonTemporal();
719     IsVolatile |= MMO->isVolatile();
720     InstrAddrSpace |=
721       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
722     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
723     if (OpOrdering != AtomicOrdering::NotAtomic) {
724       const auto &IsSyncScopeInclusion =
725           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
726       if (!IsSyncScopeInclusion) {
727         reportUnsupported(MI,
728           "Unsupported non-inclusive atomic synchronization scope");
729         return std::nullopt;
730       }
731 
732       SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
733       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
734       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
735              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
736       FailureOrdering =
737           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
738     }
739   }
740 
741   SIAtomicScope Scope = SIAtomicScope::NONE;
742   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
743   bool IsCrossAddressSpaceOrdering = false;
744   if (Ordering != AtomicOrdering::NotAtomic) {
745     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
746     if (!ScopeOrNone) {
747       reportUnsupported(MI, "Unsupported atomic synchronization scope");
748       return std::nullopt;
749     }
750     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
751         *ScopeOrNone;
752     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
753         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
754         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
755       reportUnsupported(MI, "Unsupported atomic address space");
756       return std::nullopt;
757     }
758   }
759   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
760                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
761                      IsNonTemporal);
762 }
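// Worked example (illustrative only): for an instruction carrying two memory
// operands, a monotonic agent-scope GLOBAL access and a non-atomic LDS access,
// the loop above yields Ordering == Monotonic, SSID == agent, and
// InstrAddrSpace == GLOBAL | LDS; IsNonTemporal survives only if every operand
// is nontemporal, while IsVolatile is set if any operand is volatile.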
763 
764 std::optional<SIMemOpInfo>
765 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
766   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
767 
768   if (!(MI->mayLoad() && !MI->mayStore()))
769     return std::nullopt;
770 
771   // Be conservative if there are no memory operands.
772   if (MI->getNumMemOperands() == 0)
773     return SIMemOpInfo();
774 
775   return constructFromMIWithMMO(MI);
776 }
777 
778 std::optional<SIMemOpInfo>
779 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
780   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
781 
782   if (!(!MI->mayLoad() && MI->mayStore()))
783     return std::nullopt;
784 
785   // Be conservative if there are no memory operands.
786   if (MI->getNumMemOperands() == 0)
787     return SIMemOpInfo();
788 
789   return constructFromMIWithMMO(MI);
790 }
791 
792 std::optional<SIMemOpInfo>
793 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
794   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
795 
796   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
797     return std::nullopt;
798 
799   AtomicOrdering Ordering =
800     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
801 
802   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
803   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
804   if (!ScopeOrNone) {
805     reportUnsupported(MI, "Unsupported atomic synchronization scope");
806     return std::nullopt;
807   }
808 
809   SIAtomicScope Scope = SIAtomicScope::NONE;
810   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
811   bool IsCrossAddressSpaceOrdering = false;
812   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
813       *ScopeOrNone;
814 
815   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
816       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
817     reportUnsupported(MI, "Unsupported atomic address space");
818     return std::nullopt;
819   }
820 
821   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
822                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
823 }
824 
825 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
826     const MachineBasicBlock::iterator &MI) const {
827   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
828 
829   if (!(MI->mayLoad() && MI->mayStore()))
830     return std::nullopt;
831 
832   // Be conservative if there are no memory operands.
833   if (MI->getNumMemOperands() == 0)
834     return SIMemOpInfo();
835 
836   return constructFromMIWithMMO(MI);
837 }
838 
839 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
840   TII = ST.getInstrInfo();
841   IV = getIsaVersion(ST.getCPU());
842   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
843 }
844 
845 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
846                                     AMDGPU::CPol::CPol Bit) const {
847   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
848   if (!CPol)
849     return false;
850 
851   CPol->setImm(CPol->getImm() | Bit);
852   return true;
853 }
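// Example (illustrative only): for a MUBUF load such as BUFFER_LOAD_DWORD the
// cpol operand carries the cache-policy bits, so
//
//   enableNamedBit(MI, AMDGPU::CPol::GLC);
//
// ORs GLC into that immediate and returns true; an instruction without a cpol
// operand (e.g. a DS_READ_B32) is left unchanged and the call returns false.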
854 
855 /* static */
856 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
857   GCNSubtarget::Generation Generation = ST.getGeneration();
858   if (ST.hasGFX940Insts())
859     return std::make_unique<SIGfx940CacheControl>(ST);
860   if (ST.hasGFX90AInsts())
861     return std::make_unique<SIGfx90ACacheControl>(ST);
862   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
863     return std::make_unique<SIGfx6CacheControl>(ST);
864   if (Generation < AMDGPUSubtarget::GFX10)
865     return std::make_unique<SIGfx7CacheControl>(ST);
866   if (Generation < AMDGPUSubtarget::GFX11)
867     return std::make_unique<SIGfx10CacheControl>(ST);
868   if (Generation < AMDGPUSubtarget::GFX12)
869     return std::make_unique<SIGfx11CacheControl>(ST);
870   return std::make_unique<SIGfx12CacheControl>(ST);
871 }
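// Example (illustrative only): the dispatch above keys on feature bits before
// generation, so gfx90a (GFX9 generation, but hasGFX90AInsts()) gets
// SIGfx90ACacheControl, while plain gfx900 falls through the generation checks
// to SIGfx7CacheControl.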
872 
873 bool SIGfx6CacheControl::enableLoadCacheBypass(
874     const MachineBasicBlock::iterator &MI,
875     SIAtomicScope Scope,
876     SIAtomicAddrSpace AddrSpace) const {
877   assert(MI->mayLoad() && !MI->mayStore());
878   bool Changed = false;
879 
880   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
881     switch (Scope) {
882     case SIAtomicScope::SYSTEM:
883     case SIAtomicScope::AGENT:
884       // Set L1 cache policy to MISS_EVICT.
885       // Note: there is no L2 cache bypass policy at the ISA level.
886       Changed |= enableGLCBit(MI);
887       break;
888     case SIAtomicScope::WORKGROUP:
889     case SIAtomicScope::WAVEFRONT:
890     case SIAtomicScope::SINGLETHREAD:
891       // No cache to bypass.
892       break;
893     default:
894       llvm_unreachable("Unsupported synchronization scope");
895     }
896   }
897 
898   /// The scratch address space does not need the global memory caches
899   /// to be bypassed as all memory operations by the same thread are
900   /// sequentially consistent, and no other thread can access scratch
901   /// memory.
902 
903   /// Other address spaces do not have a cache.
904 
905   return Changed;
906 }
907 
908 bool SIGfx6CacheControl::enableStoreCacheBypass(
909     const MachineBasicBlock::iterator &MI,
910     SIAtomicScope Scope,
911     SIAtomicAddrSpace AddrSpace) const {
912   assert(!MI->mayLoad() && MI->mayStore());
913   bool Changed = false;
914 
915   /// The L1 cache is write-through so does not need to be bypassed. There is no
916   /// bypass control for the L2 cache at the ISA level.
917 
918   return Changed;
919 }
920 
921 bool SIGfx6CacheControl::enableRMWCacheBypass(
922     const MachineBasicBlock::iterator &MI,
923     SIAtomicScope Scope,
924     SIAtomicAddrSpace AddrSpace) const {
925   assert(MI->mayLoad() && MI->mayStore());
926   bool Changed = false;
927 
928   /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
929   /// bypassed, and the GLC bit is instead used to indicate if they are
930   /// return or no-return.
931   /// Note: there is no L2 cache coherent bypass control at the ISA level.
932 
933   return Changed;
934 }
935 
936 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
937     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
938     bool IsVolatile, bool IsNonTemporal) const {
939   // Only handle load and store, not atomic read-modify-write instructions. The
940   // latter use glc to indicate if the atomic returns a result and so must not
941   // be used for cache control.
942   assert(MI->mayLoad() ^ MI->mayStore());
943 
944   // Only update load and store, not LLVM IR atomic read-modify-write
945   // instructions. The latter are always marked as volatile, so they cannot
946   // sensibly be handled here without pessimizing all atomics. They also do
947   // not support the nontemporal attribute.
948   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
949 
950   bool Changed = false;
951 
952   if (IsVolatile) {
953     // Set L1 cache policy to be MISS_EVICT for load instructions
954     // and MISS_LRU for store instructions.
955     // Note: there is no L2 cache bypass policy at the ISA level.
956     if (Op == SIMemOp::LOAD)
957       Changed |= enableGLCBit(MI);
958 
959     // Ensure operation has completed at system scope to cause all volatile
960     // operations to be visible outside the program in a global order. Do not
961     // request cross address space as only the global address space can be
962     // observable outside the program, so no need to cause a waitcnt for LDS
963     // address space operations.
964     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
965                           Position::AFTER);
966 
967     return Changed;
968   }
969 
970   if (IsNonTemporal) {
971     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
972     // for both loads and stores, and the L2 cache policy to STREAM.
973     Changed |= enableGLCBit(MI);
974     Changed |= enableSLCBit(MI);
975     return Changed;
976   }
977 
978   return Changed;
979 }
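// Example (illustrative only): on GFX6, a volatile global load comes out as a
// glc load followed by a wait, roughly
//
//   BUFFER_LOAD_DWORD ... glc
//   S_WAITCNT vmcnt(0)
//
// while a nontemporal load or store just gets glc and slc with no wait.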
980 
981 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
982                                     SIAtomicScope Scope,
983                                     SIAtomicAddrSpace AddrSpace,
984                                     SIMemOp Op,
985                                     bool IsCrossAddrSpaceOrdering,
986                                     Position Pos) const {
987   bool Changed = false;
988 
989   MachineBasicBlock &MBB = *MI->getParent();
990   DebugLoc DL = MI->getDebugLoc();
991 
992   if (Pos == Position::AFTER)
993     ++MI;
994 
995   bool VMCnt = false;
996   bool LGKMCnt = false;
997 
998   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
999       SIAtomicAddrSpace::NONE) {
1000     switch (Scope) {
1001     case SIAtomicScope::SYSTEM:
1002     case SIAtomicScope::AGENT:
1003       VMCnt |= true;
1004       break;
1005     case SIAtomicScope::WORKGROUP:
1006     case SIAtomicScope::WAVEFRONT:
1007     case SIAtomicScope::SINGLETHREAD:
1008       // The L1 cache keeps all memory operations in order for
1009       // wavefronts in the same work-group.
1010       break;
1011     default:
1012       llvm_unreachable("Unsupported synchronization scope");
1013     }
1014   }
1015 
1016   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1017     switch (Scope) {
1018     case SIAtomicScope::SYSTEM:
1019     case SIAtomicScope::AGENT:
1020     case SIAtomicScope::WORKGROUP:
1021       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1022       // not needed as LDS operations for all waves are executed in a total
1023       // global ordering as observed by all waves. Required if also
1024       // synchronizing with global/GDS memory as LDS operations could be
1025       // reordered with respect to later global/GDS memory operations of the
1026       // same wave.
1027       LGKMCnt |= IsCrossAddrSpaceOrdering;
1028       break;
1029     case SIAtomicScope::WAVEFRONT:
1030     case SIAtomicScope::SINGLETHREAD:
1031       // The LDS keeps all memory operations in order for
1032       // the same wavefront.
1033       break;
1034     default:
1035       llvm_unreachable("Unsupported synchronization scope");
1036     }
1037   }
1038 
1039   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1040     switch (Scope) {
1041     case SIAtomicScope::SYSTEM:
1042     case SIAtomicScope::AGENT:
1043       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1044       // is not needed as GDS operations for all waves are executed in a total
1045       // global ordering as observed by all waves. Required if also
1046       // synchronizing with global/LDS memory as GDS operations could be
1047       // reordered with respect to later global/LDS memory operations of the
1048       // same wave.
1049       LGKMCnt |= IsCrossAddrSpaceOrdering;
1050       break;
1051     case SIAtomicScope::WORKGROUP:
1052     case SIAtomicScope::WAVEFRONT:
1053     case SIAtomicScope::SINGLETHREAD:
1054       // The GDS keeps all memory operations in order for
1055       // the same work-group.
1056       break;
1057     default:
1058       llvm_unreachable("Unsupported synchronization scope");
1059     }
1060   }
1061 
1062   if (VMCnt || LGKMCnt) {
1063     unsigned WaitCntImmediate =
1064       AMDGPU::encodeWaitcnt(IV,
1065                             VMCnt ? 0 : getVmcntBitMask(IV),
1066                             getExpcntBitMask(IV),
1067                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1068     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1069         .addImm(WaitCntImmediate);
1070     Changed = true;
1071   }
1072 
1073   if (Pos == Position::AFTER)
1074     --MI;
1075 
1076   return Changed;
1077 }
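// Example (illustrative only): releasing an agent-scope fence that orders
// GLOBAL | LDS with cross-address-space ordering sets both VMCnt and LGKMCnt
// above, so the emitted (soft) waitcnt corresponds to
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//
// with expcnt left at its "do not wait" maximum.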
1078 
1079 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1080                                        SIAtomicScope Scope,
1081                                        SIAtomicAddrSpace AddrSpace,
1082                                        Position Pos) const {
1083   if (!InsertCacheInv)
1084     return false;
1085 
1086   bool Changed = false;
1087 
1088   MachineBasicBlock &MBB = *MI->getParent();
1089   DebugLoc DL = MI->getDebugLoc();
1090 
1091   if (Pos == Position::AFTER)
1092     ++MI;
1093 
1094   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1095     switch (Scope) {
1096     case SIAtomicScope::SYSTEM:
1097     case SIAtomicScope::AGENT:
1098       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1099       Changed = true;
1100       break;
1101     case SIAtomicScope::WORKGROUP:
1102     case SIAtomicScope::WAVEFRONT:
1103     case SIAtomicScope::SINGLETHREAD:
1104       // No cache to invalidate.
1105       break;
1106     default:
1107       llvm_unreachable("Unsupported synchronization scope");
1108     }
1109   }
1110 
1111   /// The scratch address space does not need the global memory cache
1112   /// to be flushed as all memory operations by the same thread are
1113   /// sequentially consistent, and no other thread can access scratch
1114   /// memory.
1115 
1116   /// Other address spaces do not have a cache.
1117 
1118   if (Pos == Position::AFTER)
1119     --MI;
1120 
1121   return Changed;
1122 }
1123 
1124 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1125                                        SIAtomicScope Scope,
1126                                        SIAtomicAddrSpace AddrSpace,
1127                                        bool IsCrossAddrSpaceOrdering,
1128                                        Position Pos) const {
1129   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1130                     IsCrossAddrSpaceOrdering, Pos);
1131 }
1132 
1133 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1134                                        SIAtomicScope Scope,
1135                                        SIAtomicAddrSpace AddrSpace,
1136                                        Position Pos) const {
1137   if (!InsertCacheInv)
1138     return false;
1139 
1140   bool Changed = false;
1141 
1142   MachineBasicBlock &MBB = *MI->getParent();
1143   DebugLoc DL = MI->getDebugLoc();
1144 
1145   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1146 
1147   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1148                                     ? AMDGPU::BUFFER_WBINVL1
1149                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1150 
1151   if (Pos == Position::AFTER)
1152     ++MI;
1153 
1154   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1155     switch (Scope) {
1156     case SIAtomicScope::SYSTEM:
1157     case SIAtomicScope::AGENT:
1158       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1159       Changed = true;
1160       break;
1161     case SIAtomicScope::WORKGROUP:
1162     case SIAtomicScope::WAVEFRONT:
1163     case SIAtomicScope::SINGLETHREAD:
1164       // No cache to invalidate.
1165       break;
1166     default:
1167       llvm_unreachable("Unsupported synchronization scope");
1168     }
1169   }
1170 
1171   /// The scratch address space does not need the global memory cache
1172   /// to be flushed as all memory operations by the same thread are
1173   /// sequentially consistent, and no other thread can access scratch
1174   /// memory.
1175 
1176   /// Other address spaces do not have a cache.
1177 
1178   if (Pos == Position::AFTER)
1179     --MI;
1180 
1181   return Changed;
1182 }
1183 
1184 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1185     const MachineBasicBlock::iterator &MI,
1186     SIAtomicScope Scope,
1187     SIAtomicAddrSpace AddrSpace) const {
1188   assert(MI->mayLoad() && !MI->mayStore());
1189   bool Changed = false;
1190 
1191   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1192     switch (Scope) {
1193     case SIAtomicScope::SYSTEM:
1194     case SIAtomicScope::AGENT:
1195       // Set the L1 cache policy to MISS_LRU.
1196       // Note: there is no L2 cache bypass policy at the ISA level.
1197       Changed |= enableGLCBit(MI);
1198       break;
1199     case SIAtomicScope::WORKGROUP:
1200       // In threadgroup split mode the waves of a work-group can be executing on
1201       // different CUs. Therefore need to bypass the L1 which is per CU.
1202       // Otherwise in non-threadgroup split mode all waves of a work-group are
1203       // on the same CU, and so the L1 does not need to be bypassed.
1204       if (ST.isTgSplitEnabled())
1205         Changed |= enableGLCBit(MI);
1206       break;
1207     case SIAtomicScope::WAVEFRONT:
1208     case SIAtomicScope::SINGLETHREAD:
1209       // No cache to bypass.
1210       break;
1211     default:
1212       llvm_unreachable("Unsupported synchronization scope");
1213     }
1214   }
1215 
1216   /// The scratch address space does not need the global memory caches
1217   /// to be bypassed as all memory operations by the same thread are
1218   /// sequentially consistent, and no other thread can access scratch
1219   /// memory.
1220 
1221   /// Other address spaces do not have a cache.
1222 
1223   return Changed;
1224 }
1225 
1226 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1227     const MachineBasicBlock::iterator &MI,
1228     SIAtomicScope Scope,
1229     SIAtomicAddrSpace AddrSpace) const {
1230   assert(!MI->mayLoad() && MI->mayStore());
1231   bool Changed = false;
1232 
1233   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1234     switch (Scope) {
1235     case SIAtomicScope::SYSTEM:
1236     case SIAtomicScope::AGENT:
1237       /// Do not set glc for store atomic operations as they implicitly write
1238       /// through the L1 cache.
1239       break;
1240     case SIAtomicScope::WORKGROUP:
1241     case SIAtomicScope::WAVEFRONT:
1242     case SIAtomicScope::SINGLETHREAD:
1243       // No cache to bypass. Store atomics implicitly write through the L1
1244       // cache.
1245       break;
1246     default:
1247       llvm_unreachable("Unsupported synchronization scope");
1248     }
1249   }
1250 
1251   /// The scratch address space does not need the global memory caches
1252   /// to be bypassed as all memory operations by the same thread are
1253   /// sequentially consistent, and no other thread can access scratch
1254   /// memory.
1255 
1256   /// Other address spaces do not have a cache.
1257 
1258   return Changed;
1259 }
1260 
1261 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1262     const MachineBasicBlock::iterator &MI,
1263     SIAtomicScope Scope,
1264     SIAtomicAddrSpace AddrSpace) const {
1265   assert(MI->mayLoad() && MI->mayStore());
1266   bool Changed = false;
1267 
1268   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1269     switch (Scope) {
1270     case SIAtomicScope::SYSTEM:
1271     case SIAtomicScope::AGENT:
1272       /// Do not set glc for RMW atomic operations as they implicitly bypass
1273       /// the L1 cache, and the glc bit is instead used to indicate if they are
1274       /// return or no-return.
1275       break;
1276     case SIAtomicScope::WORKGROUP:
1277     case SIAtomicScope::WAVEFRONT:
1278     case SIAtomicScope::SINGLETHREAD:
1279       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1280       break;
1281     default:
1282       llvm_unreachable("Unsupported synchronization scope");
1283     }
1284   }
1285 
1286   return Changed;
1287 }
1288 
1289 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1290     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1291     bool IsVolatile, bool IsNonTemporal) const {
1292   // Only handle load and store, not atomic read-modify-write instructions. The
1293   // latter use glc to indicate if the atomic returns a result and so must not
1294   // be used for cache control.
1295   assert(MI->mayLoad() ^ MI->mayStore());
1296 
1297   // Only update load and store, not LLVM IR atomic read-modify-write
1298   // instructions. The latter are always marked as volatile, so they cannot
1299   // sensibly be handled here without pessimizing all atomics. They also do
1300   // not support the nontemporal attribute.
1301   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1302 
1303   bool Changed = false;
1304 
1305   if (IsVolatile) {
1306     // Set L1 cache policy to be MISS_EVICT for load instructions
1307     // and MISS_LRU for store instructions.
1308     // Note: there is no L2 cache bypass policy at the ISA level.
1309     if (Op == SIMemOp::LOAD)
1310       Changed |= enableGLCBit(MI);
1311 
1312     // Ensure operation has completed at system scope to cause all volatile
1313     // operations to be visible outside the program in a global order. Do not
1314     // request cross address space as only the global address space can be
1315     // observable outside the program, so no need to cause a waitcnt for LDS
1316     // address space operations.
1317     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1318                           Position::AFTER);
1319 
1320     return Changed;
1321   }
1322 
1323   if (IsNonTemporal) {
1324     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1325     // for both loads and stores, and the L2 cache policy to STREAM.
1326     Changed |= enableGLCBit(MI);
1327     Changed |= enableSLCBit(MI);
1328     return Changed;
1329   }
1330 
1331   return Changed;
1332 }
1333 
1334 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1335                                       SIAtomicScope Scope,
1336                                       SIAtomicAddrSpace AddrSpace,
1337                                       SIMemOp Op,
1338                                       bool IsCrossAddrSpaceOrdering,
1339                                       Position Pos) const {
1340   if (ST.isTgSplitEnabled()) {
1341     // In threadgroup split mode the waves of a work-group can be executing on
1342     // different CUs. Therefore need to wait for global or GDS memory operations
1343     // to complete to ensure they are visible to waves in the other CUs.
1344     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1345     // the same CU, so no need to wait for global memory as all waves in the
1346     // work-group access the same L1, nor wait for GDS as accesses are ordered
1347     // on a CU.
1348     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1349                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1350         (Scope == SIAtomicScope::WORKGROUP)) {
1351       // Same as GFX7 using agent scope.
1352       Scope = SIAtomicScope::AGENT;
1353     }
1354     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1355     // LDS memory operations.
1356     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1357   }
1358   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1359                                         IsCrossAddrSpaceOrdering, Pos);
1360 }
1361 
1362 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1363                                          SIAtomicScope Scope,
1364                                          SIAtomicAddrSpace AddrSpace,
1365                                          Position Pos) const {
1366   if (!InsertCacheInv)
1367     return false;
1368 
1369   bool Changed = false;
1370 
1371   MachineBasicBlock &MBB = *MI->getParent();
1372   DebugLoc DL = MI->getDebugLoc();
1373 
1374   if (Pos == Position::AFTER)
1375     ++MI;
1376 
1377   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1378     switch (Scope) {
1379     case SIAtomicScope::SYSTEM:
1380       // Ensures that following loads will not see stale remote VMEM data or
1381       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1382       // CC will never be stale due to the local memory probes.
1383       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1384       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1385       // hardware does not reorder memory operations by the same wave with
1386       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1387       // remove any cache lines of earlier writes by the same wave and ensures
1388       // later reads by the same wave will refetch the cache lines.
1389       Changed = true;
1390       break;
1391     case SIAtomicScope::AGENT:
1392       // Same as GFX7.
1393       break;
1394     case SIAtomicScope::WORKGROUP:
1395       // In threadgroup split mode the waves of a work-group can be executing on
1396       // different CUs. Therefore need to invalidate the L1 which is per CU.
1397       // Otherwise in non-threadgroup split mode all waves of a work-group are
1398       // on the same CU, and so the L1 does not need to be invalidated.
1399       if (ST.isTgSplitEnabled()) {
1400         // Same as GFX7 using agent scope.
1401         Scope = SIAtomicScope::AGENT;
1402       }
1403       break;
1404     case SIAtomicScope::WAVEFRONT:
1405     case SIAtomicScope::SINGLETHREAD:
1406       // Same as GFX7.
1407       break;
1408     default:
1409       llvm_unreachable("Unsupported synchronization scope");
1410     }
1411   }
1412 
1413   /// The scratch address space does not need the global memory cache
1414   /// to be flushed as all memory operations by the same thread are
1415   /// sequentially consistent, and no other thread can access scratch
1416   /// memory.
1417 
1418   /// Other address spaces do not have a cache.
1419 
1420   if (Pos == Position::AFTER)
1421     --MI;
1422 
1423   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1424 
1425   return Changed;
1426 }
1427 
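// GFX90A release: at system scope initiate an L2 writeback with BUFFER_WBL2
// before delegating to the GFX7 release, which supplies the
// "S_WAITCNT vmcnt(0)" the writeback requires.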
1428 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1429                                          SIAtomicScope Scope,
1430                                          SIAtomicAddrSpace AddrSpace,
1431                                          bool IsCrossAddrSpaceOrdering,
1432                                          Position Pos) const {
1433   bool Changed = false;
1434 
1435   MachineBasicBlock &MBB = *MI->getParent();
1436   const DebugLoc &DL = MI->getDebugLoc();
1437 
1438   if (Pos == Position::AFTER)
1439     ++MI;
1440 
1441   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1442     switch (Scope) {
1443     case SIAtomicScope::SYSTEM:
1444       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1445       // hardware does not reorder memory operations by the same wave with
1446       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1447       // to initiate writeback of any dirty cache lines of earlier writes by the
1448       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1449       // writeback has completed.
1450       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1451         // Set SC bits to indicate system scope.
1452         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1453       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1454       // vmcnt(0)" needed by the "BUFFER_WBL2".
1455       Changed = true;
1456       break;
1457     case SIAtomicScope::AGENT:
1458     case SIAtomicScope::WORKGROUP:
1459     case SIAtomicScope::WAVEFRONT:
1460     case SIAtomicScope::SINGLETHREAD:
1461       // Same as GFX7.
1462       break;
1463     default:
1464       llvm_unreachable("Unsupported synchronization scope");
1465     }
1466   }
1467 
1468   if (Pos == Position::AFTER)
1469     --MI;
1470 
1471   Changed |=
1472       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1473                                         IsCrossAddrSpaceOrdering, Pos);
1474 
1475   return Changed;
1476 }
1477 
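// GFX940 encodes the cache bypass scope of loads in the SC0/SC1 cache policy
// bits: SC0|SC1 for system, SC1 for agent, SC0 for work-group, and neither
// for wavefront scope.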
1478 bool SIGfx940CacheControl::enableLoadCacheBypass(
1479     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1480     SIAtomicAddrSpace AddrSpace) const {
1481   assert(MI->mayLoad() && !MI->mayStore());
1482   bool Changed = false;
1483 
1484   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1485     switch (Scope) {
1486     case SIAtomicScope::SYSTEM:
1487       // Set SC bits to indicate system scope.
1488       Changed |= enableSC0Bit(MI);
1489       Changed |= enableSC1Bit(MI);
1490       break;
1491     case SIAtomicScope::AGENT:
1492       // Set SC bits to indicate agent scope.
1493       Changed |= enableSC1Bit(MI);
1494       break;
1495     case SIAtomicScope::WORKGROUP:
1496       // In threadgroup split mode the waves of a work-group can be executing on
1497       // different CUs. Therefore need to bypass the L1 which is per CU.
1498       // Otherwise in non-threadgroup split mode all waves of a work-group are
1499       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1500       // bits to indicate work-group scope will do this automatically.
1501       Changed |= enableSC0Bit(MI);
1502       break;
1503     case SIAtomicScope::WAVEFRONT:
1504     case SIAtomicScope::SINGLETHREAD:
1505       // Leave SC bits unset to indicate wavefront scope.
1506       break;
1507     default:
1508       llvm_unreachable("Unsupported synchronization scope");
1509     }
1510   }
1511 
1512   /// The scratch address space does not need the global memory caches
1513   /// to be bypassed as all memory operations by the same thread are
1514   /// sequentially consistent, and no other thread can access scratch
1515   /// memory.
1516 
1517   /// Other address spaces do not have a cache.
1518 
1519   return Changed;
1520 }
1521 
1522 bool SIGfx940CacheControl::enableStoreCacheBypass(
1523     const MachineBasicBlock::iterator &MI,
1524     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1525   assert(!MI->mayLoad() && MI->mayStore());
1526   bool Changed = false;
1527 
1528   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1529     switch (Scope) {
1530     case SIAtomicScope::SYSTEM:
1531       // Set SC bits to indicate system scope.
1532       Changed |= enableSC0Bit(MI);
1533       Changed |= enableSC1Bit(MI);
1534       break;
1535     case SIAtomicScope::AGENT:
1536       // Set SC bits to indicate agent scope.
1537       Changed |= enableSC1Bit(MI);
1538       break;
1539     case SIAtomicScope::WORKGROUP:
1540       // Set SC bits to indicate workgroup scope.
1541       Changed |= enableSC0Bit(MI);
1542       break;
1543     case SIAtomicScope::WAVEFRONT:
1544     case SIAtomicScope::SINGLETHREAD:
1545       // Leave SC bits unset to indicate wavefront scope.
1546       break;
1547     default:
1548       llvm_unreachable("Unsupported synchronization scope");
1549     }
1550   }
1551 
1552   /// The scratch address space does not need the global memory caches
1553   /// to be bypassed as all memory operations by the same thread are
1554   /// sequentially consistent, and no other thread can access scratch
1555   /// memory.
1556 
1557   /// Other address spaces do not have a cache.
1558 
1559   return Changed;
1560 }
1561 
1562 bool SIGfx940CacheControl::enableRMWCacheBypass(
1563     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1564     SIAtomicAddrSpace AddrSpace) const {
1565   assert(MI->mayLoad() && MI->mayStore());
1566   bool Changed = false;
1567 
1568   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1569     switch (Scope) {
1570     case SIAtomicScope::SYSTEM:
1571       // Set SC1 bit to indicate system scope.
1572       Changed |= enableSC1Bit(MI);
1573       break;
1574     case SIAtomicScope::AGENT:
1575     case SIAtomicScope::WORKGROUP:
1576     case SIAtomicScope::WAVEFRONT:
1577     case SIAtomicScope::SINGLETHREAD:
1578       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1579       // to indicate system or agent scope. The SC0 bit is used to indicate if
1580       // they are return or no-return. Leave SC1 bit unset to indicate agent
1581       // scope.
1582       break;
1583     default:
1584       llvm_unreachable("Unsupported synchronization scope");
1585     }
1586   }
1587 
1588   return Changed;
1589 }
1590 
1591 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1592     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1593     bool IsVolatile, bool IsNonTemporal) const {
1594   // Only handle load and store, not atomic read-modify-write instructions. The
1595   // latter use glc to indicate if the atomic returns a result and so must not
1596   // be used for cache control.
1597   assert(MI->mayLoad() ^ MI->mayStore());
1598 
1599   // Only update load and store, not LLVM IR atomic read-modify-write
1600   // instructions. The latter are always marked as volatile, so they cannot
1601   // sensibly be handled here without pessimizing all atomics. They also do not
1602   // support the nontemporal attribute.
1603   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1604 
1605   bool Changed = false;
1606 
1607   if (IsVolatile) {
1608     // Set SC bits to indicate system scope.
1609     Changed |= enableSC0Bit(MI);
1610     Changed |= enableSC1Bit(MI);
1611 
1612     // Ensure operation has completed at system scope to cause all volatile
1613     // operations to be visible outside the program in a global order. Do not
1614     // request cross address space as only the global address space can be
1615     // observable outside the program, so no need to cause a waitcnt for LDS
1616     // address space operations.
1617     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1618                           Position::AFTER);
1619 
1620     return Changed;
1621   }
1622 
1623   if (IsNonTemporal) {
1624     Changed |= enableNTBit(MI);
1625     return Changed;
1626   }
1627 
1628   return Changed;
1629 }
1630 
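// GFX940 acquire: emit a BUFFER_INV whose SC bits select the invalidation
// scope (SC0|SC1 system, SC1 agent, SC0 work-group when threadgroup split
// mode is enabled); no trailing S_WAITCNT is needed because the hardware does
// not reorder later memory operations with a preceding BUFFER_INV.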
1631 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1632                                          SIAtomicScope Scope,
1633                                          SIAtomicAddrSpace AddrSpace,
1634                                          Position Pos) const {
1635   if (!InsertCacheInv)
1636     return false;
1637 
1638   bool Changed = false;
1639 
1640   MachineBasicBlock &MBB = *MI->getParent();
1641   DebugLoc DL = MI->getDebugLoc();
1642 
1643   if (Pos == Position::AFTER)
1644     ++MI;
1645 
1646   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1647     switch (Scope) {
1648     case SIAtomicScope::SYSTEM:
1649       // Ensures that following loads will not see stale remote VMEM data or
1650       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1651       // CC will never be stale due to the local memory probes.
1652       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1653           // Set SC bits to indicate system scope.
1654           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1655       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1656       // hardware does not reorder memory operations by the same wave with
1657       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1658       // remove any cache lines of earlier writes by the same wave and ensures
1659       // later reads by the same wave will refetch the cache lines.
1660       Changed = true;
1661       break;
1662     case SIAtomicScope::AGENT:
1663       // Ensures that following loads will not see stale remote data or local
1664       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1665       // due to the memory probes.
1666       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1667           // Set SC bits to indicate agent scope.
1668           .addImm(AMDGPU::CPol::SC1);
1669       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1670       // does not reorder memory operations with respect to preceeding buffer
1671       // invalidate. The invalidate is guaranteed to remove any cache lines of
1672       // earlier writes and ensures later writes will refetch the cache lines.
1673       Changed = true;
1674       break;
1675     case SIAtomicScope::WORKGROUP:
1676       // In threadgroup split mode the waves of a work-group can be executing on
1677       // different CUs. Therefore need to invalidate the L1 which is per CU.
1678       // Otherwise in non-threadgroup split mode all waves of a work-group are
1679       // on the same CU, and so the L1 does not need to be invalidated.
1680       if (ST.isTgSplitEnabled()) {
1681         // Ensures L1 is invalidated if in threadgroup split mode. In
1682         // non-threadgroup split mode it is a NOP, but no point generating it in
1683         // that case if know not in that mode.
1684         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1685             // Set SC bits to indicate work-group scope.
1686             .addImm(AMDGPU::CPol::SC0);
1687         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1688         // does not reorder memory operations with respect to preceeding buffer
1689         // invalidate. The invalidate is guaranteed to remove any cache lines of
1690         // earlier writes and ensures later writes will refetch the cache lines.
1691         Changed = true;
1692       }
1693       break;
1694     case SIAtomicScope::WAVEFRONT:
1695     case SIAtomicScope::SINGLETHREAD:
1696       // Could generate "BUFFER_INV" but it would do nothing as there are no
1697       // caches to invalidate.
1698       break;
1699     default:
1700       llvm_unreachable("Unsupported synchronization scope");
1701     }
1702   }
1703 
1704   /// The scratch address space does not need the global memory cache
1705   /// to be flushed as all memory operations by the same thread are
1706   /// sequentially consistent, and no other thread can access scratch
1707   /// memory.
1708 
1709   /// Other address spaces do not have a cache.
1710 
1711   if (Pos == Position::AFTER)
1712     --MI;
1713 
1714   return Changed;
1715 }
1716 
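// GFX940 release: emit a BUFFER_WBL2 whose SC bits select the writeback
// scope (system or agent), then rely on the following insertWait to ensure
// the writeback has completed.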
1717 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1718                                          SIAtomicScope Scope,
1719                                          SIAtomicAddrSpace AddrSpace,
1720                                          bool IsCrossAddrSpaceOrdering,
1721                                          Position Pos) const {
1722   bool Changed = false;
1723 
1724   MachineBasicBlock &MBB = *MI->getParent();
1725   DebugLoc DL = MI->getDebugLoc();
1726 
1727   if (Pos == Position::AFTER)
1728     ++MI;
1729 
1730   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1731     switch (Scope) {
1732     case SIAtomicScope::SYSTEM:
1733       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1734       // hardware does not reorder memory operations by the same wave with
1735       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1736       // to initiate writeback of any dirty cache lines of earlier writes by the
1737       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1738       // writeback has completed.
1739       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1740           // Set SC bits to indicate system scope.
1741           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1742       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1743       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1744       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1745       Changed = true;
1746       break;
1747     case SIAtomicScope::AGENT:
1748       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1749           // Set SC bits to indicate agent scope.
1750           .addImm(AMDGPU::CPol::SC1);
1751 
1752       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1753       // SIAtomicScope::AGENT, the following insertWait will generate the
1754       // required "S_WAITCNT vmcnt(0)".
1755       Changed = true;
1756       break;
1757     case SIAtomicScope::WORKGROUP:
1758     case SIAtomicScope::WAVEFRONT:
1759     case SIAtomicScope::SINGLETHREAD:
1760       // Do not generate "BUFFER_WBL2" as there are no caches it would
1761       // writeback, and would require an otherwise unnecessary
1762       // "S_WAITCNT vmcnt(0)".
1763       break;
1764     default:
1765       llvm_unreachable("Unsupported synchronization scope");
1766     }
1767   }
1768 
1769   if (Pos == Position::AFTER)
1770     --MI;
1771 
1772   // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1773   // S_WAITCNT needed.
1774   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1775                         IsCrossAddrSpaceOrdering, Pos);
1776 
1777   return Changed;
1778 }
1779 
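// GFX10 load cache bypass: at system and agent scope set both GLC and DLC so
// the L0 and L1 policies become MISS_EVICT; at work-group scope only GLC is
// needed, and only in WGP mode where the waves of a work-group may be running
// on different CUs, each with its own L0.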
1780 bool SIGfx10CacheControl::enableLoadCacheBypass(
1781     const MachineBasicBlock::iterator &MI,
1782     SIAtomicScope Scope,
1783     SIAtomicAddrSpace AddrSpace) const {
1784   assert(MI->mayLoad() && !MI->mayStore());
1785   bool Changed = false;
1786 
1787   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1788     switch (Scope) {
1789     case SIAtomicScope::SYSTEM:
1790     case SIAtomicScope::AGENT:
1791       // Set the L0 and L1 cache policies to MISS_EVICT.
1792       // Note: there is no L2 cache coherent bypass control at the ISA level.
1793       Changed |= enableGLCBit(MI);
1794       Changed |= enableDLCBit(MI);
1795       break;
1796     case SIAtomicScope::WORKGROUP:
1797       // In WGP mode the waves of a work-group can be executing on either CU of
1798       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1799       // CU mode all waves of a work-group are on the same CU, and so the L0
1800       // does not need to be bypassed.
1801       if (!ST.isCuModeEnabled())
1802         Changed |= enableGLCBit(MI);
1803       break;
1804     case SIAtomicScope::WAVEFRONT:
1805     case SIAtomicScope::SINGLETHREAD:
1806       // No cache to bypass.
1807       break;
1808     default:
1809       llvm_unreachable("Unsupported synchronization scope");
1810     }
1811   }
1812 
1813   /// The scratch address space does not need the global memory caches
1814   /// to be bypassed as all memory operations by the same thread are
1815   /// sequentially consistent, and no other thread can access scratch
1816   /// memory.
1817 
1818   /// Other address spaces do not have a cache.
1819 
1820   return Changed;
1821 }
1822 
1823 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1824     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1825     bool IsVolatile, bool IsNonTemporal) const {
1826 
1827   // Only handle load and store, not atomic read-modify-write instructions. The
1828   // latter use glc to indicate if the atomic returns a result and so must not
1829   // be used for cache control.
1830   assert(MI->mayLoad() ^ MI->mayStore());
1831 
1832   // Only update load and store, not LLVM IR atomic read-modify-write
1833   // instructions. The latter are always marked as volatile, so they cannot
1834   // sensibly be handled here without pessimizing all atomics. They also do not
1835   // support the nontemporal attribute.
1836   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1837 
1838   bool Changed = false;
1839 
1840   if (IsVolatile) {
1841     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1842     // and MISS_LRU for store instructions.
1843     // Note: there is no L2 cache coherent bypass control at the ISA level.
1844     if (Op == SIMemOp::LOAD) {
1845       Changed |= enableGLCBit(MI);
1846       Changed |= enableDLCBit(MI);
1847     }
1848 
1849     // Ensure operation has completed at system scope to cause all volatile
1850     // operations to be visible outside the program in a global order. Do not
1851     // request cross address space as only the global address space can be
1852     // observable outside the program, so no need to cause a waitcnt for LDS
1853     // address space operations.
1854     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1855                           Position::AFTER);
1856     return Changed;
1857   }
1858 
1859   if (IsNonTemporal) {
1860     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1861     // and L2 cache policy to STREAM.
1862     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1863     // to MISS_EVICT and the L2 cache policy to STREAM.
1864     if (Op == SIMemOp::STORE)
1865       Changed |= enableGLCBit(MI);
1866     Changed |= enableSLCBit(MI);
1867 
1868     return Changed;
1869   }
1870 
1871   return Changed;
1872 }
1873 
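// GFX10 splits the vector memory counter into vmcnt (loads) and vscnt
// (stores), so a wait may need both an "S_WAITCNT vmcnt(0)" and an
// "S_WAITCNT_VSCNT null, 0" depending on the memory operations being ordered.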
1874 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1875                                      SIAtomicScope Scope,
1876                                      SIAtomicAddrSpace AddrSpace,
1877                                      SIMemOp Op,
1878                                      bool IsCrossAddrSpaceOrdering,
1879                                      Position Pos) const {
1880   bool Changed = false;
1881 
1882   MachineBasicBlock &MBB = *MI->getParent();
1883   DebugLoc DL = MI->getDebugLoc();
1884 
1885   if (Pos == Position::AFTER)
1886     ++MI;
1887 
1888   bool VMCnt = false;
1889   bool VSCnt = false;
1890   bool LGKMCnt = false;
1891 
1892   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1893       SIAtomicAddrSpace::NONE) {
1894     switch (Scope) {
1895     case SIAtomicScope::SYSTEM:
1896     case SIAtomicScope::AGENT:
1897       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1898         VMCnt |= true;
1899       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1900         VSCnt |= true;
1901       break;
1902     case SIAtomicScope::WORKGROUP:
1903       // In WGP mode the waves of a work-group can be executing on either CU of
1904       // the WGP. Therefore need to wait for operations to complete to ensure
1905       // they are visible to waves in the other CU as the L0 is per CU.
1906       // Otherwise in CU mode all waves of a work-group are on the same CU,
1907       // which shares the same L0.
1908       if (!ST.isCuModeEnabled()) {
1909         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1910           VMCnt |= true;
1911         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1912           VSCnt |= true;
1913       }
1914       break;
1915     case SIAtomicScope::WAVEFRONT:
1916     case SIAtomicScope::SINGLETHREAD:
1917       // The L0 cache keeps all memory operations in order for
1918       // work-items in the same wavefront.
1919       break;
1920     default:
1921       llvm_unreachable("Unsupported synchronization scope");
1922     }
1923   }
1924 
1925   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1926     switch (Scope) {
1927     case SIAtomicScope::SYSTEM:
1928     case SIAtomicScope::AGENT:
1929     case SIAtomicScope::WORKGROUP:
1930       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1931       // not needed as LDS operations for all waves are executed in a total
1932       // global ordering as observed by all waves. Required if also
1933       // synchronizing with global/GDS memory as LDS operations could be
1934       // reordered with respect to later global/GDS memory operations of the
1935       // same wave.
1936       LGKMCnt |= IsCrossAddrSpaceOrdering;
1937       break;
1938     case SIAtomicScope::WAVEFRONT:
1939     case SIAtomicScope::SINGLETHREAD:
1940       // The LDS keeps all memory operations in order for
1941       // the same wavefront.
1942       break;
1943     default:
1944       llvm_unreachable("Unsupported synchronization scope");
1945     }
1946   }
1947 
1948   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1949     switch (Scope) {
1950     case SIAtomicScope::SYSTEM:
1951     case SIAtomicScope::AGENT:
1952       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1953       // is not needed as GDS operations for all waves are executed in a total
1954       // global ordering as observed by all waves. Required if also
1955       // synchronizing with global/LDS memory as GDS operations could be
1956       // reordered with respect to later global/LDS memory operations of the
1957       // same wave.
1958       LGKMCnt |= IsCrossAddrSpaceOrdering;
1959       break;
1960     case SIAtomicScope::WORKGROUP:
1961     case SIAtomicScope::WAVEFRONT:
1962     case SIAtomicScope::SINGLETHREAD:
1963       // The GDS keeps all memory operations in order for
1964       // the same work-group.
1965       break;
1966     default:
1967       llvm_unreachable("Unsupported synchronization scope");
1968     }
1969   }
1970 
1971   if (VMCnt || LGKMCnt) {
1972     unsigned WaitCntImmediate =
1973       AMDGPU::encodeWaitcnt(IV,
1974                             VMCnt ? 0 : getVmcntBitMask(IV),
1975                             getExpcntBitMask(IV),
1976                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1977     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1978         .addImm(WaitCntImmediate);
1979     Changed = true;
1980   }
1981 
1982   if (VSCnt) {
1983     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
1984         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1985         .addImm(0);
1986     Changed = true;
1987   }
1988 
1989   if (Pos == Position::AFTER)
1990     --MI;
1991 
1992   return Changed;
1993 }
1994 
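// GFX10 acquire: invalidate the per-CU L0 with BUFFER_GL0_INV and, at agent
// or system scope, also the L1 with BUFFER_GL1_INV; at work-group scope the
// L0 only needs invalidating in WGP mode.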
1995 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1996                                         SIAtomicScope Scope,
1997                                         SIAtomicAddrSpace AddrSpace,
1998                                         Position Pos) const {
1999   if (!InsertCacheInv)
2000     return false;
2001 
2002   bool Changed = false;
2003 
2004   MachineBasicBlock &MBB = *MI->getParent();
2005   DebugLoc DL = MI->getDebugLoc();
2006 
2007   if (Pos == Position::AFTER)
2008     ++MI;
2009 
2010   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2011     switch (Scope) {
2012     case SIAtomicScope::SYSTEM:
2013     case SIAtomicScope::AGENT:
2014       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2015       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2016       Changed = true;
2017       break;
2018     case SIAtomicScope::WORKGROUP:
2019       // In WGP mode the waves of a work-group can be executing on either CU of
2020       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2021       // in CU mode all waves of a work-group are on the same CU, and so the
2022       // L0 does not need to be invalidated.
2023       if (!ST.isCuModeEnabled()) {
2024         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2025         Changed = true;
2026       }
2027       break;
2028     case SIAtomicScope::WAVEFRONT:
2029     case SIAtomicScope::SINGLETHREAD:
2030       // No cache to invalidate.
2031       break;
2032     default:
2033       llvm_unreachable("Unsupported synchronization scope");
2034     }
2035   }
2036 
2037   /// The scratch address space does not need the global memory cache
2038   /// to be flushed as all memory operations by the same thread are
2039   /// sequentially consistent, and no other thread can access scratch
2040   /// memory.
2041 
2042   /// Other address spaces do not have a cache.
2043 
2044   if (Pos == Position::AFTER)
2045     --MI;
2046 
2047   return Changed;
2048 }
2049 
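// GFX11 differs from GFX10 in that the DLC bit no longer participates in
// load cache bypass (it is used below to request MALL NOALLOC), so only GLC
// is set for agent and system scope loads.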
2050 bool SIGfx11CacheControl::enableLoadCacheBypass(
2051     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2052     SIAtomicAddrSpace AddrSpace) const {
2053   assert(MI->mayLoad() && !MI->mayStore());
2054   bool Changed = false;
2055 
2056   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2057     switch (Scope) {
2058     case SIAtomicScope::SYSTEM:
2059     case SIAtomicScope::AGENT:
2060       // Set the L0 and L1 cache policies to MISS_EVICT.
2061       // Note: there is no L2 cache coherent bypass control at the ISA level.
2062       Changed |= enableGLCBit(MI);
2063       break;
2064     case SIAtomicScope::WORKGROUP:
2065       // In WGP mode the waves of a work-group can be executing on either CU of
2066       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2067       // CU mode all waves of a work-group are on the same CU, and so the L0
2068       // does not need to be bypassed.
2069       if (!ST.isCuModeEnabled())
2070         Changed |= enableGLCBit(MI);
2071       break;
2072     case SIAtomicScope::WAVEFRONT:
2073     case SIAtomicScope::SINGLETHREAD:
2074       // No cache to bypass.
2075       break;
2076     default:
2077       llvm_unreachable("Unsupported synchronization scope");
2078     }
2079   }
2080 
2081   /// The scratch address space does not need the global memory caches
2082   /// to be bypassed as all memory operations by the same thread are
2083   /// sequentially consistent, and no other thread can access scratch
2084   /// memory.
2085 
2086   /// Other address spaces do not have a cache.
2087 
2088   return Changed;
2089 }
2090 
2091 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2092     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2093     bool IsVolatile, bool IsNonTemporal) const {
2094 
2095   // Only handle load and store, not atomic read-modify-write instructions. The
2096   // latter use glc to indicate if the atomic returns a result and so must not
2097   // be used for cache control.
2098   assert(MI->mayLoad() ^ MI->mayStore());
2099 
2100   // Only update load and store, not LLVM IR atomic read-modify-write
2101   // instructions. The latter are always marked as volatile, so they cannot
2102   // sensibly be handled here without pessimizing all atomics. They also do not
2103   // support the nontemporal attribute.
2104   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2105 
2106   bool Changed = false;
2107 
2108   if (IsVolatile) {
2109     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2110     // and MISS_LRU for store instructions.
2111     // Note: there is no L2 cache coherent bypass control at the ISA level.
2112     if (Op == SIMemOp::LOAD)
2113       Changed |= enableGLCBit(MI);
2114 
2115     // Set MALL NOALLOC for load and store instructions.
2116     Changed |= enableDLCBit(MI);
2117 
2118     // Ensure operation has completed at system scope to cause all volatile
2119     // operations to be visible outside the program in a global order. Do not
2120     // request cross address space as only the global address space can be
2121     // observable outside the program, so no need to cause a waitcnt for LDS
2122     // address space operations.
2123     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2124                           Position::AFTER);
2125     return Changed;
2126   }
2127 
2128   if (IsNonTemporal) {
2129     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2130     // and L2 cache policy to STREAM.
2131     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2132     // to MISS_EVICT and the L2 cache policy to STREAM.
2133     if (Op == SIMemOp::STORE)
2134       Changed |= enableGLCBit(MI);
2135     Changed |= enableSLCBit(MI);
2136 
2137     // Set MALL NOALLOC for load and store instructions.
2138     Changed |= enableDLCBit(MI);
2139     return Changed;
2140   }
2141 
2142   return Changed;
2143 }
2144 
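// GFX12 acquire: a single GLOBAL_INV whose scope operand (SCOPE_SE,
// SCOPE_DEV or SCOPE_SYS) selects how far the invalidate must reach; in CU
// mode a work-group scope acquire needs no invalidate at all.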
2145 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2146                                         SIAtomicScope Scope,
2147                                         SIAtomicAddrSpace AddrSpace,
2148                                         Position Pos) const {
2149   if (!InsertCacheInv)
2150     return false;
2151 
2152   MachineBasicBlock &MBB = *MI->getParent();
2153   DebugLoc DL = MI->getDebugLoc();
2154 
2155   /// The scratch address space does not need the global memory cache
2156   /// to be flushed as all memory operations by the same thread are
2157   /// sequentially consistent, and no other thread can access scratch
2158   /// memory.
2159 
2160   /// Other address spaces do not have a cache.
2161   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2162     return false;
2163 
2164   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2165   switch (Scope) {
2166   case SIAtomicScope::SYSTEM:
2167     ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2168     break;
2169   case SIAtomicScope::AGENT:
2170     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2171     break;
2172   case SIAtomicScope::WORKGROUP:
2173     // In WGP mode the waves of a work-group can be executing on either CU of
2174     // the WGP. Therefore we need to invalidate the L0 which is per CU.
2175     // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2176     // the L0 does not need to be invalidated.
2177     if (ST.isCuModeEnabled())
2178       return false;
2179 
2180     ScopeImm = AMDGPU::CPol::SCOPE_SE;
2181     break;
2182   case SIAtomicScope::WAVEFRONT:
2183   case SIAtomicScope::SINGLETHREAD:
2184     // No cache to invalidate.
2185     return false;
2186   default:
2187     llvm_unreachable("Unsupported synchronization scope");
2188   }
2189 
2190   if (Pos == Position::AFTER)
2191     ++MI;
2192 
2193   BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2194 
2195   if (Pos == Position::AFTER)
2196     --MI;
2197 
2198   return true;
2199 }
2200 
2201 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2202   if (AtomicPseudoMIs.empty())
2203     return false;
2204 
2205   for (auto &MI : AtomicPseudoMIs)
2206     MI->eraseFromParent();
2207 
2208   AtomicPseudoMIs.clear();
2209   return true;
2210 }
2211 
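// Expand an atomic load: enable cache bypass for monotonic and stronger
// orderings, insert a full wait before sequentially consistent loads, and
// follow acquire and sequentially consistent loads with a wait on the loaded
// value plus a cache invalidate. Non-atomic loads only get the volatile and
// nontemporal cache policy treatment.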
2212 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2213                                    MachineBasicBlock::iterator &MI) {
2214   assert(MI->mayLoad() && !MI->mayStore());
2215 
2216   bool Changed = false;
2217 
2218   if (MOI.isAtomic()) {
2219     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2220         MOI.getOrdering() == AtomicOrdering::Acquire ||
2221         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2222       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2223                                            MOI.getOrderingAddrSpace());
2224     }
2225 
2226     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2227       Changed |= CC->insertWait(MI, MOI.getScope(),
2228                                 MOI.getOrderingAddrSpace(),
2229                                 SIMemOp::LOAD | SIMemOp::STORE,
2230                                 MOI.getIsCrossAddressSpaceOrdering(),
2231                                 Position::BEFORE);
2232 
2233     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2234         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2235       Changed |= CC->insertWait(MI, MOI.getScope(),
2236                                 MOI.getInstrAddrSpace(),
2237                                 SIMemOp::LOAD,
2238                                 MOI.getIsCrossAddressSpaceOrdering(),
2239                                 Position::AFTER);
2240       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2241                                    MOI.getOrderingAddrSpace(),
2242                                    Position::AFTER);
2243     }
2244 
2245     return Changed;
2246   }
2247 
2248   // Atomic instructions already bypass caches to the scope specified by the
2249   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2250   // need additional treatment.
2251   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2252                                                 SIMemOp::LOAD, MOI.isVolatile(),
2253                                                 MOI.isNonTemporal());
2254   return Changed;
2255 }
2256 
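// Expand an atomic store: enable cache bypass for monotonic and stronger
// orderings and insert a release before release and sequentially consistent
// stores. Non-atomic stores only get the volatile and nontemporal cache
// policy treatment.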
2257 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2258                                     MachineBasicBlock::iterator &MI) {
2259   assert(!MI->mayLoad() && MI->mayStore());
2260 
2261   bool Changed = false;
2262 
2263   if (MOI.isAtomic()) {
2264     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2265         MOI.getOrdering() == AtomicOrdering::Release ||
2266         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2267       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2268                                             MOI.getOrderingAddrSpace());
2269     }
2270 
2271     if (MOI.getOrdering() == AtomicOrdering::Release ||
2272         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2273       Changed |= CC->insertRelease(MI, MOI.getScope(),
2274                                    MOI.getOrderingAddrSpace(),
2275                                    MOI.getIsCrossAddressSpaceOrdering(),
2276                                    Position::BEFORE);
2277 
2278     return Changed;
2279   }
2280 
2281   // Atomic instructions already bypass caches to the scope specified by the
2282   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2283   // need additional treatment.
2284   Changed |= CC->enableVolatileAndOrNonTemporal(
2285       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2286       MOI.isNonTemporal());
2287   return Changed;
2288 }
2289 
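// Expand an LLVM IR fence (the ATOMIC_FENCE pseudo, which is deleted once
// processed): acquire orderings insert a wait and a cache invalidate at the
// fence position, release orderings insert a release. As an illustrative
// sketch (not taken from this file), an acquire fence at agent scope on
// GFX10 would be expected to expand to roughly:
//   s_waitcnt vmcnt(0)
//   s_waitcnt_vscnt null, 0x0
//   buffer_gl1_inv
//   buffer_gl0_inv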
2290 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2291                                           MachineBasicBlock::iterator &MI) {
2292   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2293 
2294   AtomicPseudoMIs.push_back(MI);
2295   bool Changed = false;
2296 
2297   if (MOI.isAtomic()) {
2298     if (MOI.getOrdering() == AtomicOrdering::Acquire)
2299       Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2300                                 SIMemOp::LOAD | SIMemOp::STORE,
2301                                 MOI.getIsCrossAddressSpaceOrdering(),
2302                                 Position::BEFORE);
2303 
2304     if (MOI.getOrdering() == AtomicOrdering::Release ||
2305         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2306         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2307       /// TODO: This relies on a barrier always generating a waitcnt
2308       /// for LDS to ensure it is not reordered with the completion of
2309       /// the preceding LDS operations. If the barrier had a memory
2310       /// ordering and memory scope, then the library would not need to
2311       /// generate a fence. Could add support in this file for
2312       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2313       /// adding S_WAITCNT before a S_BARRIER.
2314       Changed |= CC->insertRelease(MI, MOI.getScope(),
2315                                    MOI.getOrderingAddrSpace(),
2316                                    MOI.getIsCrossAddressSpaceOrdering(),
2317                                    Position::BEFORE);
2318 
2319     // TODO: If both release and invalidate are happening they could be combined
2320     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2321     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2322     // track cache invalidate and write back instructions.
2323 
2324     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2325         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2326         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2327       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2328                                    MOI.getOrderingAddrSpace(),
2329                                    Position::BEFORE);
2330 
2331     return Changed;
2332   }
2333 
2334   return Changed;
2335 }
2336 
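// Expand an atomic cmpxchg or read-modify-write: enable RMW cache bypass for
// all atomic orderings, insert a release before operations with release
// semantics (including a sequentially consistent failure ordering), and after
// operations with acquire semantics insert a wait on the atomic (a load wait
// for returning atomics, a store wait otherwise) followed by a cache
// invalidate.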
2337 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2338   MachineBasicBlock::iterator &MI) {
2339   assert(MI->mayLoad() && MI->mayStore());
2340 
2341   bool Changed = false;
2342 
2343   if (MOI.isAtomic()) {
2344     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2345         MOI.getOrdering() == AtomicOrdering::Acquire ||
2346         MOI.getOrdering() == AtomicOrdering::Release ||
2347         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2348         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2349       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2350                                           MOI.getInstrAddrSpace());
2351     }
2352 
2353     if (MOI.getOrdering() == AtomicOrdering::Release ||
2354         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2355         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2356         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2357       Changed |= CC->insertRelease(MI, MOI.getScope(),
2358                                    MOI.getOrderingAddrSpace(),
2359                                    MOI.getIsCrossAddressSpaceOrdering(),
2360                                    Position::BEFORE);
2361 
2362     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2363         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2364         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2365         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2366         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2367       Changed |= CC->insertWait(MI, MOI.getScope(),
2368                                 MOI.getInstrAddrSpace(),
2369                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
2370                                                    SIMemOp::STORE,
2371                                 MOI.getIsCrossAddressSpaceOrdering(),
2372                                 Position::AFTER);
2373       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2374                                    MOI.getOrderingAddrSpace(),
2375                                    Position::AFTER);
2376     }
2377 
2378     return Changed;
2379   }
2380 
2381   return Changed;
2382 }
2383 
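// Walk every instruction in the function, unbundling memory bundles formed
// after the post-RA scheduler, and expand each instruction marked maybeAtomic
// using the information collected by SIMemOpAccess and the subtarget specific
// SICacheControl.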
2384 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2385   bool Changed = false;
2386 
2387   SIMemOpAccess MOA(MF);
2388   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2389 
2390   for (auto &MBB : MF) {
2391     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2392 
2393       // Unbundle instructions after the post-RA scheduler.
2394       if (MI->isBundle() && MI->mayLoadOrStore()) {
2395         MachineBasicBlock::instr_iterator II(MI->getIterator());
2396         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2397              I != E && I->isBundledWithPred(); ++I) {
2398           I->unbundleFromPred();
2399           for (MachineOperand &MO : I->operands())
2400             if (MO.isReg())
2401               MO.setIsInternalRead(false);
2402         }
2403 
2404         MI->eraseFromParent();
2405         MI = II->getIterator();
2406       }
2407 
2408       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2409         continue;
2410 
2411       if (const auto &MOI = MOA.getLoadInfo(MI))
2412         Changed |= expandLoad(*MOI, MI);
2413       else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2414         Changed |= expandStore(*MOI, MI);
2415         Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2416       } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2417         Changed |= expandAtomicFence(*MOI, MI);
2418       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2419         Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2420     }
2421   }
2422 
2423   Changed |= removeAtomicPseudoMIs();
2424   return Changed;
2425 }
2426 
2427 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2428 
2429 char SIMemoryLegalizer::ID = 0;
2430 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2431 
2432 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2433   return new SIMemoryLegalizer();
2434 }
2435