xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (revision b23dbabb7f3edb3f323a64f03e37be2c9a8b2a45)
1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/Support/AtomicOrdering.h"
25 #include "llvm/Support/TargetParser.h"
26 
27 using namespace llvm;
28 using namespace llvm::AMDGPU;
29 
30 #define DEBUG_TYPE "si-memory-legalizer"
31 #define PASS_NAME "SI Memory Legalizer"
32 
33 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
34     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35     cl::desc("Use this to skip inserting cache invalidating instructions."));
36 
37 namespace {
38 
39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40 
41 /// Memory operation flags. Can be ORed together.
42 enum class SIMemOp {
43   NONE = 0u,
44   LOAD = 1u << 0,
45   STORE = 1u << 1,
46   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47 };
48 
/// Where a new instruction is placed with respect to an existing
/// instruction.
enum class Position { BEFORE, AFTER };
55 
/// Atomic synchronization scopes supported by the AMDGPU target. The
/// enumerators are numbered in declaration order, so two scopes can be
/// compared with the relational operators (SIMemOpInfo uses std::min on them).
enum class SIAtomicScope {
  NONE = 0,
  SINGLETHREAD = 1,
  WAVEFRONT = 2,
  WORKGROUP = 3,
  AGENT = 4,
  SYSTEM = 5
};
65 
66 /// The distinct address spaces supported by the AMDGPU target for
67 /// atomic memory operation. Can be ORed together.
68 enum class SIAtomicAddrSpace {
69   NONE = 0u,
70   GLOBAL = 1u << 0,
71   LDS = 1u << 1,
72   SCRATCH = 1u << 2,
73   GDS = 1u << 3,
74   OTHER = 1u << 4,
75 
76   /// The address spaces that can be accessed by a FLAT instruction.
77   FLAT = GLOBAL | LDS | SCRATCH,
78 
79   /// The address spaces that support atomic instructions.
80   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81 
82   /// All address spaces.
83   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84 
85   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86 };
87 
88 class SIMemOpInfo final {
89 private:
90 
91   friend class SIMemOpAccess;
92 
93   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98   bool IsCrossAddressSpaceOrdering = false;
99   bool IsVolatile = false;
100   bool IsNonTemporal = false;
101 
102   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
103               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106               bool IsCrossAddressSpaceOrdering = true,
107               AtomicOrdering FailureOrdering =
108                 AtomicOrdering::SequentiallyConsistent,
109               bool IsVolatile = false,
110               bool IsNonTemporal = false)
111     : Ordering(Ordering), FailureOrdering(FailureOrdering),
112       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113       InstrAddrSpace(InstrAddrSpace),
114       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115       IsVolatile(IsVolatile),
116       IsNonTemporal(IsNonTemporal) {
117 
118     if (Ordering == AtomicOrdering::NotAtomic) {
119       assert(Scope == SIAtomicScope::NONE &&
120              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121              !IsCrossAddressSpaceOrdering &&
122              FailureOrdering == AtomicOrdering::NotAtomic);
123       return;
124     }
125 
126     assert(Scope != SIAtomicScope::NONE &&
127            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128                SIAtomicAddrSpace::NONE &&
129            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130                SIAtomicAddrSpace::NONE);
131 
132     // There is also no cross address space ordering if the ordering
133     // address space is the same as the instruction address space and
134     // only contains a single address space.
135     if ((OrderingAddrSpace == InstrAddrSpace) &&
136         isPowerOf2_32(uint32_t(InstrAddrSpace)))
137       this->IsCrossAddressSpaceOrdering = false;
138 
139     // Limit the scope to the maximum supported by the instruction's address
140     // spaces.
141     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142         SIAtomicAddrSpace::NONE) {
143       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144     } else if ((InstrAddrSpace &
145                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146                SIAtomicAddrSpace::NONE) {
147       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148     } else if ((InstrAddrSpace &
149                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152     }
153   }
154 
155 public:
156   /// \returns Atomic synchronization scope of the machine instruction used to
157   /// create this SIMemOpInfo.
158   SIAtomicScope getScope() const {
159     return Scope;
160   }
161 
162   /// \returns Ordering constraint of the machine instruction used to
163   /// create this SIMemOpInfo.
164   AtomicOrdering getOrdering() const {
165     return Ordering;
166   }
167 
168   /// \returns Failure ordering constraint of the machine instruction used to
169   /// create this SIMemOpInfo.
170   AtomicOrdering getFailureOrdering() const {
171     return FailureOrdering;
172   }
173 
174   /// \returns The address spaces be accessed by the machine
175   /// instruction used to create this SIMemOpInfo.
176   SIAtomicAddrSpace getInstrAddrSpace() const {
177     return InstrAddrSpace;
178   }
179 
180   /// \returns The address spaces that must be ordered by the machine
181   /// instruction used to create this SIMemOpInfo.
182   SIAtomicAddrSpace getOrderingAddrSpace() const {
183     return OrderingAddrSpace;
184   }
185 
186   /// \returns Return true iff memory ordering of operations on
187   /// different address spaces is required.
188   bool getIsCrossAddressSpaceOrdering() const {
189     return IsCrossAddressSpaceOrdering;
190   }
191 
192   /// \returns True if memory access of the machine instruction used to
193   /// create this SIMemOpInfo is volatile, false otherwise.
194   bool isVolatile() const {
195     return IsVolatile;
196   }
197 
198   /// \returns True if memory access of the machine instruction used to
199   /// create this SIMemOpInfo is nontemporal, false otherwise.
200   bool isNonTemporal() const {
201     return IsNonTemporal;
202   }
203 
204   /// \returns True if ordering constraint of the machine instruction used to
205   /// create this SIMemOpInfo is unordered or higher, false otherwise.
206   bool isAtomic() const {
207     return Ordering != AtomicOrdering::NotAtomic;
208   }
209 
210 };
211 
212 class SIMemOpAccess final {
213 private:
214   AMDGPUMachineModuleInfo *MMI = nullptr;
215 
216   /// Reports unsupported message \p Msg for \p MI to LLVM context.
217   void reportUnsupported(const MachineBasicBlock::iterator &MI,
218                          const char *Msg) const;
219 
220   /// Inspects the target synchronization scope \p SSID and determines
221   /// the SI atomic scope it corresponds to, the address spaces it
222   /// covers, and whether the memory ordering applies between address
223   /// spaces.
224   std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226 
227   /// \return Return a bit set of the address spaces accessed by \p AS.
228   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229 
230   /// \returns Info constructed from \p MI, which has at least machine memory
231   /// operand.
232   std::optional<SIMemOpInfo>
233   constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
234 
235 public:
236   /// Construct class to support accessing the machine memory operands
237   /// of instructions in the machine function \p MF.
238   SIMemOpAccess(MachineFunction &MF);
239 
240   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
241   std::optional<SIMemOpInfo>
242   getLoadInfo(const MachineBasicBlock::iterator &MI) const;
243 
244   /// \returns Store info if \p MI is a store operation, "std::nullopt"
245   /// otherwise.
246   std::optional<SIMemOpInfo>
247   getStoreInfo(const MachineBasicBlock::iterator &MI) const;
248 
249   /// \returns Atomic fence info if \p MI is an atomic fence operation,
250   /// "std::nullopt" otherwise.
251   std::optional<SIMemOpInfo>
252   getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
253 
254   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
255   /// rmw operation, "std::nullopt" otherwise.
256   std::optional<SIMemOpInfo>
257   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
258 };
259 
260 class SICacheControl {
261 protected:
262 
263   /// AMDGPU subtarget info.
264   const GCNSubtarget &ST;
265 
266   /// Instruction info.
267   const SIInstrInfo *TII = nullptr;
268 
269   IsaVersion IV;
270 
271   /// Whether to insert cache invalidating instructions.
272   bool InsertCacheInv;
273 
274   SICacheControl(const GCNSubtarget &ST);
275 
276   /// Sets named bit \p BitName to "true" if present in instruction \p MI.
277   /// \returns Returns true if \p MI is modified, false otherwise.
278   bool enableNamedBit(const MachineBasicBlock::iterator MI,
279                       AMDGPU::CPol::CPol Bit) const;
280 
281 public:
282 
283   /// Create a cache control for the subtarget \p ST.
284   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
285 
286   /// Update \p MI memory load instruction to bypass any caches up to
287   /// the \p Scope memory scope for address spaces \p
288   /// AddrSpace. Return true iff the instruction was modified.
289   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
290                                      SIAtomicScope Scope,
291                                      SIAtomicAddrSpace AddrSpace) const = 0;
292 
293   /// Update \p MI memory store instruction to bypass any caches up to
294   /// the \p Scope memory scope for address spaces \p
295   /// AddrSpace. Return true iff the instruction was modified.
296   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
297                                       SIAtomicScope Scope,
298                                       SIAtomicAddrSpace AddrSpace) const = 0;
299 
300   /// Update \p MI memory read-modify-write instruction to bypass any caches up
301   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
302   /// iff the instruction was modified.
303   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
304                                     SIAtomicScope Scope,
305                                     SIAtomicAddrSpace AddrSpace) const = 0;
306 
307   /// Update \p MI memory instruction of kind \p Op associated with address
308   /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
309   /// true iff the instruction was modified.
310   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
311                                               SIAtomicAddrSpace AddrSpace,
312                                               SIMemOp Op, bool IsVolatile,
313                                               bool IsNonTemporal) const = 0;
314 
315   /// Inserts any necessary instructions at position \p Pos relative
316   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
317   /// \p Op associated with address spaces \p AddrSpace have completed. Used
318   /// between memory instructions to enforce the order they become visible as
319   /// observed by other memory instructions executing in memory scope \p Scope.
320   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
321   /// address spaces. Returns true iff any instructions inserted.
322   virtual bool insertWait(MachineBasicBlock::iterator &MI,
323                           SIAtomicScope Scope,
324                           SIAtomicAddrSpace AddrSpace,
325                           SIMemOp Op,
326                           bool IsCrossAddrSpaceOrdering,
327                           Position Pos) const = 0;
328 
329   /// Inserts any necessary instructions at position \p Pos relative to
330   /// instruction \p MI to ensure any subsequent memory instructions of this
331   /// thread with address spaces \p AddrSpace will observe the previous memory
332   /// operations by any thread for memory scopes up to memory scope \p Scope .
333   /// Returns true iff any instructions inserted.
334   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
335                              SIAtomicScope Scope,
336                              SIAtomicAddrSpace AddrSpace,
337                              Position Pos) const = 0;
338 
339   /// Inserts any necessary instructions at position \p Pos relative to
340   /// instruction \p MI to ensure previous memory instructions by this thread
341   /// with address spaces \p AddrSpace have completed and can be observed by
342   /// subsequent memory instructions by any thread executing in memory scope \p
343   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
344   /// between address spaces. Returns true iff any instructions inserted.
345   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
346                              SIAtomicScope Scope,
347                              SIAtomicAddrSpace AddrSpace,
348                              bool IsCrossAddrSpaceOrdering,
349                              Position Pos) const = 0;
350 
351   /// Virtual destructor to allow derivations to be deleted.
352   virtual ~SICacheControl() = default;
353 
354 };
355 
356 class SIGfx6CacheControl : public SICacheControl {
357 protected:
358 
359   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
360   /// is modified, false otherwise.
361   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
362     return enableNamedBit(MI, AMDGPU::CPol::GLC);
363   }
364 
365   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
366   /// is modified, false otherwise.
367   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
368     return enableNamedBit(MI, AMDGPU::CPol::SLC);
369   }
370 
371 public:
372 
373   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
374 
375   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
376                              SIAtomicScope Scope,
377                              SIAtomicAddrSpace AddrSpace) const override;
378 
379   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
380                               SIAtomicScope Scope,
381                               SIAtomicAddrSpace AddrSpace) const override;
382 
383   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
384                             SIAtomicScope Scope,
385                             SIAtomicAddrSpace AddrSpace) const override;
386 
387   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
388                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
389                                       bool IsVolatile,
390                                       bool IsNonTemporal) const override;
391 
392   bool insertWait(MachineBasicBlock::iterator &MI,
393                   SIAtomicScope Scope,
394                   SIAtomicAddrSpace AddrSpace,
395                   SIMemOp Op,
396                   bool IsCrossAddrSpaceOrdering,
397                   Position Pos) const override;
398 
399   bool insertAcquire(MachineBasicBlock::iterator &MI,
400                      SIAtomicScope Scope,
401                      SIAtomicAddrSpace AddrSpace,
402                      Position Pos) const override;
403 
404   bool insertRelease(MachineBasicBlock::iterator &MI,
405                      SIAtomicScope Scope,
406                      SIAtomicAddrSpace AddrSpace,
407                      bool IsCrossAddrSpaceOrdering,
408                      Position Pos) const override;
409 };
410 
411 class SIGfx7CacheControl : public SIGfx6CacheControl {
412 public:
413 
414   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
415 
416   bool insertAcquire(MachineBasicBlock::iterator &MI,
417                      SIAtomicScope Scope,
418                      SIAtomicAddrSpace AddrSpace,
419                      Position Pos) const override;
420 
421 };
422 
423 class SIGfx90ACacheControl : public SIGfx7CacheControl {
424 public:
425 
426   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
427 
428   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
429                              SIAtomicScope Scope,
430                              SIAtomicAddrSpace AddrSpace) const override;
431 
432   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
433                               SIAtomicScope Scope,
434                               SIAtomicAddrSpace AddrSpace) const override;
435 
436   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
437                             SIAtomicScope Scope,
438                             SIAtomicAddrSpace AddrSpace) const override;
439 
440   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
441                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
442                                       bool IsVolatile,
443                                       bool IsNonTemporal) const override;
444 
445   bool insertWait(MachineBasicBlock::iterator &MI,
446                   SIAtomicScope Scope,
447                   SIAtomicAddrSpace AddrSpace,
448                   SIMemOp Op,
449                   bool IsCrossAddrSpaceOrdering,
450                   Position Pos) const override;
451 
452   bool insertAcquire(MachineBasicBlock::iterator &MI,
453                      SIAtomicScope Scope,
454                      SIAtomicAddrSpace AddrSpace,
455                      Position Pos) const override;
456 
457   bool insertRelease(MachineBasicBlock::iterator &MI,
458                      SIAtomicScope Scope,
459                      SIAtomicAddrSpace AddrSpace,
460                      bool IsCrossAddrSpaceOrdering,
461                      Position Pos) const override;
462 };
463 
464 class SIGfx940CacheControl : public SIGfx90ACacheControl {
465 protected:
466 
467   /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
468   /// is modified, false otherwise.
469   bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
470     return enableNamedBit(MI, AMDGPU::CPol::SC0);
471   }
472 
473   /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
474   /// is modified, false otherwise.
475   bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
476     return enableNamedBit(MI, AMDGPU::CPol::SC1);
477   }
478 
479   /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
480   /// is modified, false otherwise.
481   bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
482     return enableNamedBit(MI, AMDGPU::CPol::NT);
483   }
484 
485 public:
486 
487   SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
488 
489   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
490                              SIAtomicScope Scope,
491                              SIAtomicAddrSpace AddrSpace) const override;
492 
493   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
494                               SIAtomicScope Scope,
495                               SIAtomicAddrSpace AddrSpace) const override;
496 
497   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
498                             SIAtomicScope Scope,
499                             SIAtomicAddrSpace AddrSpace) const override;
500 
501   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
502                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
503                                       bool IsVolatile,
504                                       bool IsNonTemporal) const override;
505 
506   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
507                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
508 
509   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
510                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
511                      Position Pos) const override;
512 };
513 
514 class SIGfx10CacheControl : public SIGfx7CacheControl {
515 protected:
516 
517   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
518   /// is modified, false otherwise.
519   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
520     return enableNamedBit(MI, AMDGPU::CPol::DLC);
521   }
522 
523 public:
524 
525   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
526 
527   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
528                              SIAtomicScope Scope,
529                              SIAtomicAddrSpace AddrSpace) const override;
530 
531   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
532                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
533                                       bool IsVolatile,
534                                       bool IsNonTemporal) const override;
535 
536   bool insertWait(MachineBasicBlock::iterator &MI,
537                   SIAtomicScope Scope,
538                   SIAtomicAddrSpace AddrSpace,
539                   SIMemOp Op,
540                   bool IsCrossAddrSpaceOrdering,
541                   Position Pos) const override;
542 
543   bool insertAcquire(MachineBasicBlock::iterator &MI,
544                      SIAtomicScope Scope,
545                      SIAtomicAddrSpace AddrSpace,
546                      Position Pos) const override;
547 };
548 
549 class SIGfx11CacheControl : public SIGfx10CacheControl {
550 public:
551   SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
552 
553   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
554                              SIAtomicScope Scope,
555                              SIAtomicAddrSpace AddrSpace) const override;
556 
557   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
558                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
559                                       bool IsVolatile,
560                                       bool IsNonTemporal) const override;
561 };
562 
563 class SIMemoryLegalizer final : public MachineFunctionPass {
564 private:
565 
566   /// Cache Control.
567   std::unique_ptr<SICacheControl> CC = nullptr;
568 
569   /// List of atomic pseudo instructions.
570   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
571 
572   /// Return true iff instruction \p MI is a atomic instruction that
573   /// returns a result.
574   bool isAtomicRet(const MachineInstr &MI) const {
575     return SIInstrInfo::isAtomicRet(MI);
576   }
577 
578   /// Removes all processed atomic pseudo instructions from the current
579   /// function. Returns true if current function is modified, false otherwise.
580   bool removeAtomicPseudoMIs();
581 
582   /// Expands load operation \p MI. Returns true if instructions are
583   /// added/deleted or \p MI is modified, false otherwise.
584   bool expandLoad(const SIMemOpInfo &MOI,
585                   MachineBasicBlock::iterator &MI);
586   /// Expands store operation \p MI. Returns true if instructions are
587   /// added/deleted or \p MI is modified, false otherwise.
588   bool expandStore(const SIMemOpInfo &MOI,
589                    MachineBasicBlock::iterator &MI);
590   /// Expands atomic fence operation \p MI. Returns true if
591   /// instructions are added/deleted or \p MI is modified, false otherwise.
592   bool expandAtomicFence(const SIMemOpInfo &MOI,
593                          MachineBasicBlock::iterator &MI);
594   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
595   /// instructions are added/deleted or \p MI is modified, false otherwise.
596   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
597                                 MachineBasicBlock::iterator &MI);
598 
599 public:
600   static char ID;
601 
602   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
603 
604   void getAnalysisUsage(AnalysisUsage &AU) const override {
605     AU.setPreservesCFG();
606     MachineFunctionPass::getAnalysisUsage(AU);
607   }
608 
609   StringRef getPassName() const override {
610     return PASS_NAME;
611   }
612 
613   bool runOnMachineFunction(MachineFunction &MF) override;
614 };
615 
616 } // end namespace anonymous
617 
618 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
619                                       const char *Msg) const {
620   const Function &Func = MI->getParent()->getParent()->getFunction();
621   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
622   Func.getContext().diagnose(Diag);
623 }
624 
625 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
626 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
627                                SIAtomicAddrSpace InstrAddrSpace) const {
628   if (SSID == SyncScope::System)
629     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
630   if (SSID == MMI->getAgentSSID())
631     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
632   if (SSID == MMI->getWorkgroupSSID())
633     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
634                       true);
635   if (SSID == MMI->getWavefrontSSID())
636     return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
637                       true);
638   if (SSID == SyncScope::SingleThread)
639     return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
640                       true);
641   if (SSID == MMI->getSystemOneAddressSpaceSSID())
642     return std::tuple(SIAtomicScope::SYSTEM,
643                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
644   if (SSID == MMI->getAgentOneAddressSpaceSSID())
645     return std::tuple(SIAtomicScope::AGENT,
646                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
647   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
648     return std::tuple(SIAtomicScope::WORKGROUP,
649                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
650   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
651     return std::tuple(SIAtomicScope::WAVEFRONT,
652                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
653   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
654     return std::tuple(SIAtomicScope::SINGLETHREAD,
655                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
656   return std::nullopt;
657 }
658 
659 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
660   if (AS == AMDGPUAS::FLAT_ADDRESS)
661     return SIAtomicAddrSpace::FLAT;
662   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
663     return SIAtomicAddrSpace::GLOBAL;
664   if (AS == AMDGPUAS::LOCAL_ADDRESS)
665     return SIAtomicAddrSpace::LDS;
666   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
667     return SIAtomicAddrSpace::SCRATCH;
668   if (AS == AMDGPUAS::REGION_ADDRESS)
669     return SIAtomicAddrSpace::GDS;
670 
671   return SIAtomicAddrSpace::OTHER;
672 }
673 
674 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
675   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
676 }
677 
678 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
679     const MachineBasicBlock::iterator &MI) const {
680   assert(MI->getNumMemOperands() > 0);
681 
682   SyncScope::ID SSID = SyncScope::SingleThread;
683   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
684   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
685   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
686   bool IsNonTemporal = true;
687   bool IsVolatile = false;
688 
689   // Validator should check whether or not MMOs cover the entire set of
690   // locations accessed by the memory instruction.
691   for (const auto &MMO : MI->memoperands()) {
692     IsNonTemporal &= MMO->isNonTemporal();
693     IsVolatile |= MMO->isVolatile();
694     InstrAddrSpace |=
695       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
696     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
697     if (OpOrdering != AtomicOrdering::NotAtomic) {
698       const auto &IsSyncScopeInclusion =
699           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
700       if (!IsSyncScopeInclusion) {
701         reportUnsupported(MI,
702           "Unsupported non-inclusive atomic synchronization scope");
703         return std::nullopt;
704       }
705 
706       SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
707       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
708       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
709              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
710       FailureOrdering =
711           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
712     }
713   }
714 
715   SIAtomicScope Scope = SIAtomicScope::NONE;
716   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
717   bool IsCrossAddressSpaceOrdering = false;
718   if (Ordering != AtomicOrdering::NotAtomic) {
719     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
720     if (!ScopeOrNone) {
721       reportUnsupported(MI, "Unsupported atomic synchronization scope");
722       return std::nullopt;
723     }
724     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
725         *ScopeOrNone;
726     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
727         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
728         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
729       reportUnsupported(MI, "Unsupported atomic address space");
730       return std::nullopt;
731     }
732   }
733   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
734                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
735                      IsNonTemporal);
736 }
737 
738 std::optional<SIMemOpInfo>
739 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
740   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
741 
742   if (!(MI->mayLoad() && !MI->mayStore()))
743     return std::nullopt;
744 
745   // Be conservative if there are no memory operands.
746   if (MI->getNumMemOperands() == 0)
747     return SIMemOpInfo();
748 
749   return constructFromMIWithMMO(MI);
750 }
751 
752 std::optional<SIMemOpInfo>
753 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
754   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
755 
756   if (!(!MI->mayLoad() && MI->mayStore()))
757     return std::nullopt;
758 
759   // Be conservative if there are no memory operands.
760   if (MI->getNumMemOperands() == 0)
761     return SIMemOpInfo();
762 
763   return constructFromMIWithMMO(MI);
764 }
765 
766 std::optional<SIMemOpInfo>
767 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
768   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
769 
770   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
771     return std::nullopt;
772 
773   AtomicOrdering Ordering =
774     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
775 
776   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
777   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
778   if (!ScopeOrNone) {
779     reportUnsupported(MI, "Unsupported atomic synchronization scope");
780     return std::nullopt;
781   }
782 
783   SIAtomicScope Scope = SIAtomicScope::NONE;
784   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
785   bool IsCrossAddressSpaceOrdering = false;
786   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
787       *ScopeOrNone;
788 
789   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
790       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
791     reportUnsupported(MI, "Unsupported atomic address space");
792     return std::nullopt;
793   }
794 
795   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
796                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
797 }
798 
799 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
800     const MachineBasicBlock::iterator &MI) const {
801   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
802 
803   if (!(MI->mayLoad() && MI->mayStore()))
804     return std::nullopt;
805 
806   // Be conservative if there are no memory operands.
807   if (MI->getNumMemOperands() == 0)
808     return SIMemOpInfo();
809 
810   return constructFromMIWithMMO(MI);
811 }
812 
813 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
814   TII = ST.getInstrInfo();
815   IV = getIsaVersion(ST.getCPU());
816   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
817 }
818 
819 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
820                                     AMDGPU::CPol::CPol Bit) const {
821   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
822   if (!CPol)
823     return false;
824 
825   CPol->setImm(CPol->getImm() | Bit);
826   return true;
827 }
828 
829 /* static */
830 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
831   GCNSubtarget::Generation Generation = ST.getGeneration();
832   if (ST.hasGFX940Insts())
833     return std::make_unique<SIGfx940CacheControl>(ST);
834   if (ST.hasGFX90AInsts())
835     return std::make_unique<SIGfx90ACacheControl>(ST);
836   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
837     return std::make_unique<SIGfx6CacheControl>(ST);
838   if (Generation < AMDGPUSubtarget::GFX10)
839     return std::make_unique<SIGfx7CacheControl>(ST);
840   if (Generation < AMDGPUSubtarget::GFX11)
841     return std::make_unique<SIGfx10CacheControl>(ST);
842   return std::make_unique<SIGfx11CacheControl>(ST);
843 }
844 
845 bool SIGfx6CacheControl::enableLoadCacheBypass(
846     const MachineBasicBlock::iterator &MI,
847     SIAtomicScope Scope,
848     SIAtomicAddrSpace AddrSpace) const {
849   assert(MI->mayLoad() && !MI->mayStore());
850   bool Changed = false;
851 
852   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
853     switch (Scope) {
854     case SIAtomicScope::SYSTEM:
855     case SIAtomicScope::AGENT:
856       // Set L1 cache policy to MISS_EVICT.
857       // Note: there is no L2 cache bypass policy at the ISA level.
858       Changed |= enableGLCBit(MI);
859       break;
860     case SIAtomicScope::WORKGROUP:
861     case SIAtomicScope::WAVEFRONT:
862     case SIAtomicScope::SINGLETHREAD:
863       // No cache to bypass.
864       break;
865     default:
866       llvm_unreachable("Unsupported synchronization scope");
867     }
868   }
869 
870   /// The scratch address space does not need the global memory caches
871   /// to be bypassed as all memory operations by the same thread are
872   /// sequentially consistent, and no other thread can access scratch
873   /// memory.
874 
875   /// Other address spaces do not have a cache.
876 
877   return Changed;
878 }
879 
880 bool SIGfx6CacheControl::enableStoreCacheBypass(
881     const MachineBasicBlock::iterator &MI,
882     SIAtomicScope Scope,
883     SIAtomicAddrSpace AddrSpace) const {
884   assert(!MI->mayLoad() && MI->mayStore());
885   bool Changed = false;
886 
887   /// The L1 cache is write through so does not need to be bypassed. There is no
888   /// bypass control for the L2 cache at the isa level.
889 
890   return Changed;
891 }
892 
893 bool SIGfx6CacheControl::enableRMWCacheBypass(
894     const MachineBasicBlock::iterator &MI,
895     SIAtomicScope Scope,
896     SIAtomicAddrSpace AddrSpace) const {
897   assert(MI->mayLoad() && MI->mayStore());
898   bool Changed = false;
899 
900   /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
901   /// bypassed, and the GLC bit is instead used to indicate if they are
902   /// return or no-return.
903   /// Note: there is no L2 cache coherent bypass control at the ISA level.
904 
905   return Changed;
906 }
907 
908 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
909     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
910     bool IsVolatile, bool IsNonTemporal) const {
911   // Only handle load and store, not atomic read-modify-write insructions. The
912   // latter use glc to indicate if the atomic returns a result and so must not
913   // be used for cache control.
914   assert(MI->mayLoad() ^ MI->mayStore());
915 
916   // Only update load and store, not LLVM IR atomic read-modify-write
917   // instructions. The latter are always marked as volatile so cannot sensibly
918   // handle it as do not want to pessimize all atomics. Also they do not support
919   // the nontemporal attribute.
920   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
921 
922   bool Changed = false;
923 
924   if (IsVolatile) {
925     // Set L1 cache policy to be MISS_EVICT for load instructions
926     // and MISS_LRU for store instructions.
927     // Note: there is no L2 cache bypass policy at the ISA level.
928     if (Op == SIMemOp::LOAD)
929       Changed |= enableGLCBit(MI);
930 
931     // Ensure operation has completed at system scope to cause all volatile
932     // operations to be visible outside the program in a global order. Do not
933     // request cross address space as only the global address space can be
934     // observable outside the program, so no need to cause a waitcnt for LDS
935     // address space operations.
936     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
937                           Position::AFTER);
938 
939     return Changed;
940   }
941 
942   if (IsNonTemporal) {
943     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
944     // for both loads and stores, and the L2 cache policy to STREAM.
945     Changed |= enableGLCBit(MI);
946     Changed |= enableSLCBit(MI);
947     return Changed;
948   }
949 
950   return Changed;
951 }
952 
953 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
954                                     SIAtomicScope Scope,
955                                     SIAtomicAddrSpace AddrSpace,
956                                     SIMemOp Op,
957                                     bool IsCrossAddrSpaceOrdering,
958                                     Position Pos) const {
959   bool Changed = false;
960 
961   MachineBasicBlock &MBB = *MI->getParent();
962   DebugLoc DL = MI->getDebugLoc();
963 
964   if (Pos == Position::AFTER)
965     ++MI;
966 
967   bool VMCnt = false;
968   bool LGKMCnt = false;
969 
970   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
971       SIAtomicAddrSpace::NONE) {
972     switch (Scope) {
973     case SIAtomicScope::SYSTEM:
974     case SIAtomicScope::AGENT:
975       VMCnt |= true;
976       break;
977     case SIAtomicScope::WORKGROUP:
978     case SIAtomicScope::WAVEFRONT:
979     case SIAtomicScope::SINGLETHREAD:
980       // The L1 cache keeps all memory operations in order for
981       // wavefronts in the same work-group.
982       break;
983     default:
984       llvm_unreachable("Unsupported synchronization scope");
985     }
986   }
987 
988   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
989     switch (Scope) {
990     case SIAtomicScope::SYSTEM:
991     case SIAtomicScope::AGENT:
992     case SIAtomicScope::WORKGROUP:
993       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
994       // not needed as LDS operations for all waves are executed in a total
995       // global ordering as observed by all waves. Required if also
996       // synchronizing with global/GDS memory as LDS operations could be
997       // reordered with respect to later global/GDS memory operations of the
998       // same wave.
999       LGKMCnt |= IsCrossAddrSpaceOrdering;
1000       break;
1001     case SIAtomicScope::WAVEFRONT:
1002     case SIAtomicScope::SINGLETHREAD:
1003       // The LDS keeps all memory operations in order for
1004       // the same wavefront.
1005       break;
1006     default:
1007       llvm_unreachable("Unsupported synchronization scope");
1008     }
1009   }
1010 
1011   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1012     switch (Scope) {
1013     case SIAtomicScope::SYSTEM:
1014     case SIAtomicScope::AGENT:
1015       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1016       // is not needed as GDS operations for all waves are executed in a total
1017       // global ordering as observed by all waves. Required if also
1018       // synchronizing with global/LDS memory as GDS operations could be
1019       // reordered with respect to later global/LDS memory operations of the
1020       // same wave.
1021       LGKMCnt |= IsCrossAddrSpaceOrdering;
1022       break;
1023     case SIAtomicScope::WORKGROUP:
1024     case SIAtomicScope::WAVEFRONT:
1025     case SIAtomicScope::SINGLETHREAD:
1026       // The GDS keeps all memory operations in order for
1027       // the same work-group.
1028       break;
1029     default:
1030       llvm_unreachable("Unsupported synchronization scope");
1031     }
1032   }
1033 
1034   if (VMCnt || LGKMCnt) {
1035     unsigned WaitCntImmediate =
1036       AMDGPU::encodeWaitcnt(IV,
1037                             VMCnt ? 0 : getVmcntBitMask(IV),
1038                             getExpcntBitMask(IV),
1039                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1040     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1041     Changed = true;
1042   }
1043 
1044   if (Pos == Position::AFTER)
1045     --MI;
1046 
1047   return Changed;
1048 }
1049 
1050 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1051                                        SIAtomicScope Scope,
1052                                        SIAtomicAddrSpace AddrSpace,
1053                                        Position Pos) const {
1054   if (!InsertCacheInv)
1055     return false;
1056 
1057   bool Changed = false;
1058 
1059   MachineBasicBlock &MBB = *MI->getParent();
1060   DebugLoc DL = MI->getDebugLoc();
1061 
1062   if (Pos == Position::AFTER)
1063     ++MI;
1064 
1065   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1066     switch (Scope) {
1067     case SIAtomicScope::SYSTEM:
1068     case SIAtomicScope::AGENT:
1069       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1070       Changed = true;
1071       break;
1072     case SIAtomicScope::WORKGROUP:
1073     case SIAtomicScope::WAVEFRONT:
1074     case SIAtomicScope::SINGLETHREAD:
1075       // No cache to invalidate.
1076       break;
1077     default:
1078       llvm_unreachable("Unsupported synchronization scope");
1079     }
1080   }
1081 
1082   /// The scratch address space does not need the global memory cache
1083   /// to be flushed as all memory operations by the same thread are
1084   /// sequentially consistent, and no other thread can access scratch
1085   /// memory.
1086 
1087   /// Other address spaces do not have a cache.
1088 
1089   if (Pos == Position::AFTER)
1090     --MI;
1091 
1092   return Changed;
1093 }
1094 
1095 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1096                                        SIAtomicScope Scope,
1097                                        SIAtomicAddrSpace AddrSpace,
1098                                        bool IsCrossAddrSpaceOrdering,
1099                                        Position Pos) const {
1100   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1101                     IsCrossAddrSpaceOrdering, Pos);
1102 }
1103 
1104 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1105                                        SIAtomicScope Scope,
1106                                        SIAtomicAddrSpace AddrSpace,
1107                                        Position Pos) const {
1108   if (!InsertCacheInv)
1109     return false;
1110 
1111   bool Changed = false;
1112 
1113   MachineBasicBlock &MBB = *MI->getParent();
1114   DebugLoc DL = MI->getDebugLoc();
1115 
1116   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1117 
1118   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1119                                     ? AMDGPU::BUFFER_WBINVL1
1120                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1121 
1122   if (Pos == Position::AFTER)
1123     ++MI;
1124 
1125   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1126     switch (Scope) {
1127     case SIAtomicScope::SYSTEM:
1128     case SIAtomicScope::AGENT:
1129       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1130       Changed = true;
1131       break;
1132     case SIAtomicScope::WORKGROUP:
1133     case SIAtomicScope::WAVEFRONT:
1134     case SIAtomicScope::SINGLETHREAD:
1135       // No cache to invalidate.
1136       break;
1137     default:
1138       llvm_unreachable("Unsupported synchronization scope");
1139     }
1140   }
1141 
1142   /// The scratch address space does not need the global memory cache
1143   /// to be flushed as all memory operations by the same thread are
1144   /// sequentially consistent, and no other thread can access scratch
1145   /// memory.
1146 
1147   /// Other address spaces do not have a cache.
1148 
1149   if (Pos == Position::AFTER)
1150     --MI;
1151 
1152   return Changed;
1153 }
1154 
1155 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1156     const MachineBasicBlock::iterator &MI,
1157     SIAtomicScope Scope,
1158     SIAtomicAddrSpace AddrSpace) const {
1159   assert(MI->mayLoad() && !MI->mayStore());
1160   bool Changed = false;
1161 
1162   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1163     switch (Scope) {
1164     case SIAtomicScope::SYSTEM:
1165     case SIAtomicScope::AGENT:
1166       // Set the L1 cache policy to MISS_LRU.
1167       // Note: there is no L2 cache bypass policy at the ISA level.
1168       Changed |= enableGLCBit(MI);
1169       break;
1170     case SIAtomicScope::WORKGROUP:
1171       // In threadgroup split mode the waves of a work-group can be executing on
1172       // different CUs. Therefore need to bypass the L1 which is per CU.
1173       // Otherwise in non-threadgroup split mode all waves of a work-group are
1174       // on the same CU, and so the L1 does not need to be bypassed.
1175       if (ST.isTgSplitEnabled())
1176         Changed |= enableGLCBit(MI);
1177       break;
1178     case SIAtomicScope::WAVEFRONT:
1179     case SIAtomicScope::SINGLETHREAD:
1180       // No cache to bypass.
1181       break;
1182     default:
1183       llvm_unreachable("Unsupported synchronization scope");
1184     }
1185   }
1186 
1187   /// The scratch address space does not need the global memory caches
1188   /// to be bypassed as all memory operations by the same thread are
1189   /// sequentially consistent, and no other thread can access scratch
1190   /// memory.
1191 
1192   /// Other address spaces do not have a cache.
1193 
1194   return Changed;
1195 }
1196 
1197 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1198     const MachineBasicBlock::iterator &MI,
1199     SIAtomicScope Scope,
1200     SIAtomicAddrSpace AddrSpace) const {
1201   assert(!MI->mayLoad() && MI->mayStore());
1202   bool Changed = false;
1203 
1204   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1205     switch (Scope) {
1206     case SIAtomicScope::SYSTEM:
1207     case SIAtomicScope::AGENT:
1208       /// Do not set glc for store atomic operations as they implicitly write
1209       /// through the L1 cache.
1210       break;
1211     case SIAtomicScope::WORKGROUP:
1212     case SIAtomicScope::WAVEFRONT:
1213     case SIAtomicScope::SINGLETHREAD:
1214       // No cache to bypass. Store atomics implicitly write through the L1
1215       // cache.
1216       break;
1217     default:
1218       llvm_unreachable("Unsupported synchronization scope");
1219     }
1220   }
1221 
1222   /// The scratch address space does not need the global memory caches
1223   /// to be bypassed as all memory operations by the same thread are
1224   /// sequentially consistent, and no other thread can access scratch
1225   /// memory.
1226 
1227   /// Other address spaces do not have a cache.
1228 
1229   return Changed;
1230 }
1231 
1232 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1233     const MachineBasicBlock::iterator &MI,
1234     SIAtomicScope Scope,
1235     SIAtomicAddrSpace AddrSpace) const {
1236   assert(MI->mayLoad() && MI->mayStore());
1237   bool Changed = false;
1238 
1239   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1240     switch (Scope) {
1241     case SIAtomicScope::SYSTEM:
1242     case SIAtomicScope::AGENT:
1243       /// Do not set glc for RMW atomic operations as they implicitly bypass
1244       /// the L1 cache, and the glc bit is instead used to indicate if they are
1245       /// return or no-return.
1246       break;
1247     case SIAtomicScope::WORKGROUP:
1248     case SIAtomicScope::WAVEFRONT:
1249     case SIAtomicScope::SINGLETHREAD:
1250       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1251       break;
1252     default:
1253       llvm_unreachable("Unsupported synchronization scope");
1254     }
1255   }
1256 
1257   return Changed;
1258 }
1259 
1260 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1261     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1262     bool IsVolatile, bool IsNonTemporal) const {
1263   // Only handle load and store, not atomic read-modify-write insructions. The
1264   // latter use glc to indicate if the atomic returns a result and so must not
1265   // be used for cache control.
1266   assert(MI->mayLoad() ^ MI->mayStore());
1267 
1268   // Only update load and store, not LLVM IR atomic read-modify-write
1269   // instructions. The latter are always marked as volatile so cannot sensibly
1270   // handle it as do not want to pessimize all atomics. Also they do not support
1271   // the nontemporal attribute.
1272   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1273 
1274   bool Changed = false;
1275 
1276   if (IsVolatile) {
1277     // Set L1 cache policy to be MISS_EVICT for load instructions
1278     // and MISS_LRU for store instructions.
1279     // Note: there is no L2 cache bypass policy at the ISA level.
1280     if (Op == SIMemOp::LOAD)
1281       Changed |= enableGLCBit(MI);
1282 
1283     // Ensure operation has completed at system scope to cause all volatile
1284     // operations to be visible outside the program in a global order. Do not
1285     // request cross address space as only the global address space can be
1286     // observable outside the program, so no need to cause a waitcnt for LDS
1287     // address space operations.
1288     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1289                           Position::AFTER);
1290 
1291     return Changed;
1292   }
1293 
1294   if (IsNonTemporal) {
1295     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1296     // for both loads and stores, and the L2 cache policy to STREAM.
1297     Changed |= enableGLCBit(MI);
1298     Changed |= enableSLCBit(MI);
1299     return Changed;
1300   }
1301 
1302   return Changed;
1303 }
1304 
1305 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1306                                       SIAtomicScope Scope,
1307                                       SIAtomicAddrSpace AddrSpace,
1308                                       SIMemOp Op,
1309                                       bool IsCrossAddrSpaceOrdering,
1310                                       Position Pos) const {
1311   if (ST.isTgSplitEnabled()) {
1312     // In threadgroup split mode the waves of a work-group can be executing on
1313     // different CUs. Therefore need to wait for global or GDS memory operations
1314     // to complete to ensure they are visible to waves in the other CUs.
1315     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1316     // the same CU, so no need to wait for global memory as all waves in the
1317     // work-group access the same the L1, nor wait for GDS as access are ordered
1318     // on a CU.
1319     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1320                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1321         (Scope == SIAtomicScope::WORKGROUP)) {
1322       // Same as GFX7 using agent scope.
1323       Scope = SIAtomicScope::AGENT;
1324     }
1325     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1326     // LDS memory operations.
1327     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1328   }
1329   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1330                                         IsCrossAddrSpaceOrdering, Pos);
1331 }
1332 
1333 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1334                                          SIAtomicScope Scope,
1335                                          SIAtomicAddrSpace AddrSpace,
1336                                          Position Pos) const {
1337   if (!InsertCacheInv)
1338     return false;
1339 
1340   bool Changed = false;
1341 
1342   MachineBasicBlock &MBB = *MI->getParent();
1343   DebugLoc DL = MI->getDebugLoc();
1344 
1345   if (Pos == Position::AFTER)
1346     ++MI;
1347 
1348   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1349     switch (Scope) {
1350     case SIAtomicScope::SYSTEM:
1351       // Ensures that following loads will not see stale remote VMEM data or
1352       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1353       // CC will never be stale due to the local memory probes.
1354       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1355       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1356       // hardware does not reorder memory operations by the same wave with
1357       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1358       // remove any cache lines of earlier writes by the same wave and ensures
1359       // later reads by the same wave will refetch the cache lines.
1360       Changed = true;
1361       break;
1362     case SIAtomicScope::AGENT:
1363       // Same as GFX7.
1364       break;
1365     case SIAtomicScope::WORKGROUP:
1366       // In threadgroup split mode the waves of a work-group can be executing on
1367       // different CUs. Therefore need to invalidate the L1 which is per CU.
1368       // Otherwise in non-threadgroup split mode all waves of a work-group are
1369       // on the same CU, and so the L1 does not need to be invalidated.
1370       if (ST.isTgSplitEnabled()) {
1371         // Same as GFX7 using agent scope.
1372         Scope = SIAtomicScope::AGENT;
1373       }
1374       break;
1375     case SIAtomicScope::WAVEFRONT:
1376     case SIAtomicScope::SINGLETHREAD:
1377       // Same as GFX7.
1378       break;
1379     default:
1380       llvm_unreachable("Unsupported synchronization scope");
1381     }
1382   }
1383 
1384   /// The scratch address space does not need the global memory cache
1385   /// to be flushed as all memory operations by the same thread are
1386   /// sequentially consistent, and no other thread can access scratch
1387   /// memory.
1388 
1389   /// Other address spaces do not have a cache.
1390 
1391   if (Pos == Position::AFTER)
1392     --MI;
1393 
1394   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1395 
1396   return Changed;
1397 }
1398 
1399 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1400                                          SIAtomicScope Scope,
1401                                          SIAtomicAddrSpace AddrSpace,
1402                                          bool IsCrossAddrSpaceOrdering,
1403                                          Position Pos) const {
1404   bool Changed = false;
1405 
1406   MachineBasicBlock &MBB = *MI->getParent();
1407   DebugLoc DL = MI->getDebugLoc();
1408 
1409   if (Pos == Position::AFTER)
1410     ++MI;
1411 
1412   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1413     switch (Scope) {
1414     case SIAtomicScope::SYSTEM:
1415       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1416       // hardware does not reorder memory operations by the same wave with
1417       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1418       // to initiate writeback of any dirty cache lines of earlier writes by the
1419       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1420       // writeback has completed.
1421       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1422         // Set SC bits to indicate system scope.
1423         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1424       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1425       // vmcnt(0)" needed by the "BUFFER_WBL2".
1426       Changed = true;
1427       break;
1428     case SIAtomicScope::AGENT:
1429     case SIAtomicScope::WORKGROUP:
1430     case SIAtomicScope::WAVEFRONT:
1431     case SIAtomicScope::SINGLETHREAD:
1432       // Same as GFX7.
1433       break;
1434     default:
1435       llvm_unreachable("Unsupported synchronization scope");
1436     }
1437   }
1438 
1439   if (Pos == Position::AFTER)
1440     --MI;
1441 
1442   Changed |=
1443       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1444                                         IsCrossAddrSpaceOrdering, Pos);
1445 
1446   return Changed;
1447 }
1448 
1449 bool SIGfx940CacheControl::enableLoadCacheBypass(
1450     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1451     SIAtomicAddrSpace AddrSpace) const {
1452   assert(MI->mayLoad() && !MI->mayStore());
1453   bool Changed = false;
1454 
1455   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1456     switch (Scope) {
1457     case SIAtomicScope::SYSTEM:
1458       // Set SC bits to indicate system scope.
1459       Changed |= enableSC0Bit(MI);
1460       Changed |= enableSC1Bit(MI);
1461       break;
1462     case SIAtomicScope::AGENT:
1463       // Set SC bits to indicate agent scope.
1464       Changed |= enableSC1Bit(MI);
1465       break;
1466     case SIAtomicScope::WORKGROUP:
1467       // In threadgroup split mode the waves of a work-group can be executing on
1468       // different CUs. Therefore need to bypass the L1 which is per CU.
1469       // Otherwise in non-threadgroup split mode all waves of a work-group are
1470       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1471       // bits to indicate work-group scope will do this automatically.
1472       Changed |= enableSC0Bit(MI);
1473       break;
1474     case SIAtomicScope::WAVEFRONT:
1475     case SIAtomicScope::SINGLETHREAD:
1476       // Leave SC bits unset to indicate wavefront scope.
1477       break;
1478     default:
1479       llvm_unreachable("Unsupported synchronization scope");
1480     }
1481   }
1482 
1483   /// The scratch address space does not need the global memory caches
1484   /// to be bypassed as all memory operations by the same thread are
1485   /// sequentially consistent, and no other thread can access scratch
1486   /// memory.
1487 
1488   /// Other address spaces do not have a cache.
1489 
1490   return Changed;
1491 }
1492 
1493 bool SIGfx940CacheControl::enableStoreCacheBypass(
1494     const MachineBasicBlock::iterator &MI,
1495     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1496   assert(!MI->mayLoad() && MI->mayStore());
1497   bool Changed = false;
1498 
1499   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1500     switch (Scope) {
1501     case SIAtomicScope::SYSTEM:
1502       // Set SC bits to indicate system scope.
1503       Changed |= enableSC0Bit(MI);
1504       Changed |= enableSC1Bit(MI);
1505       break;
1506     case SIAtomicScope::AGENT:
1507       // Set SC bits to indicate agent scope.
1508       Changed |= enableSC1Bit(MI);
1509       break;
1510     case SIAtomicScope::WORKGROUP:
1511       // Set SC bits to indicate workgroup scope.
1512       Changed |= enableSC0Bit(MI);
1513       break;
1514     case SIAtomicScope::WAVEFRONT:
1515     case SIAtomicScope::SINGLETHREAD:
1516       // Leave SC bits unset to indicate wavefront scope.
1517       break;
1518     default:
1519       llvm_unreachable("Unsupported synchronization scope");
1520     }
1521   }
1522 
1523   /// The scratch address space does not need the global memory caches
1524   /// to be bypassed as all memory operations by the same thread are
1525   /// sequentially consistent, and no other thread can access scratch
1526   /// memory.
1527 
1528   /// Other address spaces do not have a cache.
1529 
1530   return Changed;
1531 }
1532 
1533 bool SIGfx940CacheControl::enableRMWCacheBypass(
1534     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1535     SIAtomicAddrSpace AddrSpace) const {
1536   assert(MI->mayLoad() && MI->mayStore());
1537   bool Changed = false;
1538 
1539   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1540     switch (Scope) {
1541     case SIAtomicScope::SYSTEM:
1542       // Set SC1 bit to indicate system scope.
1543       Changed |= enableSC1Bit(MI);
1544       break;
1545     case SIAtomicScope::AGENT:
1546     case SIAtomicScope::WORKGROUP:
1547     case SIAtomicScope::WAVEFRONT:
1548     case SIAtomicScope::SINGLETHREAD:
1549       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1550       // to indicate system or agent scope. The SC0 bit is used to indicate if
1551       // they are return or no-return. Leave SC1 bit unset to indicate agent
1552       // scope.
1553       break;
1554     default:
1555       llvm_unreachable("Unsupported synchronization scope");
1556     }
1557   }
1558 
1559   return Changed;
1560 }
1561 
1562 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1563     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1564     bool IsVolatile, bool IsNonTemporal) const {
1565   // Only handle load and store, not atomic read-modify-write insructions. The
1566   // latter use glc to indicate if the atomic returns a result and so must not
1567   // be used for cache control.
1568   assert(MI->mayLoad() ^ MI->mayStore());
1569 
1570   // Only update load and store, not LLVM IR atomic read-modify-write
1571   // instructions. The latter are always marked as volatile so cannot sensibly
1572   // handle it as do not want to pessimize all atomics. Also they do not support
1573   // the nontemporal attribute.
1574   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1575 
1576   bool Changed = false;
1577 
1578   if (IsVolatile) {
1579     // Set SC bits to indicate system scope.
1580     Changed |= enableSC0Bit(MI);
1581     Changed |= enableSC1Bit(MI);
1582 
1583     // Ensure operation has completed at system scope to cause all volatile
1584     // operations to be visible outside the program in a global order. Do not
1585     // request cross address space as only the global address space can be
1586     // observable outside the program, so no need to cause a waitcnt for LDS
1587     // address space operations.
1588     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1589                           Position::AFTER);
1590 
1591     return Changed;
1592   }
1593 
1594   if (IsNonTemporal) {
1595     Changed |= enableNTBit(MI);
1596     return Changed;
1597   }
1598 
1599   return Changed;
1600 }
1601 
1602 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1603                                          SIAtomicScope Scope,
1604                                          SIAtomicAddrSpace AddrSpace,
1605                                          Position Pos) const {
1606   if (!InsertCacheInv)
1607     return false;
1608 
1609   bool Changed = false;
1610 
1611   MachineBasicBlock &MBB = *MI->getParent();
1612   DebugLoc DL = MI->getDebugLoc();
1613 
1614   if (Pos == Position::AFTER)
1615     ++MI;
1616 
1617   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1618     switch (Scope) {
1619     case SIAtomicScope::SYSTEM:
1620       // Ensures that following loads will not see stale remote VMEM data or
1621       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1622       // CC will never be stale due to the local memory probes.
1623       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1624           // Set SC bits to indicate system scope.
1625           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1626       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1627       // hardware does not reorder memory operations by the same wave with
1628       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1629       // remove any cache lines of earlier writes by the same wave and ensures
1630       // later reads by the same wave will refetch the cache lines.
1631       Changed = true;
1632       break;
1633     case SIAtomicScope::AGENT:
1634       // Ensures that following loads will not see stale remote date or local
1635       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1636       // due to the memory probes.
1637       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1638           // Set SC bits to indicate agent scope.
1639           .addImm(AMDGPU::CPol::SC1);
1640       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1641       // does not reorder memory operations with respect to preceeding buffer
1642       // invalidate. The invalidate is guaranteed to remove any cache lines of
1643       // earlier writes and ensures later writes will refetch the cache lines.
1644       Changed = true;
1645       break;
1646     case SIAtomicScope::WORKGROUP:
1647       // In threadgroup split mode the waves of a work-group can be executing on
1648       // different CUs. Therefore need to invalidate the L1 which is per CU.
1649       // Otherwise in non-threadgroup split mode all waves of a work-group are
1650       // on the same CU, and so the L1 does not need to be invalidated.
1651       if (ST.isTgSplitEnabled()) {
1652         // Ensures L1 is invalidated if in threadgroup split mode. In
1653         // non-threadgroup split mode it is a NOP, but no point generating it in
1654         // that case if know not in that mode.
1655         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1656             // Set SC bits to indicate work-group scope.
1657             .addImm(AMDGPU::CPol::SC0);
1658         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1659         // does not reorder memory operations with respect to preceeding buffer
1660         // invalidate. The invalidate is guaranteed to remove any cache lines of
1661         // earlier writes and ensures later writes will refetch the cache lines.
1662         Changed = true;
1663       }
1664       break;
1665     case SIAtomicScope::WAVEFRONT:
1666     case SIAtomicScope::SINGLETHREAD:
1667       // Could generate "BUFFER_INV" but it would do nothing as there are no
1668       // caches to invalidate.
1669       break;
1670     default:
1671       llvm_unreachable("Unsupported synchronization scope");
1672     }
1673   }
1674 
1675   /// The scratch address space does not need the global memory cache
1676   /// to be flushed as all memory operations by the same thread are
1677   /// sequentially consistent, and no other thread can access scratch
1678   /// memory.
1679 
1680   /// Other address spaces do not have a cache.
1681 
1682   if (Pos == Position::AFTER)
1683     --MI;
1684 
1685   return Changed;
1686 }
1687 
1688 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1689                                          SIAtomicScope Scope,
1690                                          SIAtomicAddrSpace AddrSpace,
1691                                          bool IsCrossAddrSpaceOrdering,
1692                                          Position Pos) const {
1693   bool Changed = false;
1694 
1695   MachineBasicBlock &MBB = *MI->getParent();
1696   DebugLoc DL = MI->getDebugLoc();
1697 
1698   if (Pos == Position::AFTER)
1699     ++MI;
1700 
1701   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1702     switch (Scope) {
1703     case SIAtomicScope::SYSTEM:
1704       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1705       // hardware does not reorder memory operations by the same wave with
1706       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1707       // to initiate writeback of any dirty cache lines of earlier writes by the
1708       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1709       // writeback has completed.
1710       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1711           // Set SC bits to indicate system scope.
1712           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1713       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1714       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1715       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1716       Changed = true;
1717       break;
1718     case SIAtomicScope::AGENT:
1719       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1720           // Set SC bits to indicate agent scope.
1721           .addImm(AMDGPU::CPol::SC1);
1722 
1723       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1724       // SIAtomicScope::AGENT, the following insertWait will generate the
1725       // required "S_WAITCNT vmcnt(0)".
1726       Changed = true;
1727       break;
1728     case SIAtomicScope::WORKGROUP:
1729     case SIAtomicScope::WAVEFRONT:
1730     case SIAtomicScope::SINGLETHREAD:
1731       // Do not generate "BUFFER_WBL2" as there are no caches it would
1732       // writeback, and would require an otherwise unnecessary
1733       // "S_WAITCNT vmcnt(0)".
1734       break;
1735     default:
1736       llvm_unreachable("Unsupported synchronization scope");
1737     }
1738   }
1739 
1740   if (Pos == Position::AFTER)
1741     --MI;
1742 
1743   // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1744   // S_WAITCNT needed.
1745   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1746                         IsCrossAddrSpaceOrdering, Pos);
1747 
1748   return Changed;
1749 }
1750 
1751 bool SIGfx10CacheControl::enableLoadCacheBypass(
1752     const MachineBasicBlock::iterator &MI,
1753     SIAtomicScope Scope,
1754     SIAtomicAddrSpace AddrSpace) const {
1755   assert(MI->mayLoad() && !MI->mayStore());
1756   bool Changed = false;
1757 
1758   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1759     switch (Scope) {
1760     case SIAtomicScope::SYSTEM:
1761     case SIAtomicScope::AGENT:
1762       // Set the L0 and L1 cache policies to MISS_EVICT.
1763       // Note: there is no L2 cache coherent bypass control at the ISA level.
1764       Changed |= enableGLCBit(MI);
1765       Changed |= enableDLCBit(MI);
1766       break;
1767     case SIAtomicScope::WORKGROUP:
1768       // In WGP mode the waves of a work-group can be executing on either CU of
1769       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1770       // CU mode all waves of a work-group are on the same CU, and so the L0
1771       // does not need to be bypassed.
1772       if (!ST.isCuModeEnabled())
1773         Changed |= enableGLCBit(MI);
1774       break;
1775     case SIAtomicScope::WAVEFRONT:
1776     case SIAtomicScope::SINGLETHREAD:
1777       // No cache to bypass.
1778       break;
1779     default:
1780       llvm_unreachable("Unsupported synchronization scope");
1781     }
1782   }
1783 
1784   /// The scratch address space does not need the global memory caches
1785   /// to be bypassed as all memory operations by the same thread are
1786   /// sequentially consistent, and no other thread can access scratch
1787   /// memory.
1788 
1789   /// Other address spaces do not have a cache.
1790 
1791   return Changed;
1792 }
1793 
1794 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1795     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1796     bool IsVolatile, bool IsNonTemporal) const {
1797 
1798   // Only handle load and store, not atomic read-modify-write insructions. The
1799   // latter use glc to indicate if the atomic returns a result and so must not
1800   // be used for cache control.
1801   assert(MI->mayLoad() ^ MI->mayStore());
1802 
1803   // Only update load and store, not LLVM IR atomic read-modify-write
1804   // instructions. The latter are always marked as volatile so cannot sensibly
1805   // handle it as do not want to pessimize all atomics. Also they do not support
1806   // the nontemporal attribute.
1807   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1808 
1809   bool Changed = false;
1810 
1811   if (IsVolatile) {
1812     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1813     // and MISS_LRU for store instructions.
1814     // Note: there is no L2 cache coherent bypass control at the ISA level.
1815     if (Op == SIMemOp::LOAD) {
1816       Changed |= enableGLCBit(MI);
1817       Changed |= enableDLCBit(MI);
1818     }
1819 
1820     // Ensure operation has completed at system scope to cause all volatile
1821     // operations to be visible outside the program in a global order. Do not
1822     // request cross address space as only the global address space can be
1823     // observable outside the program, so no need to cause a waitcnt for LDS
1824     // address space operations.
1825     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1826                           Position::AFTER);
1827     return Changed;
1828   }
1829 
1830   if (IsNonTemporal) {
1831     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1832     // and L2 cache policy to STREAM.
1833     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1834     // to MISS_EVICT and the L2 cache policy to STREAM.
1835     if (Op == SIMemOp::STORE)
1836       Changed |= enableGLCBit(MI);
1837     Changed |= enableSLCBit(MI);
1838 
1839     return Changed;
1840   }
1841 
1842   return Changed;
1843 }
1844 
1845 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1846                                      SIAtomicScope Scope,
1847                                      SIAtomicAddrSpace AddrSpace,
1848                                      SIMemOp Op,
1849                                      bool IsCrossAddrSpaceOrdering,
1850                                      Position Pos) const {
1851   bool Changed = false;
1852 
1853   MachineBasicBlock &MBB = *MI->getParent();
1854   DebugLoc DL = MI->getDebugLoc();
1855 
1856   if (Pos == Position::AFTER)
1857     ++MI;
1858 
1859   bool VMCnt = false;
1860   bool VSCnt = false;
1861   bool LGKMCnt = false;
1862 
1863   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1864       SIAtomicAddrSpace::NONE) {
1865     switch (Scope) {
1866     case SIAtomicScope::SYSTEM:
1867     case SIAtomicScope::AGENT:
1868       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1869         VMCnt |= true;
1870       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1871         VSCnt |= true;
1872       break;
1873     case SIAtomicScope::WORKGROUP:
1874       // In WGP mode the waves of a work-group can be executing on either CU of
1875       // the WGP. Therefore need to wait for operations to complete to ensure
1876       // they are visible to waves in the other CU as the L0 is per CU.
1877       // Otherwise in CU mode and all waves of a work-group are on the same CU
1878       // which shares the same L0.
1879       if (!ST.isCuModeEnabled()) {
1880         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1881           VMCnt |= true;
1882         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1883           VSCnt |= true;
1884       }
1885       break;
1886     case SIAtomicScope::WAVEFRONT:
1887     case SIAtomicScope::SINGLETHREAD:
1888       // The L0 cache keeps all memory operations in order for
1889       // work-items in the same wavefront.
1890       break;
1891     default:
1892       llvm_unreachable("Unsupported synchronization scope");
1893     }
1894   }
1895 
1896   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1897     switch (Scope) {
1898     case SIAtomicScope::SYSTEM:
1899     case SIAtomicScope::AGENT:
1900     case SIAtomicScope::WORKGROUP:
1901       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1902       // not needed as LDS operations for all waves are executed in a total
1903       // global ordering as observed by all waves. Required if also
1904       // synchronizing with global/GDS memory as LDS operations could be
1905       // reordered with respect to later global/GDS memory operations of the
1906       // same wave.
1907       LGKMCnt |= IsCrossAddrSpaceOrdering;
1908       break;
1909     case SIAtomicScope::WAVEFRONT:
1910     case SIAtomicScope::SINGLETHREAD:
1911       // The LDS keeps all memory operations in order for
1912       // the same wavefront.
1913       break;
1914     default:
1915       llvm_unreachable("Unsupported synchronization scope");
1916     }
1917   }
1918 
1919   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1920     switch (Scope) {
1921     case SIAtomicScope::SYSTEM:
1922     case SIAtomicScope::AGENT:
1923       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1924       // is not needed as GDS operations for all waves are executed in a total
1925       // global ordering as observed by all waves. Required if also
1926       // synchronizing with global/LDS memory as GDS operations could be
1927       // reordered with respect to later global/LDS memory operations of the
1928       // same wave.
1929       LGKMCnt |= IsCrossAddrSpaceOrdering;
1930       break;
1931     case SIAtomicScope::WORKGROUP:
1932     case SIAtomicScope::WAVEFRONT:
1933     case SIAtomicScope::SINGLETHREAD:
1934       // The GDS keeps all memory operations in order for
1935       // the same work-group.
1936       break;
1937     default:
1938       llvm_unreachable("Unsupported synchronization scope");
1939     }
1940   }
1941 
1942   if (VMCnt || LGKMCnt) {
1943     unsigned WaitCntImmediate =
1944       AMDGPU::encodeWaitcnt(IV,
1945                             VMCnt ? 0 : getVmcntBitMask(IV),
1946                             getExpcntBitMask(IV),
1947                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1948     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1949     Changed = true;
1950   }
1951 
1952   if (VSCnt) {
1953     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1954       .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1955       .addImm(0);
1956     Changed = true;
1957   }
1958 
1959   if (Pos == Position::AFTER)
1960     --MI;
1961 
1962   return Changed;
1963 }
1964 
1965 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1966                                         SIAtomicScope Scope,
1967                                         SIAtomicAddrSpace AddrSpace,
1968                                         Position Pos) const {
1969   if (!InsertCacheInv)
1970     return false;
1971 
1972   bool Changed = false;
1973 
1974   MachineBasicBlock &MBB = *MI->getParent();
1975   DebugLoc DL = MI->getDebugLoc();
1976 
1977   if (Pos == Position::AFTER)
1978     ++MI;
1979 
1980   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1981     switch (Scope) {
1982     case SIAtomicScope::SYSTEM:
1983     case SIAtomicScope::AGENT:
1984       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1985       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1986       Changed = true;
1987       break;
1988     case SIAtomicScope::WORKGROUP:
1989       // In WGP mode the waves of a work-group can be executing on either CU of
1990       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1991       // in CU mode and all waves of a work-group are on the same CU, and so the
1992       // L0 does not need to be invalidated.
1993       if (!ST.isCuModeEnabled()) {
1994         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1995         Changed = true;
1996       }
1997       break;
1998     case SIAtomicScope::WAVEFRONT:
1999     case SIAtomicScope::SINGLETHREAD:
2000       // No cache to invalidate.
2001       break;
2002     default:
2003       llvm_unreachable("Unsupported synchronization scope");
2004     }
2005   }
2006 
2007   /// The scratch address space does not need the global memory cache
2008   /// to be flushed as all memory operations by the same thread are
2009   /// sequentially consistent, and no other thread can access scratch
2010   /// memory.
2011 
2012   /// Other address spaces do not have a cache.
2013 
2014   if (Pos == Position::AFTER)
2015     --MI;
2016 
2017   return Changed;
2018 }
2019 
2020 bool SIGfx11CacheControl::enableLoadCacheBypass(
2021     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2022     SIAtomicAddrSpace AddrSpace) const {
2023   assert(MI->mayLoad() && !MI->mayStore());
2024   bool Changed = false;
2025 
2026   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2027     switch (Scope) {
2028     case SIAtomicScope::SYSTEM:
2029     case SIAtomicScope::AGENT:
2030       // Set the L0 and L1 cache policies to MISS_EVICT.
2031       // Note: there is no L2 cache coherent bypass control at the ISA level.
2032       Changed |= enableGLCBit(MI);
2033       break;
2034     case SIAtomicScope::WORKGROUP:
2035       // In WGP mode the waves of a work-group can be executing on either CU of
2036       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2037       // CU mode all waves of a work-group are on the same CU, and so the L0
2038       // does not need to be bypassed.
2039       if (!ST.isCuModeEnabled())
2040         Changed |= enableGLCBit(MI);
2041       break;
2042     case SIAtomicScope::WAVEFRONT:
2043     case SIAtomicScope::SINGLETHREAD:
2044       // No cache to bypass.
2045       break;
2046     default:
2047       llvm_unreachable("Unsupported synchronization scope");
2048     }
2049   }
2050 
2051   /// The scratch address space does not need the global memory caches
2052   /// to be bypassed as all memory operations by the same thread are
2053   /// sequentially consistent, and no other thread can access scratch
2054   /// memory.
2055 
2056   /// Other address spaces do not have a cache.
2057 
2058   return Changed;
2059 }
2060 
2061 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2062     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2063     bool IsVolatile, bool IsNonTemporal) const {
2064 
2065   // Only handle load and store, not atomic read-modify-write insructions. The
2066   // latter use glc to indicate if the atomic returns a result and so must not
2067   // be used for cache control.
2068   assert(MI->mayLoad() ^ MI->mayStore());
2069 
2070   // Only update load and store, not LLVM IR atomic read-modify-write
2071   // instructions. The latter are always marked as volatile so cannot sensibly
2072   // handle it as do not want to pessimize all atomics. Also they do not support
2073   // the nontemporal attribute.
2074   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2075 
2076   bool Changed = false;
2077 
2078   if (IsVolatile) {
2079     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2080     // and MISS_LRU for store instructions.
2081     // Note: there is no L2 cache coherent bypass control at the ISA level.
2082     if (Op == SIMemOp::LOAD)
2083       Changed |= enableGLCBit(MI);
2084 
2085     // Set MALL NOALLOC for load and store instructions.
2086     Changed |= enableDLCBit(MI);
2087 
2088     // Ensure operation has completed at system scope to cause all volatile
2089     // operations to be visible outside the program in a global order. Do not
2090     // request cross address space as only the global address space can be
2091     // observable outside the program, so no need to cause a waitcnt for LDS
2092     // address space operations.
2093     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2094                           Position::AFTER);
2095     return Changed;
2096   }
2097 
2098   if (IsNonTemporal) {
2099     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2100     // and L2 cache policy to STREAM.
2101     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2102     // to MISS_EVICT and the L2 cache policy to STREAM.
2103     if (Op == SIMemOp::STORE)
2104       Changed |= enableGLCBit(MI);
2105     Changed |= enableSLCBit(MI);
2106 
2107     // Set MALL NOALLOC for load and store instructions.
2108     Changed |= enableDLCBit(MI);
2109     return Changed;
2110   }
2111 
2112   return Changed;
2113 }
2114 
2115 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2116   if (AtomicPseudoMIs.empty())
2117     return false;
2118 
2119   for (auto &MI : AtomicPseudoMIs)
2120     MI->eraseFromParent();
2121 
2122   AtomicPseudoMIs.clear();
2123   return true;
2124 }
2125 
2126 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2127                                    MachineBasicBlock::iterator &MI) {
2128   assert(MI->mayLoad() && !MI->mayStore());
2129 
2130   bool Changed = false;
2131 
2132   if (MOI.isAtomic()) {
2133     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2134         MOI.getOrdering() == AtomicOrdering::Acquire ||
2135         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2136       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2137                                            MOI.getOrderingAddrSpace());
2138     }
2139 
2140     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2141       Changed |= CC->insertWait(MI, MOI.getScope(),
2142                                 MOI.getOrderingAddrSpace(),
2143                                 SIMemOp::LOAD | SIMemOp::STORE,
2144                                 MOI.getIsCrossAddressSpaceOrdering(),
2145                                 Position::BEFORE);
2146 
2147     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2148         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2149       Changed |= CC->insertWait(MI, MOI.getScope(),
2150                                 MOI.getInstrAddrSpace(),
2151                                 SIMemOp::LOAD,
2152                                 MOI.getIsCrossAddressSpaceOrdering(),
2153                                 Position::AFTER);
2154       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2155                                    MOI.getOrderingAddrSpace(),
2156                                    Position::AFTER);
2157     }
2158 
2159     return Changed;
2160   }
2161 
2162   // Atomic instructions already bypass caches to the scope specified by the
2163   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2164   // need additional treatment.
2165   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2166                                                 SIMemOp::LOAD, MOI.isVolatile(),
2167                                                 MOI.isNonTemporal());
2168   return Changed;
2169 }
2170 
2171 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2172                                     MachineBasicBlock::iterator &MI) {
2173   assert(!MI->mayLoad() && MI->mayStore());
2174 
2175   bool Changed = false;
2176 
2177   if (MOI.isAtomic()) {
2178     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2179         MOI.getOrdering() == AtomicOrdering::Release ||
2180         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2181       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2182                                             MOI.getOrderingAddrSpace());
2183     }
2184 
2185     if (MOI.getOrdering() == AtomicOrdering::Release ||
2186         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2187       Changed |= CC->insertRelease(MI, MOI.getScope(),
2188                                    MOI.getOrderingAddrSpace(),
2189                                    MOI.getIsCrossAddressSpaceOrdering(),
2190                                    Position::BEFORE);
2191 
2192     return Changed;
2193   }
2194 
2195   // Atomic instructions already bypass caches to the scope specified by the
2196   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2197   // need additional treatment.
2198   Changed |= CC->enableVolatileAndOrNonTemporal(
2199       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2200       MOI.isNonTemporal());
2201   return Changed;
2202 }
2203 
2204 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2205                                           MachineBasicBlock::iterator &MI) {
2206   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2207 
2208   AtomicPseudoMIs.push_back(MI);
2209   bool Changed = false;
2210 
2211   if (MOI.isAtomic()) {
2212     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2213         MOI.getOrdering() == AtomicOrdering::Release ||
2214         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2215         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2216       /// TODO: This relies on a barrier always generating a waitcnt
2217       /// for LDS to ensure it is not reordered with the completion of
2218       /// the proceeding LDS operations. If barrier had a memory
2219       /// ordering and memory scope, then library does not need to
2220       /// generate a fence. Could add support in this file for
2221       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2222       /// adding S_WAITCNT before a S_BARRIER.
2223       Changed |= CC->insertRelease(MI, MOI.getScope(),
2224                                    MOI.getOrderingAddrSpace(),
2225                                    MOI.getIsCrossAddressSpaceOrdering(),
2226                                    Position::BEFORE);
2227 
2228     // TODO: If both release and invalidate are happening they could be combined
2229     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2230     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2231     // track cache invalidate and write back instructions.
2232 
2233     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2234         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2235         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2236       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2237                                    MOI.getOrderingAddrSpace(),
2238                                    Position::BEFORE);
2239 
2240     return Changed;
2241   }
2242 
2243   return Changed;
2244 }
2245 
2246 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2247   MachineBasicBlock::iterator &MI) {
2248   assert(MI->mayLoad() && MI->mayStore());
2249 
2250   bool Changed = false;
2251 
2252   if (MOI.isAtomic()) {
2253     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2254         MOI.getOrdering() == AtomicOrdering::Acquire ||
2255         MOI.getOrdering() == AtomicOrdering::Release ||
2256         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2257         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2258       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2259                                           MOI.getInstrAddrSpace());
2260     }
2261 
2262     if (MOI.getOrdering() == AtomicOrdering::Release ||
2263         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2264         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2265         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2266       Changed |= CC->insertRelease(MI, MOI.getScope(),
2267                                    MOI.getOrderingAddrSpace(),
2268                                    MOI.getIsCrossAddressSpaceOrdering(),
2269                                    Position::BEFORE);
2270 
2271     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2272         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2273         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2274         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2275         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2276       Changed |= CC->insertWait(MI, MOI.getScope(),
2277                                 MOI.getInstrAddrSpace(),
2278                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
2279                                                    SIMemOp::STORE,
2280                                 MOI.getIsCrossAddressSpaceOrdering(),
2281                                 Position::AFTER);
2282       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2283                                    MOI.getOrderingAddrSpace(),
2284                                    Position::AFTER);
2285     }
2286 
2287     return Changed;
2288   }
2289 
2290   return Changed;
2291 }
2292 
2293 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2294   bool Changed = false;
2295 
2296   SIMemOpAccess MOA(MF);
2297   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2298 
2299   for (auto &MBB : MF) {
2300     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2301 
2302       // Unbundle instructions after the post-RA scheduler.
2303       if (MI->isBundle() && MI->mayLoadOrStore()) {
2304         MachineBasicBlock::instr_iterator II(MI->getIterator());
2305         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2306              I != E && I->isBundledWithPred(); ++I) {
2307           I->unbundleFromPred();
2308           for (MachineOperand &MO : I->operands())
2309             if (MO.isReg())
2310               MO.setIsInternalRead(false);
2311         }
2312 
2313         MI->eraseFromParent();
2314         MI = II->getIterator();
2315       }
2316 
2317       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2318         continue;
2319 
2320       if (const auto &MOI = MOA.getLoadInfo(MI))
2321         Changed |= expandLoad(*MOI, MI);
2322       else if (const auto &MOI = MOA.getStoreInfo(MI))
2323         Changed |= expandStore(*MOI, MI);
2324       else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2325         Changed |= expandAtomicFence(*MOI, MI);
2326       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2327         Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2328     }
2329   }
2330 
2331   Changed |= removeAtomicPseudoMIs();
2332   return Changed;
2333 }
2334 
// Register the pass under the command-line name DEBUG_TYPE with the
// human-readable description PASS_NAME.
// NOTE(review): the trailing `false, false` are presumably the CFG-only and
// is-analysis flags of INITIALIZE_PASS - confirm against llvm/PassSupport.h.
2335 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2336 
// Out-of-class definition of the pass's static ID member. LLVM passes are
// conventionally identified by the address of this member, not its value.
2337 char SIMemoryLegalizer::ID = 0;
// Reference to the ID exported in the llvm namespace so other code can name
// this pass without seeing the class definition.
2338 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2339 
2340 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2341   return new SIMemoryLegalizer();
2342 }
2343