xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (revision ec0ea6efa1ad229d75c394c1a9b9cac33af2b1d3)
1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/IR/DiagnosticInfo.h"
23 #include "llvm/Support/AtomicOrdering.h"
24 #include "llvm/Support/TargetParser.h"
25 
26 using namespace llvm;
27 using namespace llvm::AMDGPU;
28 
29 #define DEBUG_TYPE "si-memory-legalizer"
30 #define PASS_NAME "SI Memory Legalizer"
31 
32 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
33     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
34     cl::desc("Use this to skip inserting cache invalidating instructions."));
35 
36 namespace {
37 
38 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
39 
40 /// Memory operation flags. Can be ORed together.
41 enum class SIMemOp {
42   NONE = 0u,
43   LOAD = 1u << 0,
44   STORE = 1u << 1,
45   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
46 };
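// A minimal illustrative sketch (not part of the pass logic): the
// LLVM_MARK_AS_BITMASK_ENUM marker above enables the usual bitwise operators
// on this scoped enum, so callers can combine and test flags directly, e.g.:
//
//   SIMemOp Op = SIMemOp::LOAD | SIMemOp::STORE;
//   bool HandlesLoads = (Op & SIMemOp::LOAD) != SIMemOp::NONE;  // true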
47 
48 /// Position to insert a new instruction relative to an existing
49 /// instruction.
50 enum class Position {
51   BEFORE,
52   AFTER
53 };
54 
55 /// The atomic synchronization scopes supported by the AMDGPU target.
56 enum class SIAtomicScope {
57   NONE,
58   SINGLETHREAD,
59   WAVEFRONT,
60   WORKGROUP,
61   AGENT,
62   SYSTEM
63 };
64 
65 /// The distinct address spaces supported by the AMDGPU target for
66 /// atomic memory operations. Can be ORed together.
67 enum class SIAtomicAddrSpace {
68   NONE = 0u,
69   GLOBAL = 1u << 0,
70   LDS = 1u << 1,
71   SCRATCH = 1u << 2,
72   GDS = 1u << 3,
73   OTHER = 1u << 4,
74 
75   /// The address spaces that can be accessed by a FLAT instruction.
76   FLAT = GLOBAL | LDS | SCRATCH,
77 
78   /// The address spaces that support atomic instructions.
79   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
80 
81   /// All address spaces.
82   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
83 
84   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
85 };
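// Illustrative sketch: the aggregate values above are compositions of the
// individual bits, and membership tests throughout this file follow the same
// pattern, e.g.:
//
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::FLAT;  // GLOBAL | LDS | SCRATCH
//   bool TouchesGlobal =
//       (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;  // true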
86 
87 class SIMemOpInfo final {
88 private:
89 
90   friend class SIMemOpAccess;
91 
92   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
93   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
94   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
95   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
96   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
97   bool IsCrossAddressSpaceOrdering = false;
98   bool IsVolatile = false;
99   bool IsNonTemporal = false;
100 
101   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
102               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
103               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
104               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
105               bool IsCrossAddressSpaceOrdering = true,
106               AtomicOrdering FailureOrdering =
107                 AtomicOrdering::SequentiallyConsistent,
108               bool IsVolatile = false,
109               bool IsNonTemporal = false)
110     : Ordering(Ordering), FailureOrdering(FailureOrdering),
111       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
112       InstrAddrSpace(InstrAddrSpace),
113       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
114       IsVolatile(IsVolatile),
115       IsNonTemporal(IsNonTemporal) {
116 
117     if (Ordering == AtomicOrdering::NotAtomic) {
118       assert(Scope == SIAtomicScope::NONE &&
119              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
120              !IsCrossAddressSpaceOrdering &&
121              FailureOrdering == AtomicOrdering::NotAtomic);
122       return;
123     }
124 
125     assert(Scope != SIAtomicScope::NONE &&
126            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
127                SIAtomicAddrSpace::NONE &&
128            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
129                SIAtomicAddrSpace::NONE &&
130            !isStrongerThan(FailureOrdering, Ordering));
131 
132     // There is also no cross address space ordering if the ordering
133     // address space is the same as the instruction address space and
134     // only contains a single address space.
135     if ((OrderingAddrSpace == InstrAddrSpace) &&
136         isPowerOf2_32(uint32_t(InstrAddrSpace)))
137       this->IsCrossAddressSpaceOrdering = false;
138 
139     // Limit the scope to the maximum supported by the instruction's address
140     // spaces.
141     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142         SIAtomicAddrSpace::NONE) {
143       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144     } else if ((InstrAddrSpace &
145                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146                SIAtomicAddrSpace::NONE) {
147       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148     } else if ((InstrAddrSpace &
149                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152     }
153   }
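  // Illustrative example of the normalization above (the constructor is
  // private, so this is a sketch rather than usable client code): an LDS-only
  // sequentially consistent atomic requested at system scope is clamped to
  // workgroup scope, and its cross-address-space ordering flag is cleared
  // because the ordering and instruction address spaces are the same single
  // address space:
  //
  //   SIMemOpInfo Info(AtomicOrdering::SequentiallyConsistent,
  //                    SIAtomicScope::SYSTEM, SIAtomicAddrSpace::LDS,
  //                    SIAtomicAddrSpace::LDS,
  //                    /*IsCrossAddressSpaceOrdering=*/true);
  //   // Info.getScope() == SIAtomicScope::WORKGROUP
  //   // Info.getIsCrossAddressSpaceOrdering() == false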
154 
155 public:
156   /// \returns Atomic synchronization scope of the machine instruction used to
157   /// create this SIMemOpInfo.
158   SIAtomicScope getScope() const {
159     return Scope;
160   }
161 
162   /// \returns Ordering constraint of the machine instruction used to
163   /// create this SIMemOpInfo.
164   AtomicOrdering getOrdering() const {
165     return Ordering;
166   }
167 
168   /// \returns Failure ordering constraint of the machine instruction used to
169   /// create this SIMemOpInfo.
170   AtomicOrdering getFailureOrdering() const {
171     return FailureOrdering;
172   }
173 
174   /// \returns The address spaces accessed by the machine
175   /// instruction used to create this SIMemOpInfo.
176   SIAtomicAddrSpace getInstrAddrSpace() const {
177     return InstrAddrSpace;
178   }
179 
180   /// \returns The address spaces that must be ordered by the machine
181   /// instruction used to create this SIMemOpInfo.
182   SIAtomicAddrSpace getOrderingAddrSpace() const {
183     return OrderingAddrSpace;
184   }
185 
186   /// \returns True iff memory ordering of operations on
187   /// different address spaces is required, false otherwise.
188   bool getIsCrossAddressSpaceOrdering() const {
189     return IsCrossAddressSpaceOrdering;
190   }
191 
192   /// \returns True if memory access of the machine instruction used to
193   /// create this SIMemOpInfo is volatile, false otherwise.
194   bool isVolatile() const {
195     return IsVolatile;
196   }
197 
198   /// \returns True if memory access of the machine instruction used to
199   /// create this SIMemOpInfo is nontemporal, false otherwise.
200   bool isNonTemporal() const {
201     return IsNonTemporal;
202   }
203 
204   /// \returns True if ordering constraint of the machine instruction used to
205   /// create this SIMemOpInfo is unordered or higher, false otherwise.
206   bool isAtomic() const {
207     return Ordering != AtomicOrdering::NotAtomic;
208   }
209 
210 };
211 
212 class SIMemOpAccess final {
213 private:
214   AMDGPUMachineModuleInfo *MMI = nullptr;
215 
216   /// Reports unsupported message \p Msg for \p MI to LLVM context.
217   void reportUnsupported(const MachineBasicBlock::iterator &MI,
218                          const char *Msg) const;
219 
220   /// Inspects the target synchronization scope \p SSID and determines
221   /// the SI atomic scope it corresponds to, the address spaces it
222   /// covers, and whether the memory ordering applies between address
223   /// spaces.
224   Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226 
227   /// \returns A bit set of the SI atomic address spaces covered by \p AS.
228   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229 
230   /// \returns Info constructed from \p MI, which has at least one machine memory
231   /// operand.
232   Optional<SIMemOpInfo> constructFromMIWithMMO(
233       const MachineBasicBlock::iterator &MI) const;
234 
235 public:
236   /// Construct class to support accessing the machine memory operands
237   /// of instructions in the machine function \p MF.
238   SIMemOpAccess(MachineFunction &MF);
239 
240   /// \returns Load info if \p MI is a load operation, "None" otherwise.
241   Optional<SIMemOpInfo> getLoadInfo(
242       const MachineBasicBlock::iterator &MI) const;
243 
244   /// \returns Store info if \p MI is a store operation, "None" otherwise.
245   Optional<SIMemOpInfo> getStoreInfo(
246       const MachineBasicBlock::iterator &MI) const;
247 
248   /// \returns Atomic fence info if \p MI is an atomic fence operation,
249   /// "None" otherwise.
250   Optional<SIMemOpInfo> getAtomicFenceInfo(
251       const MachineBasicBlock::iterator &MI) const;
252 
253   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
254   /// rmw operation, "None" otherwise.
255   Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
256       const MachineBasicBlock::iterator &MI) const;
257 };
258 
259 class SICacheControl {
260 protected:
261 
262   /// AMDGPU subtarget info.
263   const GCNSubtarget &ST;
264 
265   /// Instruction info.
266   const SIInstrInfo *TII = nullptr;
267 
268   IsaVersion IV;
269 
270   /// Whether to insert cache invalidating instructions.
271   bool InsertCacheInv;
272 
273   SICacheControl(const GCNSubtarget &ST);
274 
275   /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
276   /// \returns True if \p MI is modified, false otherwise.
277   bool enableNamedBit(const MachineBasicBlock::iterator MI,
278                       AMDGPU::CPol::CPol Bit) const;
279 
280 public:
281 
282   /// Create a cache control for the subtarget \p ST.
283   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
284 
285   /// Update \p MI memory load instruction to bypass any caches up to
286   /// the \p Scope memory scope for address spaces \p
287   /// AddrSpace. Return true iff the instruction was modified.
288   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
289                                      SIAtomicScope Scope,
290                                      SIAtomicAddrSpace AddrSpace) const = 0;
291 
292   /// Update \p MI memory store instruction to bypass any caches up to
293   /// the \p Scope memory scope for address spaces \p
294   /// AddrSpace. Return true iff the instruction was modified.
295   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
296                                       SIAtomicScope Scope,
297                                       SIAtomicAddrSpace AddrSpace) const = 0;
298 
299   /// Update \p MI memory read-modify-write instruction to bypass any caches up
300   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
301   /// iff the instruction was modified.
302   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
303                                     SIAtomicScope Scope,
304                                     SIAtomicAddrSpace AddrSpace) const = 0;
305 
306   /// Update \p MI memory instruction of kind \p Op associated with address
307   /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
308   /// true iff the instruction was modified.
309   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
310                                               SIAtomicAddrSpace AddrSpace,
311                                               SIMemOp Op, bool IsVolatile,
312                                               bool IsNonTemporal) const = 0;
313 
314   /// Inserts any necessary instructions at position \p Pos relative
315   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
316   /// \p Op associated with address spaces \p AddrSpace have completed. Used
317   /// between memory instructions to enforce the order they become visible as
318   /// observed by other memory instructions executing in memory scope \p Scope.
319   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
320   /// address spaces. Returns true iff any instructions are inserted.
321   virtual bool insertWait(MachineBasicBlock::iterator &MI,
322                           SIAtomicScope Scope,
323                           SIAtomicAddrSpace AddrSpace,
324                           SIMemOp Op,
325                           bool IsCrossAddrSpaceOrdering,
326                           Position Pos) const = 0;
327 
328   /// Inserts any necessary instructions at position \p Pos relative to
329   /// instruction \p MI to ensure any subsequent memory instructions of this
330   /// thread with address spaces \p AddrSpace will observe the previous memory
331   /// operations by any thread for memory scopes up to memory scope \p Scope.
332   /// Returns true iff any instructions are inserted.
333   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
334                              SIAtomicScope Scope,
335                              SIAtomicAddrSpace AddrSpace,
336                              Position Pos) const = 0;
337 
338   /// Inserts any necessary instructions at position \p Pos relative to
339   /// instruction \p MI to ensure previous memory instructions by this thread
340   /// with address spaces \p AddrSpace have completed and can be observed by
341   /// subsequent memory instructions by any thread executing in memory scope \p
342   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
343   /// between address spaces. Returns true iff any instructions are inserted.
344   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
345                              SIAtomicScope Scope,
346                              SIAtomicAddrSpace AddrSpace,
347                              bool IsCrossAddrSpaceOrdering,
348                              Position Pos) const = 0;
349 
350   /// Virtual destructor to allow derived classes to be deleted.
351   virtual ~SICacheControl() = default;
352 
353 };
354 
355 class SIGfx6CacheControl : public SICacheControl {
356 protected:
357 
358   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
359   /// is modified, false otherwise.
360   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
361     return enableNamedBit(MI, AMDGPU::CPol::GLC);
362   }
363 
364   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
365   /// is modified, false otherwise.
366   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
367     return enableNamedBit(MI, AMDGPU::CPol::SLC);
368   }
369 
370 public:
371 
372   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
373 
374   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
375                              SIAtomicScope Scope,
376                              SIAtomicAddrSpace AddrSpace) const override;
377 
378   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
379                               SIAtomicScope Scope,
380                               SIAtomicAddrSpace AddrSpace) const override;
381 
382   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
383                             SIAtomicScope Scope,
384                             SIAtomicAddrSpace AddrSpace) const override;
385 
386   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
387                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
388                                       bool IsVolatile,
389                                       bool IsNonTemporal) const override;
390 
391   bool insertWait(MachineBasicBlock::iterator &MI,
392                   SIAtomicScope Scope,
393                   SIAtomicAddrSpace AddrSpace,
394                   SIMemOp Op,
395                   bool IsCrossAddrSpaceOrdering,
396                   Position Pos) const override;
397 
398   bool insertAcquire(MachineBasicBlock::iterator &MI,
399                      SIAtomicScope Scope,
400                      SIAtomicAddrSpace AddrSpace,
401                      Position Pos) const override;
402 
403   bool insertRelease(MachineBasicBlock::iterator &MI,
404                      SIAtomicScope Scope,
405                      SIAtomicAddrSpace AddrSpace,
406                      bool IsCrossAddrSpaceOrdering,
407                      Position Pos) const override;
408 };
409 
410 class SIGfx7CacheControl : public SIGfx6CacheControl {
411 public:
412 
413   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
414 
415   bool insertAcquire(MachineBasicBlock::iterator &MI,
416                      SIAtomicScope Scope,
417                      SIAtomicAddrSpace AddrSpace,
418                      Position Pos) const override;
419 
420 };
421 
422 class SIGfx90ACacheControl : public SIGfx7CacheControl {
423 public:
424 
425   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
426 
427   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
428                              SIAtomicScope Scope,
429                              SIAtomicAddrSpace AddrSpace) const override;
430 
431   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
432                               SIAtomicScope Scope,
433                               SIAtomicAddrSpace AddrSpace) const override;
434 
435   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
436                             SIAtomicScope Scope,
437                             SIAtomicAddrSpace AddrSpace) const override;
438 
439   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
440                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
441                                       bool IsVolatile,
442                                       bool IsNonTemporal) const override;
443 
444   bool insertWait(MachineBasicBlock::iterator &MI,
445                   SIAtomicScope Scope,
446                   SIAtomicAddrSpace AddrSpace,
447                   SIMemOp Op,
448                   bool IsCrossAddrSpaceOrdering,
449                   Position Pos) const override;
450 
451   bool insertAcquire(MachineBasicBlock::iterator &MI,
452                      SIAtomicScope Scope,
453                      SIAtomicAddrSpace AddrSpace,
454                      Position Pos) const override;
455 
456   bool insertRelease(MachineBasicBlock::iterator &MI,
457                      SIAtomicScope Scope,
458                      SIAtomicAddrSpace AddrSpace,
459                      bool IsCrossAddrSpaceOrdering,
460                      Position Pos) const override;
461 };
462 
463 class SIGfx10CacheControl : public SIGfx7CacheControl {
464 protected:
465 
466   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
467   /// is modified, false otherwise.
468   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
469     return enableNamedBit(MI, AMDGPU::CPol::DLC);
470   }
471 
472 public:
473 
474   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
475 
476   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
477                              SIAtomicScope Scope,
478                              SIAtomicAddrSpace AddrSpace) const override;
479 
480   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
481                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
482                                       bool IsVolatile,
483                                       bool IsNonTemporal) const override;
484 
485   bool insertWait(MachineBasicBlock::iterator &MI,
486                   SIAtomicScope Scope,
487                   SIAtomicAddrSpace AddrSpace,
488                   SIMemOp Op,
489                   bool IsCrossAddrSpaceOrdering,
490                   Position Pos) const override;
491 
492   bool insertAcquire(MachineBasicBlock::iterator &MI,
493                      SIAtomicScope Scope,
494                      SIAtomicAddrSpace AddrSpace,
495                      Position Pos) const override;
496 };
497 
498 class SIMemoryLegalizer final : public MachineFunctionPass {
499 private:
500 
501   /// Cache Control.
502   std::unique_ptr<SICacheControl> CC = nullptr;
503 
504   /// List of atomic pseudo instructions.
505   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
506 
507   /// Return true iff instruction \p MI is an atomic instruction that
508   /// returns a result.
509   bool isAtomicRet(const MachineInstr &MI) const {
510     return SIInstrInfo::isAtomicRet(MI);
511   }
512 
513   /// Removes all processed atomic pseudo instructions from the current
514   /// function. Returns true if the current function is modified, false otherwise.
515   bool removeAtomicPseudoMIs();
516 
517   /// Expands load operation \p MI. Returns true if instructions are
518   /// added/deleted or \p MI is modified, false otherwise.
519   bool expandLoad(const SIMemOpInfo &MOI,
520                   MachineBasicBlock::iterator &MI);
521   /// Expands store operation \p MI. Returns true if instructions are
522   /// added/deleted or \p MI is modified, false otherwise.
523   bool expandStore(const SIMemOpInfo &MOI,
524                    MachineBasicBlock::iterator &MI);
525   /// Expands atomic fence operation \p MI. Returns true if
526   /// instructions are added/deleted or \p MI is modified, false otherwise.
527   bool expandAtomicFence(const SIMemOpInfo &MOI,
528                          MachineBasicBlock::iterator &MI);
529   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
530   /// instructions are added/deleted or \p MI is modified, false otherwise.
531   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
532                                 MachineBasicBlock::iterator &MI);
533 
534 public:
535   static char ID;
536 
537   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
538 
539   void getAnalysisUsage(AnalysisUsage &AU) const override {
540     AU.setPreservesCFG();
541     MachineFunctionPass::getAnalysisUsage(AU);
542   }
543 
544   StringRef getPassName() const override {
545     return PASS_NAME;
546   }
547 
548   bool runOnMachineFunction(MachineFunction &MF) override;
549 };
550 
551 } // end anonymous namespace
552 
553 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
554                                       const char *Msg) const {
555   const Function &Func = MI->getParent()->getParent()->getFunction();
556   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
557   Func.getContext().diagnose(Diag);
558 }
559 
560 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
561 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
562                                SIAtomicAddrSpace InstrAddrSpace) const {
563   if (SSID == SyncScope::System)
564     return std::make_tuple(SIAtomicScope::SYSTEM,
565                            SIAtomicAddrSpace::ATOMIC,
566                            true);
567   if (SSID == MMI->getAgentSSID())
568     return std::make_tuple(SIAtomicScope::AGENT,
569                            SIAtomicAddrSpace::ATOMIC,
570                            true);
571   if (SSID == MMI->getWorkgroupSSID())
572     return std::make_tuple(SIAtomicScope::WORKGROUP,
573                            SIAtomicAddrSpace::ATOMIC,
574                            true);
575   if (SSID == MMI->getWavefrontSSID())
576     return std::make_tuple(SIAtomicScope::WAVEFRONT,
577                            SIAtomicAddrSpace::ATOMIC,
578                            true);
579   if (SSID == SyncScope::SingleThread)
580     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
581                            SIAtomicAddrSpace::ATOMIC,
582                            true);
583   if (SSID == MMI->getSystemOneAddressSpaceSSID())
584     return std::make_tuple(SIAtomicScope::SYSTEM,
585                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
586                            false);
587   if (SSID == MMI->getAgentOneAddressSpaceSSID())
588     return std::make_tuple(SIAtomicScope::AGENT,
589                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
590                            false);
591   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
592     return std::make_tuple(SIAtomicScope::WORKGROUP,
593                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
594                            false);
595   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
596     return std::make_tuple(SIAtomicScope::WAVEFRONT,
597                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
598                            false);
599   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
600     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
601                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
602                            false);
603   return None;
604 }
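// Illustrative example (assuming an instruction that accesses only the global
// address space): the "one address space" synchronization scope variants
// above intersect the orderable address spaces with the instruction's own and
// report that no cross-address-space ordering is required, so agent-one-as
// yields
//   (SIAtomicScope::AGENT, SIAtomicAddrSpace::GLOBAL, false)
// while the plain agent scope yields
//   (SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true).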
605 
606 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
607   if (AS == AMDGPUAS::FLAT_ADDRESS)
608     return SIAtomicAddrSpace::FLAT;
609   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
610     return SIAtomicAddrSpace::GLOBAL;
611   if (AS == AMDGPUAS::LOCAL_ADDRESS)
612     return SIAtomicAddrSpace::LDS;
613   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
614     return SIAtomicAddrSpace::SCRATCH;
615   if (AS == AMDGPUAS::REGION_ADDRESS)
616     return SIAtomicAddrSpace::GDS;
617 
618   return SIAtomicAddrSpace::OTHER;
619 }
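// For example, a flat access may address global, LDS, or scratch memory, so
// AMDGPUAS::FLAT_ADDRESS conservatively maps to the combined
// SIAtomicAddrSpace::FLAT (GLOBAL | LDS | SCRATCH) bit set, while any address
// space this pass does not model maps to SIAtomicAddrSpace::OTHER.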
620 
621 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
622   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
623 }
624 
625 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
626     const MachineBasicBlock::iterator &MI) const {
627   assert(MI->getNumMemOperands() > 0);
628 
629   SyncScope::ID SSID = SyncScope::SingleThread;
630   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
631   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
632   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
633   bool IsNonTemporal = true;
634   bool IsVolatile = false;
635 
636   // Validator should check whether or not MMOs cover the entire set of
637   // locations accessed by the memory instruction.
638   for (const auto &MMO : MI->memoperands()) {
639     IsNonTemporal &= MMO->isNonTemporal();
640     IsVolatile |= MMO->isVolatile();
641     InstrAddrSpace |=
642       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
643     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
644     if (OpOrdering != AtomicOrdering::NotAtomic) {
645       const auto &IsSyncScopeInclusion =
646           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
647       if (!IsSyncScopeInclusion) {
648         reportUnsupported(MI,
649           "Unsupported non-inclusive atomic synchronization scope");
650         return None;
651       }
652 
653       SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
654       Ordering = isStrongerThan(Ordering, OpOrdering)
655                      ? Ordering
656                      : MMO->getSuccessOrdering();
657       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
658              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
659       FailureOrdering =
660           isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
661               FailureOrdering : MMO->getFailureOrdering();
662     }
663   }
664 
665   SIAtomicScope Scope = SIAtomicScope::NONE;
666   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
667   bool IsCrossAddressSpaceOrdering = false;
668   if (Ordering != AtomicOrdering::NotAtomic) {
669     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
670     if (!ScopeOrNone) {
671       reportUnsupported(MI, "Unsupported atomic synchronization scope");
672       return None;
673     }
674     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
675       ScopeOrNone.getValue();
676     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
677         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
678         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
679       reportUnsupported(MI, "Unsupported atomic address space");
680       return None;
681     }
682   }
683   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
684                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
685                      IsNonTemporal);
686 }
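// Illustrative example of how the merge above behaves (a sketch, not taken
// from a real instruction): for an instruction with two memory operands, one
// nontemporal global access and one volatile LDS access, the result has
// InstrAddrSpace == GLOBAL | LDS, IsVolatile == true, and
// IsNonTemporal == false, i.e. volatility is accumulated with OR while the
// nontemporal hint only survives if every operand carries it.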
687 
688 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
689     const MachineBasicBlock::iterator &MI) const {
690   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
691 
692   if (!(MI->mayLoad() && !MI->mayStore()))
693     return None;
694 
695   // Be conservative if there are no memory operands.
696   if (MI->getNumMemOperands() == 0)
697     return SIMemOpInfo();
698 
699   return constructFromMIWithMMO(MI);
700 }
701 
702 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
703     const MachineBasicBlock::iterator &MI) const {
704   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
705 
706   if (!(!MI->mayLoad() && MI->mayStore()))
707     return None;
708 
709   // Be conservative if there are no memory operands.
710   if (MI->getNumMemOperands() == 0)
711     return SIMemOpInfo();
712 
713   return constructFromMIWithMMO(MI);
714 }
715 
716 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
717     const MachineBasicBlock::iterator &MI) const {
718   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
719 
720   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
721     return None;
722 
723   AtomicOrdering Ordering =
724     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
725 
726   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
727   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
728   if (!ScopeOrNone) {
729     reportUnsupported(MI, "Unsupported atomic synchronization scope");
730     return None;
731   }
732 
733   SIAtomicScope Scope = SIAtomicScope::NONE;
734   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
735   bool IsCrossAddressSpaceOrdering = false;
736   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
737     ScopeOrNone.getValue();
738 
739   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
740       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
741     reportUnsupported(MI, "Unsupported atomic address space");
742     return None;
743   }
744 
745   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
746                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
747 }
748 
749 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
750     const MachineBasicBlock::iterator &MI) const {
751   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
752 
753   if (!(MI->mayLoad() && MI->mayStore()))
754     return None;
755 
756   // Be conservative if there are no memory operands.
757   if (MI->getNumMemOperands() == 0)
758     return SIMemOpInfo();
759 
760   return constructFromMIWithMMO(MI);
761 }
762 
763 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
764   TII = ST.getInstrInfo();
765   IV = getIsaVersion(ST.getCPU());
766   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
767 }
768 
769 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
770                                     AMDGPU::CPol::CPol Bit) const {
771   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
772   if (!CPol)
773     return false;
774 
775   CPol->setImm(CPol->getImm() | Bit);
776   return true;
777 }
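// Illustrative sketch: for an instruction that carries a cpol operand, e.g. a
// global load selected with cpol == 0, enableNamedBit(MI, AMDGPU::CPol::GLC)
// rewrites the operand to CPol::GLC and returns true; for an instruction with
// no cpol operand it returns false and leaves the instruction unchanged.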
778 
779 /* static */
780 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
781   GCNSubtarget::Generation Generation = ST.getGeneration();
782   if (ST.hasGFX90AInsts())
783     return std::make_unique<SIGfx90ACacheControl>(ST);
784   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
785     return std::make_unique<SIGfx6CacheControl>(ST);
786   if (Generation < AMDGPUSubtarget::GFX10)
787     return std::make_unique<SIGfx7CacheControl>(ST);
788   return std::make_unique<SIGfx10CacheControl>(ST);
789 }
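// Rough mapping implied by the checks above (a sketch, not an exhaustive
// target list): gfx6 subtargets get SIGfx6CacheControl, gfx7 through gfx9
// subtargets get SIGfx7CacheControl, gfx90a gets SIGfx90ACacheControl, and
// gfx10 subtargets get SIGfx10CacheControl.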
790 
791 bool SIGfx6CacheControl::enableLoadCacheBypass(
792     const MachineBasicBlock::iterator &MI,
793     SIAtomicScope Scope,
794     SIAtomicAddrSpace AddrSpace) const {
795   assert(MI->mayLoad() && !MI->mayStore());
796   bool Changed = false;
797 
798   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
799     switch (Scope) {
800     case SIAtomicScope::SYSTEM:
801     case SIAtomicScope::AGENT:
802       Changed |= enableGLCBit(MI);
803       break;
804     case SIAtomicScope::WORKGROUP:
805     case SIAtomicScope::WAVEFRONT:
806     case SIAtomicScope::SINGLETHREAD:
807       // No cache to bypass.
808       break;
809     default:
810       llvm_unreachable("Unsupported synchronization scope");
811     }
812   }
813 
814   /// The scratch address space does not need the global memory caches
815   /// to be bypassed as all memory operations by the same thread are
816   /// sequentially consistent, and no other thread can access scratch
817   /// memory.
818 
819   /// Other address spaces do not have a cache.
820 
821   return Changed;
822 }
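// Illustrative effect of the hook above: an acquire or sequentially
// consistent global atomic load at agent or system scope has its glc bit set,
// so it bypasses the per-CU L1 and reads from the L2 shared by the whole
// agent; workgroup and narrower scopes need no bypass because all waves of a
// work-group share the same L1.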
823 
824 bool SIGfx6CacheControl::enableStoreCacheBypass(
825     const MachineBasicBlock::iterator &MI,
826     SIAtomicScope Scope,
827     SIAtomicAddrSpace AddrSpace) const {
828   assert(!MI->mayLoad() && MI->mayStore());
829   bool Changed = false;
830 
831   /// The L1 cache is write-through, so it does not need to be bypassed. There is
832   /// no bypass control for the L2 cache at the ISA level.
833 
834   return Changed;
835 }
836 
837 bool SIGfx6CacheControl::enableRMWCacheBypass(
838     const MachineBasicBlock::iterator &MI,
839     SIAtomicScope Scope,
840     SIAtomicAddrSpace AddrSpace) const {
841   assert(MI->mayLoad() && MI->mayStore());
842   bool Changed = false;
843 
844   /// The L1 cache is write-through, so it does not need to be bypassed. There is
845   /// no bypass control for the L2 cache at the ISA level.
846 
847   return Changed;
848 }
849 
850 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
851     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
852     bool IsVolatile, bool IsNonTemporal) const {
853   // Only handle load and store, not atomic read-modify-write instructions. The
854   // latter use glc to indicate if the atomic returns a result and so must not
855   // be used for cache control.
856   assert(MI->mayLoad() ^ MI->mayStore());
857 
858   // Only update load and store, not LLVM IR atomic read-modify-write
859   // instructions. The latter are always marked as volatile, so they cannot be
860   // handled here sensibly without pessimizing all atomics. They also do not
861   // support the nontemporal attribute.
862   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
863 
864   bool Changed = false;
865 
866   if (IsVolatile) {
867     if (Op == SIMemOp::LOAD)
868       Changed |= enableGLCBit(MI);
869 
870     // Ensure operation has completed at system scope to cause all volatile
871     // operations to be visible outside the program in a global order. Do not
872     // request cross address space as only the global address space can be
873     // observable outside the program, so no need to cause a waitcnt for LDS
874     // address space operations.
875     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
876                           Position::AFTER);
877 
878     return Changed;
879   }
880 
881   if (IsNonTemporal) {
882     // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
883     Changed |= enableGLCBit(MI);
884     Changed |= enableSLCBit(MI);
885     return Changed;
886   }
887 
888   return Changed;
889 }
890 
891 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
892                                     SIAtomicScope Scope,
893                                     SIAtomicAddrSpace AddrSpace,
894                                     SIMemOp Op,
895                                     bool IsCrossAddrSpaceOrdering,
896                                     Position Pos) const {
897   bool Changed = false;
898 
899   MachineBasicBlock &MBB = *MI->getParent();
900   DebugLoc DL = MI->getDebugLoc();
901 
902   if (Pos == Position::AFTER)
903     ++MI;
904 
905   bool VMCnt = false;
906   bool LGKMCnt = false;
907 
908   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
909       SIAtomicAddrSpace::NONE) {
910     switch (Scope) {
911     case SIAtomicScope::SYSTEM:
912     case SIAtomicScope::AGENT:
913       VMCnt |= true;
914       break;
915     case SIAtomicScope::WORKGROUP:
916     case SIAtomicScope::WAVEFRONT:
917     case SIAtomicScope::SINGLETHREAD:
918       // The L1 cache keeps all memory operations in order for
919       // wavefronts in the same work-group.
920       break;
921     default:
922       llvm_unreachable("Unsupported synchronization scope");
923     }
924   }
925 
926   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
927     switch (Scope) {
928     case SIAtomicScope::SYSTEM:
929     case SIAtomicScope::AGENT:
930     case SIAtomicScope::WORKGROUP:
931       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
932       // not needed as LDS operations for all waves are executed in a total
933       // global ordering as observed by all waves. Required if also
934       // synchronizing with global/GDS memory as LDS operations could be
935       // reordered with respect to later global/GDS memory operations of the
936       // same wave.
937       LGKMCnt |= IsCrossAddrSpaceOrdering;
938       break;
939     case SIAtomicScope::WAVEFRONT:
940     case SIAtomicScope::SINGLETHREAD:
941       // The LDS keeps all memory operations in order for
942       // the same wavefront.
943       break;
944     default:
945       llvm_unreachable("Unsupported synchronization scope");
946     }
947   }
948 
949   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
950     switch (Scope) {
951     case SIAtomicScope::SYSTEM:
952     case SIAtomicScope::AGENT:
953       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
954       // is not needed as GDS operations for all waves are executed in a total
955       // global ordering as observed by all waves. Required if also
956       // synchronizing with global/LDS memory as GDS operations could be
957       // reordered with respect to later global/LDS memory operations of the
958       // same wave.
959       LGKMCnt |= IsCrossAddrSpaceOrdering;
960       break;
961     case SIAtomicScope::WORKGROUP:
962     case SIAtomicScope::WAVEFRONT:
963     case SIAtomicScope::SINGLETHREAD:
964       // The GDS keeps all memory operations in order for
965       // the same work-group.
966       break;
967     default:
968       llvm_unreachable("Unsupported synchronization scope");
969     }
970   }
971 
972   if (VMCnt || LGKMCnt) {
973     unsigned WaitCntImmediate =
974       AMDGPU::encodeWaitcnt(IV,
975                             VMCnt ? 0 : getVmcntBitMask(IV),
976                             getExpcntBitMask(IV),
977                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
978     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
979     Changed = true;
980   }
981 
982   if (Pos == Position::AFTER)
983     --MI;
984 
985   return Changed;
986 }
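// Illustrative sketch of the encoding above: when ordering only global memory
// at agent scope, VMCnt is set and LGKMCnt is not, so the inserted
// instruction is effectively
//
//   S_WAITCNT vmcnt(0)
//
// i.e. encodeWaitcnt(IV, /*Vmcnt=*/0, getExpcntBitMask(IV),
// getLgkmcntBitMask(IV)), where passing a counter's full bit mask means "do
// not wait" on that counter.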
987 
988 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
989                                        SIAtomicScope Scope,
990                                        SIAtomicAddrSpace AddrSpace,
991                                        Position Pos) const {
992   if (!InsertCacheInv)
993     return false;
994 
995   bool Changed = false;
996 
997   MachineBasicBlock &MBB = *MI->getParent();
998   DebugLoc DL = MI->getDebugLoc();
999 
1000   if (Pos == Position::AFTER)
1001     ++MI;
1002 
1003   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1004     switch (Scope) {
1005     case SIAtomicScope::SYSTEM:
1006     case SIAtomicScope::AGENT:
1007       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1008       Changed = true;
1009       break;
1010     case SIAtomicScope::WORKGROUP:
1011     case SIAtomicScope::WAVEFRONT:
1012     case SIAtomicScope::SINGLETHREAD:
1013       // No cache to invalidate.
1014       break;
1015     default:
1016       llvm_unreachable("Unsupported synchronization scope");
1017     }
1018   }
1019 
1020   /// The scratch address space does not need the global memory cache
1021   /// to be flushed as all memory operations by the same thread are
1022   /// sequentially consistent, and no other thread can access scratch
1023   /// memory.
1024 
1025   /// Other address spaces do not have a cache.
1026 
1027   if (Pos == Position::AFTER)
1028     --MI;
1029 
1030   return Changed;
1031 }
1032 
1033 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1034                                        SIAtomicScope Scope,
1035                                        SIAtomicAddrSpace AddrSpace,
1036                                        bool IsCrossAddrSpaceOrdering,
1037                                        Position Pos) const {
1038     return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1039                       IsCrossAddrSpaceOrdering, Pos);
1040 }
1041 
1042 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1043                                        SIAtomicScope Scope,
1044                                        SIAtomicAddrSpace AddrSpace,
1045                                        Position Pos) const {
1046   if (!InsertCacheInv)
1047     return false;
1048 
1049   bool Changed = false;
1050 
1051   MachineBasicBlock &MBB = *MI->getParent();
1052   DebugLoc DL = MI->getDebugLoc();
1053 
1054   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1055 
1056   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1057                                     ? AMDGPU::BUFFER_WBINVL1
1058                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1059 
1060   if (Pos == Position::AFTER)
1061     ++MI;
1062 
1063   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1064     switch (Scope) {
1065     case SIAtomicScope::SYSTEM:
1066     case SIAtomicScope::AGENT:
1067       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1068       Changed = true;
1069       break;
1070     case SIAtomicScope::WORKGROUP:
1071     case SIAtomicScope::WAVEFRONT:
1072     case SIAtomicScope::SINGLETHREAD:
1073       // No cache to invalidate.
1074       break;
1075     default:
1076       llvm_unreachable("Unsupported synchronization scope");
1077     }
1078   }
1079 
1080   /// The scratch address space does not need the global memory cache
1081   /// to be flushed as all memory operations by the same thread are
1082   /// sequentially consistent, and no other thread can access scratch
1083   /// memory.
1084 
1085   /// Other address spaces do not have a cache.
1086 
1087   if (Pos == Position::AFTER)
1088     --MI;
1089 
1090   return Changed;
1091 }
1092 
1093 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1094     const MachineBasicBlock::iterator &MI,
1095     SIAtomicScope Scope,
1096     SIAtomicAddrSpace AddrSpace) const {
1097   assert(MI->mayLoad() && !MI->mayStore());
1098   bool Changed = false;
1099 
1100   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1101     switch (Scope) {
1102     case SIAtomicScope::SYSTEM:
1103     case SIAtomicScope::AGENT:
1104       Changed |= enableGLCBit(MI);
1105       break;
1106     case SIAtomicScope::WORKGROUP:
1107       // In threadgroup split mode the waves of a work-group can be executing on
1108       // different CUs. Therefore need to bypass the L1 which is per CU.
1109       // Otherwise in non-threadgroup split mode all waves of a work-group are
1110       // on the same CU, and so the L1 does not need to be bypassed.
1111       if (ST.isTgSplitEnabled()) Changed |= enableGLCBit(MI);
1112       break;
1113     case SIAtomicScope::WAVEFRONT:
1114     case SIAtomicScope::SINGLETHREAD:
1115       // No cache to bypass.
1116       break;
1117     default:
1118       llvm_unreachable("Unsupported synchronization scope");
1119     }
1120   }
1121 
1122   /// The scratch address space does not need the global memory caches
1123   /// to be bypassed as all memory operations by the same thread are
1124   /// sequentially consistent, and no other thread can access scratch
1125   /// memory.
1126 
1127   /// Other address spaces do not have a cache.
1128 
1129   return Changed;
1130 }
1131 
1132 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1133     const MachineBasicBlock::iterator &MI,
1134     SIAtomicScope Scope,
1135     SIAtomicAddrSpace AddrSpace) const {
1136   assert(!MI->mayLoad() && MI->mayStore());
1137   bool Changed = false;
1138 
1139   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1140     switch (Scope) {
1141     case SIAtomicScope::SYSTEM:
1142     case SIAtomicScope::AGENT:
1143       /// Do not set glc for store atomic operations as they implicitly write
1144       /// through the L1 cache.
1145       break;
1146     case SIAtomicScope::WORKGROUP:
1147     case SIAtomicScope::WAVEFRONT:
1148     case SIAtomicScope::SINGLETHREAD:
1149       // No cache to bypass. Store atomics implicitly write through the L1
1150       // cache.
1151       break;
1152     default:
1153       llvm_unreachable("Unsupported synchronization scope");
1154     }
1155   }
1156 
1157   /// The scratch address space does not need the global memory caches
1158   /// to be bypassed as all memory operations by the same thread are
1159   /// sequentially consistent, and no other thread can access scratch
1160   /// memory.
1161 
1162   /// Other address spaces do not have a cache.
1163 
1164   return Changed;
1165 }
1166 
1167 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1168     const MachineBasicBlock::iterator &MI,
1169     SIAtomicScope Scope,
1170     SIAtomicAddrSpace AddrSpace) const {
1171   assert(MI->mayLoad() && MI->mayStore());
1172   bool Changed = false;
1173 
1174   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1175     switch (Scope) {
1176     case SIAtomicScope::SYSTEM:
1177     case SIAtomicScope::AGENT:
1178       /// Do not set glc for RMW atomic operations as they implicitly bypass
1179       /// the L1 cache, and the glc bit is instead used to indicate if they are
1180       /// return or no-return.
1181       break;
1182     case SIAtomicScope::WORKGROUP:
1183     case SIAtomicScope::WAVEFRONT:
1184     case SIAtomicScope::SINGLETHREAD:
1185       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1186       break;
1187     default:
1188       llvm_unreachable("Unsupported synchronization scope");
1189     }
1190   }
1191 
1192   return Changed;
1193 }
1194 
1195 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1196     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1197     bool IsVolatile, bool IsNonTemporal) const {
1198   // Only handle load and store, not atomic read-modify-write instructions. The
1199   // latter use glc to indicate if the atomic returns a result and so must not
1200   // be used for cache control.
1201   assert(MI->mayLoad() ^ MI->mayStore());
1202 
1203   // Only update load and store, not LLVM IR atomic read-modify-write
1204   // instructions. The latter are always marked as volatile, so they cannot be
1205   // handled here sensibly without pessimizing all atomics. They also do not
1206   // support the nontemporal attribute.
1207   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1208 
1209   bool Changed = false;
1210 
1211   if (IsVolatile) {
1212     if (Op == SIMemOp::LOAD) {
1213       Changed |= enableGLCBit(MI);
1214     }
1215 
1216     // Ensure operation has completed at system scope to cause all volatile
1217     // operations to be visible outside the program in a global order. Do not
1218     // request cross address space as only the global address space can be
1219     // observable outside the program, so no need to cause a waitcnt for LDS
1220     // address space operations.
1221     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1222                           Position::AFTER);
1223 
1224     return Changed;
1225   }
1226 
1227   if (IsNonTemporal) {
1228     // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
1229     Changed |= enableGLCBit(MI);
1230     Changed |= enableSLCBit(MI);
1231     return Changed;
1232   }
1233 
1234   return Changed;
1235 }
1236 
1237 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1238                                       SIAtomicScope Scope,
1239                                       SIAtomicAddrSpace AddrSpace,
1240                                       SIMemOp Op,
1241                                       bool IsCrossAddrSpaceOrdering,
1242                                       Position Pos) const {
1243   if (ST.isTgSplitEnabled()) {
1244     // In threadgroup split mode the waves of a work-group can be executing on
1245     // different CUs. Therefore need to wait for global or GDS memory operations
1246     // to complete to ensure they are visible to waves in the other CUs.
1247     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1248     // the same CU, so no need to wait for global memory as all waves in the
1249     // work-group access the same L1, nor wait for GDS as accesses are ordered
1250     // on a CU.
1251     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1252                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1253         (Scope == SIAtomicScope::WORKGROUP)) {
1254       // Same as GFX7 using agent scope.
1255       Scope = SIAtomicScope::AGENT;
1256     }
1257     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1258     // LDS memory operations.
1259     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1260   }
1261   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1262                                         IsCrossAddrSpaceOrdering, Pos);
1263 }
1264 
1265 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1266                                          SIAtomicScope Scope,
1267                                          SIAtomicAddrSpace AddrSpace,
1268                                          Position Pos) const {
1269   if (!InsertCacheInv)
1270     return false;
1271 
1272   bool Changed = false;
1273 
1274   MachineBasicBlock &MBB = *MI->getParent();
1275   DebugLoc DL = MI->getDebugLoc();
1276 
1277   if (Pos == Position::AFTER)
1278     ++MI;
1279 
1280   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1281     switch (Scope) {
1282     case SIAtomicScope::SYSTEM:
1283       // Ensures that following loads will not see stale remote VMEM data or
1284       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1285       // CC will never be stale due to the local memory probes.
1286       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1287       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1288       // hardware does not reorder memory operations by the same wave with
1289       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1290       // remove any cache lines of earlier writes by the same wave and ensures
1291       // later reads by the same wave will refetch the cache lines.
1292       Changed = true;
1293       break;
1294     case SIAtomicScope::AGENT:
1295       // Same as GFX7.
1296       break;
1297     case SIAtomicScope::WORKGROUP:
1298       // In threadgroup split mode the waves of a work-group can be executing on
1299       // different CUs. Therefore need to invalidate the L1 which is per CU.
1300       // Otherwise in non-threadgroup split mode all waves of a work-group are
1301       // on the same CU, and so the L1 does not need to be invalidated.
1302       if (ST.isTgSplitEnabled()) {
1303         // Same as GFX7 using agent scope.
1304         Scope = SIAtomicScope::AGENT;
1305       }
1306       break;
1307     case SIAtomicScope::WAVEFRONT:
1308     case SIAtomicScope::SINGLETHREAD:
1309       // Same as GFX7.
1310       break;
1311     default:
1312       llvm_unreachable("Unsupported synchronization scope");
1313     }
1314   }
1315 
1316   /// The scratch address space does not need the global memory cache
1317   /// to be flushed as all memory operations by the same thread are
1318   /// sequentially consistent, and no other thread can access scratch
1319   /// memory.
1320 
1321   /// Other address spaces do not have a cache.
1322 
1323   if (Pos == Position::AFTER)
1324     --MI;
1325 
1326   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1327 
1328   return Changed;
1329 }
1330 
1331 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1332                                          SIAtomicScope Scope,
1333                                          SIAtomicAddrSpace AddrSpace,
1334                                          bool IsCrossAddrSpaceOrdering,
1335                                          Position Pos) const {
1336   bool Changed = false;
1337 
1338   MachineBasicBlock &MBB = *MI->getParent();
1339   DebugLoc DL = MI->getDebugLoc();
1340 
1341   if (Pos == Position::AFTER)
1342     ++MI;
1343 
1344   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1345     switch (Scope) {
1346     case SIAtomicScope::SYSTEM:
1347       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1348       // hardware does not reorder memory operations by the same wave with
1349       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1350       // to initiate writeback of any dirty cache lines of earlier writes by the
1351       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1352       // writeback has completed.
1353       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
1354       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1355       // vmcnt(0)" needed by the "BUFFER_WBL2".
1356       Changed = true;
1357       break;
1358     case SIAtomicScope::AGENT:
1359     case SIAtomicScope::WORKGROUP:
1360     case SIAtomicScope::WAVEFRONT:
1361     case SIAtomicScope::SINGLETHREAD:
1362       // Same as GFX7.
1363       break;
1364     default:
1365       llvm_unreachable("Unsupported synchronization scope");
1366     }
1367   }
1368 
1369   if (Pos == Position::AFTER)
1370     --MI;
1371 
1372   Changed |=
1373       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1374                                         IsCrossAddrSpaceOrdering, Pos);
1375 
1376   return Changed;
1377 }
1378 
1379 bool SIGfx10CacheControl::enableLoadCacheBypass(
1380     const MachineBasicBlock::iterator &MI,
1381     SIAtomicScope Scope,
1382     SIAtomicAddrSpace AddrSpace) const {
1383   assert(MI->mayLoad() && !MI->mayStore());
1384   bool Changed = false;
1385 
1386   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1387     /// TODO Do not set glc for rmw atomic operations as they
1388     /// implicitly bypass the L0/L1 caches.
1389 
1390     switch (Scope) {
1391     case SIAtomicScope::SYSTEM:
1392     case SIAtomicScope::AGENT:
1393       Changed |= enableGLCBit(MI);
1394       Changed |= enableDLCBit(MI);
1395       break;
1396     case SIAtomicScope::WORKGROUP:
1397       // In WGP mode the waves of a work-group can be executing on either CU of
1398       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1399       // CU mode all waves of a work-group are on the same CU, and so the L0
1400       // does not need to be bypassed.
1401       if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
1402       break;
1403     case SIAtomicScope::WAVEFRONT:
1404     case SIAtomicScope::SINGLETHREAD:
1405       // No cache to bypass.
1406       break;
1407     default:
1408       llvm_unreachable("Unsupported synchronization scope");
1409     }
1410   }
1411 
1412   /// The scratch address space does not need the global memory caches
1413   /// to be bypassed as all memory operations by the same thread are
1414   /// sequentially consistent, and no other thread can access scratch
1415   /// memory.
1416 
1417   /// Other address spaces do not have a cache.
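       /// For example, an agent-scope acquire load selected to a global load
       /// ends up with both cache-bypass bits set (illustrative encoding):
       ///   global_load_dword v0, v[2:3], off glc dlc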
1418 
1419   return Changed;
1420 }
1421 
1422 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1423     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1424     bool IsVolatile, bool IsNonTemporal) const {
1425 
1426   // Only handle load and store, not atomic read-modify-write instructions. The
1427   // latter use glc to indicate if the atomic returns a result and so must not
1428   // be used for cache control.
1429   assert(MI->mayLoad() ^ MI->mayStore());
1430 
1431   // Only update load and store, not LLVM IR atomic read-modify-write
1432   // instructions. The latter are always marked as volatile in LLVM IR, so they
1433   // cannot sensibly be handled here without pessimizing all atomics. They also
1434   // do not support the nontemporal attribute.
1435   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1436 
1437   bool Changed = false;
1438 
1439   if (IsVolatile) {
1440 
1441     if (Op == SIMemOp::LOAD) {
1442       Changed |= enableGLCBit(MI);
1443       Changed |= enableDLCBit(MI);
1444     }
1445 
1446     // Ensure operation has completed at system scope to cause all volatile
1447     // operations to be visible outside the program in a global order. Do not
1448     // request cross address space as only the global address space can be
1449     // observable outside the program, so no need to cause a waitcnt for LDS
1450     // address space operations.
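         // E.g. a volatile global load becomes (sketch):
         //   global_load_dword v0, v[2:3], off glc dlc
         //   s_waitcnt vmcnt(0)
         // and a volatile global store is followed by "s_waitcnt_vscnt null, 0x0".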
1451     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1452                           Position::AFTER);
1453     return Changed;
1454   }
1455 
1456   if (IsNonTemporal) {
1457     // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
1458     Changed |= enableSLCBit(MI);
1459     return Changed;
1460   }
1461 
1462   return Changed;
1463 }
1464 
1465 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1466                                      SIAtomicScope Scope,
1467                                      SIAtomicAddrSpace AddrSpace,
1468                                      SIMemOp Op,
1469                                      bool IsCrossAddrSpaceOrdering,
1470                                      Position Pos) const {
1471   bool Changed = false;
1472 
1473   MachineBasicBlock &MBB = *MI->getParent();
1474   DebugLoc DL = MI->getDebugLoc();
1475 
1476   if (Pos == Position::AFTER)
1477     ++MI;
1478 
1479   bool VMCnt = false;
1480   bool VSCnt = false;
1481   bool LGKMCnt = false;
1482 
1483   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1484       SIAtomicAddrSpace::NONE) {
1485     switch (Scope) {
1486     case SIAtomicScope::SYSTEM:
1487     case SIAtomicScope::AGENT:
1488       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1489         VMCnt |= true;
1490       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1491         VSCnt |= true;
1492       break;
1493     case SIAtomicScope::WORKGROUP:
1494       // In WGP mode the waves of a work-group can be executing on either CU of
1495       // the WGP. Therefore need to wait for operations to complete to ensure
1496       // they are visible to waves in the other CU as the L0 is per CU.
1497       // Otherwise in CU mode all waves of a work-group are on the same CU,
1498       // which shares the same L0.
1499       if (!ST.isCuModeEnabled()) {
1500         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1501           VMCnt |= true;
1502         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1503           VSCnt |= true;
1504       }
1505       break;
1506     case SIAtomicScope::WAVEFRONT:
1507     case SIAtomicScope::SINGLETHREAD:
1508       // The L0 cache keeps all memory operations in order for
1509       // work-items in the same wavefront.
1510       break;
1511     default:
1512       llvm_unreachable("Unsupported synchronization scope");
1513     }
1514   }
1515 
1516   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1517     switch (Scope) {
1518     case SIAtomicScope::SYSTEM:
1519     case SIAtomicScope::AGENT:
1520     case SIAtomicScope::WORKGROUP:
1521       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1522       // not needed as LDS operations for all waves are executed in a total
1523       // global ordering as observed by all waves. Required if also
1524       // synchronizing with global/GDS memory as LDS operations could be
1525       // reordered with respect to later global/GDS memory operations of the
1526       // same wave.
1527       LGKMCnt |= IsCrossAddrSpaceOrdering;
1528       break;
1529     case SIAtomicScope::WAVEFRONT:
1530     case SIAtomicScope::SINGLETHREAD:
1531       // The LDS keeps all memory operations in order for
1532       // the same wavesfront.
1533       break;
1534     default:
1535       llvm_unreachable("Unsupported synchronization scope");
1536     }
1537   }
1538 
1539   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1540     switch (Scope) {
1541     case SIAtomicScope::SYSTEM:
1542     case SIAtomicScope::AGENT:
1543       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1544       // is not needed as GDS operations for all waves are executed in a total
1545       // global ordering as observed by all waves. Required if also
1546       // synchronizing with global/LDS memory as GDS operations could be
1547       // reordered with respect to later global/LDS memory operations of the
1548       // same wave.
1549       LGKMCnt |= IsCrossAddrSpaceOrdering;
1550       break;
1551     case SIAtomicScope::WORKGROUP:
1552     case SIAtomicScope::WAVEFRONT:
1553     case SIAtomicScope::SINGLETHREAD:
1554       // The GDS keeps all memory operations in order for
1555       // the same work-group.
1556       break;
1557     default:
1558       llvm_unreachable("Unsupported synchronization scope");
1559     }
1560   }
1561 
1562   if (VMCnt || LGKMCnt) {
1563     unsigned WaitCntImmediate =
1564       AMDGPU::encodeWaitcnt(IV,
1565                             VMCnt ? 0 : getVmcntBitMask(IV),
1566                             getExpcntBitMask(IV),
1567                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
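         // With both flags set this encodes "s_waitcnt vmcnt(0) lgkmcnt(0)";
         // expcnt keeps its full bit mask so it is not waited on.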
1568     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1569     Changed = true;
1570   }
1571 
1572   if (VSCnt) {
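         // Emits "s_waitcnt_vscnt null, 0x0" to wait for all outstanding VMEM
         // store-counter (vscnt) events of the wave.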
1573     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1574       .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1575       .addImm(0);
1576     Changed = true;
1577   }
1578 
1579   if (Pos == Position::AFTER)
1580     --MI;
1581 
1582   return Changed;
1583 }
1584 
1585 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1586                                         SIAtomicScope Scope,
1587                                         SIAtomicAddrSpace AddrSpace,
1588                                         Position Pos) const {
1589   if (!InsertCacheInv)
1590     return false;
1591 
1592   bool Changed = false;
1593 
1594   MachineBasicBlock &MBB = *MI->getParent();
1595   DebugLoc DL = MI->getDebugLoc();
1596 
1597   if (Pos == Position::AFTER)
1598     ++MI;
1599 
1600   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1601     switch (Scope) {
1602     case SIAtomicScope::SYSTEM:
1603     case SIAtomicScope::AGENT:
1604       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1605       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1606       Changed = true;
1607       break;
1608     case SIAtomicScope::WORKGROUP:
1609       // In WGP mode the waves of a work-group can be executing on either CU of
1610       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1611       // in CU mode all waves of a work-group are on the same CU, and so the
1612       // L0 does not need to be invalidated.
1613       if (!ST.isCuModeEnabled()) {
1614         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1615         Changed = true;
1616       }
1617       break;
1618     case SIAtomicScope::WAVEFRONT:
1619     case SIAtomicScope::SINGLETHREAD:
1620       // No cache to invalidate.
1621       break;
1622     default:
1623       llvm_unreachable("Unsupported synchronization scope");
1624     }
1625   }
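       // For reference, an agent- or system-scope acquire emits (sketch):
       //   buffer_gl0_inv
       //   buffer_gl1_inv
       // while a workgroup-scope acquire in WGP mode only needs buffer_gl0_inv.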
1626 
1627   /// The scratch address space does not need the global memory cache
1628   /// to be flushed as all memory operations by the same thread are
1629   /// sequentially consistent, and no other thread can access scratch
1630   /// memory.
1631 
1632   /// Other address spaces do not have a cache.
1633 
1634   if (Pos == Position::AFTER)
1635     --MI;
1636 
1637   return Changed;
1638 }
1639 
1640 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
1641   if (AtomicPseudoMIs.empty())
1642     return false;
1643 
1644   for (auto &MI : AtomicPseudoMIs)
1645     MI->eraseFromParent();
1646 
1647   AtomicPseudoMIs.clear();
1648   return true;
1649 }
1650 
1651 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
1652                                    MachineBasicBlock::iterator &MI) {
1653   assert(MI->mayLoad() && !MI->mayStore());
1654 
1655   bool Changed = false;
1656 
1657   if (MOI.isAtomic()) {
1658     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1659         MOI.getOrdering() == AtomicOrdering::Acquire ||
1660         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1661       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
1662                                            MOI.getOrderingAddrSpace());
1663     }
1664 
1665     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1666       Changed |= CC->insertWait(MI, MOI.getScope(),
1667                                 MOI.getOrderingAddrSpace(),
1668                                 SIMemOp::LOAD | SIMemOp::STORE,
1669                                 MOI.getIsCrossAddressSpaceOrdering(),
1670                                 Position::BEFORE);
1671 
1672     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1673         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1674       Changed |= CC->insertWait(MI, MOI.getScope(),
1675                                 MOI.getInstrAddrSpace(),
1676                                 SIMemOp::LOAD,
1677                                 MOI.getIsCrossAddressSpaceOrdering(),
1678                                 Position::AFTER);
1679       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1680                                    MOI.getOrderingAddrSpace(),
1681                                    Position::AFTER);
1682     }
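         // E.g. on GFX10 an agent-scope sequentially consistent load expands
         // roughly to (sketch):
         //   s_waitcnt vmcnt(0)            ; BEFORE wait (plus vscnt/lgkmcnt
         //   s_waitcnt_vscnt null, 0x0     ;  waits as required)
         //   global_load_dword ... glc dlc ; cache bypass
         //   s_waitcnt vmcnt(0)            ; AFTER wait
         //   buffer_gl0_inv
         //   buffer_gl1_inv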
1683 
1684     return Changed;
1685   }
1686 
1687   // Atomic instructions already bypass caches to the scope specified by the
1688   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1689   // need additional treatment.
1690   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
1691                                                 SIMemOp::LOAD, MOI.isVolatile(),
1692                                                 MOI.isNonTemporal());
1693   return Changed;
1694 }
1695 
1696 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
1697                                     MachineBasicBlock::iterator &MI) {
1698   assert(!MI->mayLoad() && MI->mayStore());
1699 
1700   bool Changed = false;
1701 
1702   if (MOI.isAtomic()) {
1703     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1704         MOI.getOrdering() == AtomicOrdering::Release ||
1705         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1706       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
1707                                             MOI.getOrderingAddrSpace());
1708     }
1709 
1710     if (MOI.getOrdering() == AtomicOrdering::Release ||
1711         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1712       Changed |= CC->insertRelease(MI, MOI.getScope(),
1713                                    MOI.getOrderingAddrSpace(),
1714                                    MOI.getIsCrossAddressSpaceOrdering(),
1715                                    Position::BEFORE);
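         // E.g. on GFX90A a system-scope release store is preceded by a
         // "buffer_wbl2" and an "s_waitcnt vmcnt(0)" (see insertRelease above) so
         // earlier writes are written back before the store becomes visible.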
1716 
1717     return Changed;
1718   }
1719 
1720   // Atomic instructions already bypass caches to the scope specified by the
1721   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1722   // need additional treatment.
1723   Changed |= CC->enableVolatileAndOrNonTemporal(
1724       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
1725       MOI.isNonTemporal());
1726   return Changed;
1727 }
1728 
1729 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
1730                                           MachineBasicBlock::iterator &MI) {
1731   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
1732 
1733   AtomicPseudoMIs.push_back(MI);
1734   bool Changed = false;
1735 
1736   if (MOI.isAtomic()) {
1737     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1738         MOI.getOrdering() == AtomicOrdering::Release ||
1739         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1740         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1741       /// TODO: This relies on a barrier always generating a waitcnt
1742       /// for LDS to ensure it is not reordered with the completion of
1743       /// the preceding LDS operations. If the barrier had a memory
1744       /// ordering and memory scope, then the library would not need to
1745       /// generate a fence. Support for barriers could be added in this
1746       /// file; SIInsertWaitcnts.cpp could then stop unconditionally
1747       /// adding an S_WAITCNT before an S_BARRIER.
1748       Changed |= CC->insertRelease(MI, MOI.getScope(),
1749                                    MOI.getOrderingAddrSpace(),
1750                                    MOI.getIsCrossAddressSpaceOrdering(),
1751                                    Position::BEFORE);
1752 
1753     // TODO: If both release and invalidate are happening they could be combined
1754     // to use the single "BUFFER_WBINV*" instruction. This could be done by
1755     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
1756     // track cache invalidate and write back instructions.
1757 
1758     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1759         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1760         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1761       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1762                                    MOI.getOrderingAddrSpace(),
1763                                    Position::BEFORE);
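         // E.g. a seq_cst fence at agent scope thus becomes the release wait(s)
         // followed by the acquire cache invalidation for the target (sketch).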
1764 
1765     return Changed;
1766   }
1767 
1768   return Changed;
1769 }
1770 
1771 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
1772   MachineBasicBlock::iterator &MI) {
1773   assert(MI->mayLoad() && MI->mayStore());
1774 
1775   bool Changed = false;
1776 
1777   if (MOI.isAtomic()) {
1778     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1779         MOI.getOrdering() == AtomicOrdering::Acquire ||
1780         MOI.getOrdering() == AtomicOrdering::Release ||
1781         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1782         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1783       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
1784                                           MOI.getInstrAddrSpace());
1785     }
1786 
1787     if (MOI.getOrdering() == AtomicOrdering::Release ||
1788         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1789         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1790         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
1791       Changed |= CC->insertRelease(MI, MOI.getScope(),
1792                                    MOI.getOrderingAddrSpace(),
1793                                    MOI.getIsCrossAddressSpaceOrdering(),
1794                                    Position::BEFORE);
1795 
1796     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1797         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1798         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1799         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
1800         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
1801       Changed |= CC->insertWait(MI, MOI.getScope(),
1802                                 MOI.getInstrAddrSpace(),
1803                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
1804                                                    SIMemOp::STORE,
1805                                 MOI.getIsCrossAddressSpaceOrdering(),
1806                                 Position::AFTER);
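           // Returning atomics are distinguished from non-returning ones because
           // targets with a separate store counter (e.g. GFX10's vscnt) track
           // them differently, so the AFTER wait must use the matching counter.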
1807       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1808                                    MOI.getOrderingAddrSpace(),
1809                                    Position::AFTER);
1810     }
1811 
1812     return Changed;
1813   }
1814 
1815   return Changed;
1816 }
1817 
1818 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
1819   bool Changed = false;
1820 
1821   SIMemOpAccess MOA(MF);
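       // Pick the cache-control implementation matching the subtarget's
       // generation (e.g. the GFX7, GFX90A or GFX10 variants above).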
1822   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
1823 
1824   for (auto &MBB : MF) {
1825     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
1826 
1827       // Unbundle instructions after the post-RA scheduler.
1828       if (MI->isBundle() && MI->mayLoadOrStore()) {
1829         MachineBasicBlock::instr_iterator II(MI->getIterator());
1830         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
1831              I != E && I->isBundledWithPred(); ++I) {
1832           I->unbundleFromPred();
1833           for (MachineOperand &MO : I->operands())
1834             if (MO.isReg())
1835               MO.setIsInternalRead(false);
1836         }
1837 
1838         MI->eraseFromParent();
1839         MI = II->getIterator();
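             // MI now refers to the first instruction that was in the bundle; it
             // is processed by the checks below like any other instruction.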
1840       }
1841 
1842       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
1843         continue;
1844 
1845       if (const auto &MOI = MOA.getLoadInfo(MI))
1846         Changed |= expandLoad(MOI.getValue(), MI);
1847       else if (const auto &MOI = MOA.getStoreInfo(MI))
1848         Changed |= expandStore(MOI.getValue(), MI);
1849       else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
1850         Changed |= expandAtomicFence(MOI.getValue(), MI);
1851       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
1852         Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
1853     }
1854   }
1855 
1856   Changed |= removeAtomicPseudoMIs();
1857   return Changed;
1858 }
1859 
1860 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
1861 
1862 char SIMemoryLegalizer::ID = 0;
1863 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
1864 
1865 FunctionPass *llvm::createSIMemoryLegalizerPass() {
1866   return new SIMemoryLegalizer();
1867 }
1868