xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (revision 924226fba12cc9a228c73b956e1b7fa24c60b055)
1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/IR/DiagnosticInfo.h"
23 #include "llvm/Support/AtomicOrdering.h"
24 #include "llvm/Support/TargetParser.h"
25 
26 using namespace llvm;
27 using namespace llvm::AMDGPU;
28 
29 #define DEBUG_TYPE "si-memory-legalizer"
30 #define PASS_NAME "SI Memory Legalizer"
31 
32 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
33     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
34     cl::desc("Use this to skip inserting cache invalidating instructions."));
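// Illustrative use (assuming the standard llc driver): passing
// -amdgcn-skip-cache-invalidations on the llc command line suppresses the
// cache invalidating instructions this pass would otherwise insert.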
35 
36 namespace {
37 
38 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
39 
40 /// Memory operation flags. Can be ORed together.
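/// For example, SIMemOp::LOAD | SIMemOp::STORE selects both loads and stores
/// (as used by SIGfx6CacheControl::insertRelease below).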
41 enum class SIMemOp {
42   NONE = 0u,
43   LOAD = 1u << 0,
44   STORE = 1u << 1,
45   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
46 };
47 
48 /// Position to insert a new instruction relative to an existing
49 /// instruction.
50 enum class Position {
51   BEFORE,
52   AFTER
53 };
54 
55 /// The atomic synchronization scopes supported by the AMDGPU target.
56 enum class SIAtomicScope {
57   NONE,
58   SINGLETHREAD,
59   WAVEFRONT,
60   WORKGROUP,
61   AGENT,
62   SYSTEM
63 };
64 
65 /// The distinct address spaces supported by the AMDGPU target for
66 /// atomic memory operations. Can be ORed together.
67 enum class SIAtomicAddrSpace {
68   NONE = 0u,
69   GLOBAL = 1u << 0,
70   LDS = 1u << 1,
71   SCRATCH = 1u << 2,
72   GDS = 1u << 3,
73   OTHER = 1u << 4,
74 
75   /// The address spaces that can be accessed by a FLAT instruction.
76   FLAT = GLOBAL | LDS | SCRATCH,
77 
78   /// The address spaces that support atomic instructions.
79   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
80 
81   /// All address spaces.
82   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
83 
84   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
85 };
86 
87 class SIMemOpInfo final {
88 private:
89 
90   friend class SIMemOpAccess;
91 
92   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
93   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
94   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
95   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
96   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
97   bool IsCrossAddressSpaceOrdering = false;
98   bool IsVolatile = false;
99   bool IsNonTemporal = false;
100 
101   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
102               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
103               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
104               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
105               bool IsCrossAddressSpaceOrdering = true,
106               AtomicOrdering FailureOrdering =
107                 AtomicOrdering::SequentiallyConsistent,
108               bool IsVolatile = false,
109               bool IsNonTemporal = false)
110     : Ordering(Ordering), FailureOrdering(FailureOrdering),
111       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
112       InstrAddrSpace(InstrAddrSpace),
113       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
114       IsVolatile(IsVolatile),
115       IsNonTemporal(IsNonTemporal) {
116 
117     if (Ordering == AtomicOrdering::NotAtomic) {
118       assert(Scope == SIAtomicScope::NONE &&
119              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
120              !IsCrossAddressSpaceOrdering &&
121              FailureOrdering == AtomicOrdering::NotAtomic);
122       return;
123     }
124 
125     assert(Scope != SIAtomicScope::NONE &&
126            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
127                SIAtomicAddrSpace::NONE &&
128            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
129                SIAtomicAddrSpace::NONE);
130 
131     // There is also no cross address space ordering if the ordering
132     // address space is the same as the instruction address space and
133     // only contains a single address space.
134     if ((OrderingAddrSpace == InstrAddrSpace) &&
135         isPowerOf2_32(uint32_t(InstrAddrSpace)))
136       this->IsCrossAddressSpaceOrdering = false;
137 
138     // Limit the scope to the maximum supported by the instruction's address
139     // spaces.
140     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
141         SIAtomicAddrSpace::NONE) {
142       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
143     } else if ((InstrAddrSpace &
144                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
145                SIAtomicAddrSpace::NONE) {
146       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
147     } else if ((InstrAddrSpace &
148                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
149                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
150       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
151     }
152   }
153 
154 public:
155   /// \returns Atomic synchronization scope of the machine instruction used to
156   /// create this SIMemOpInfo.
157   SIAtomicScope getScope() const {
158     return Scope;
159   }
160 
161   /// \returns Ordering constraint of the machine instruction used to
162   /// create this SIMemOpInfo.
163   AtomicOrdering getOrdering() const {
164     return Ordering;
165   }
166 
167   /// \returns Failure ordering constraint of the machine instruction used to
168   /// create this SIMemOpInfo.
169   AtomicOrdering getFailureOrdering() const {
170     return FailureOrdering;
171   }
172 
173   /// \returns The address spaces accessed by the machine
174   /// instruction used to create this SIMemOpInfo.
175   SIAtomicAddrSpace getInstrAddrSpace() const {
176     return InstrAddrSpace;
177   }
178 
179   /// \returns The address spaces that must be ordered by the machine
180   /// instruction used to create this SIMemOpInfo.
181   SIAtomicAddrSpace getOrderingAddrSpace() const {
182     return OrderingAddrSpace;
183   }
184 
185   /// \returns True iff memory ordering of operations on
186   /// different address spaces is required.
187   bool getIsCrossAddressSpaceOrdering() const {
188     return IsCrossAddressSpaceOrdering;
189   }
190 
191   /// \returns True if memory access of the machine instruction used to
192   /// create this SIMemOpInfo is volatile, false otherwise.
193   bool isVolatile() const {
194     return IsVolatile;
195   }
196 
197   /// \returns True if memory access of the machine instruction used to
198   /// create this SIMemOpInfo is nontemporal, false otherwise.
199   bool isNonTemporal() const {
200     return IsNonTemporal;
201   }
202 
203   /// \returns True if ordering constraint of the machine instruction used to
204   /// create this SIMemOpInfo is unordered or higher, false otherwise.
205   bool isAtomic() const {
206     return Ordering != AtomicOrdering::NotAtomic;
207   }
208 
209 };
210 
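/// Extracts the memory-model information (SIMemOpInfo) needed by the legalizer
/// from a machine instruction and its machine memory operands. A rough usage
/// sketch: the pass queries getLoadInfo, getStoreInfo, getAtomicFenceInfo and
/// getAtomicCmpxchgOrRmwInfo in turn and expands whichever kind matches.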
211 class SIMemOpAccess final {
212 private:
213   AMDGPUMachineModuleInfo *MMI = nullptr;
214 
215   /// Reports unsupported message \p Msg for \p MI to LLVM context.
216   void reportUnsupported(const MachineBasicBlock::iterator &MI,
217                          const char *Msg) const;
218 
219   /// Inspects the target synchronization scope \p SSID and determines
220   /// the SI atomic scope it corresponds to, the address spaces it
221   /// covers, and whether the memory ordering applies between address
222   /// spaces.
223   Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
224   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
225 
226   /// \returns A bit set of the address spaces accessed by \p AS.
227   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
228 
229   /// \returns Info constructed from \p MI, which has at least one machine
230   /// memory operand.
231   Optional<SIMemOpInfo> constructFromMIWithMMO(
232       const MachineBasicBlock::iterator &MI) const;
233 
234 public:
235   /// Construct class to support accessing the machine memory operands
236   /// of instructions in the machine function \p MF.
237   SIMemOpAccess(MachineFunction &MF);
238 
239   /// \returns Load info if \p MI is a load operation, "None" otherwise.
240   Optional<SIMemOpInfo> getLoadInfo(
241       const MachineBasicBlock::iterator &MI) const;
242 
243   /// \returns Store info if \p MI is a store operation, "None" otherwise.
244   Optional<SIMemOpInfo> getStoreInfo(
245       const MachineBasicBlock::iterator &MI) const;
246 
247   /// \returns Atomic fence info if \p MI is an atomic fence operation,
248   /// "None" otherwise.
249   Optional<SIMemOpInfo> getAtomicFenceInfo(
250       const MachineBasicBlock::iterator &MI) const;
251 
252   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
253   /// rmw operation, "None" otherwise.
254   Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
255       const MachineBasicBlock::iterator &MI) const;
256 };
257 
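/// Interface that abstracts the per-generation cache policy bits and the
/// synchronization instructions the legalizer inserts; the SIGfx* subclasses
/// below implement these hooks for each hardware generation.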
258 class SICacheControl {
259 protected:
260 
261   /// AMDGPU subtarget info.
262   const GCNSubtarget &ST;
263 
264   /// Instruction info.
265   const SIInstrInfo *TII = nullptr;
266 
267   IsaVersion IV;
268 
269   /// Whether to insert cache invalidating instructions.
270   bool InsertCacheInv;
271 
272   SICacheControl(const GCNSubtarget &ST);
273 
274   /// Sets named bit \p Bit to "true" if present in instruction \p MI.
275   /// \returns True if \p MI is modified, false otherwise.
276   bool enableNamedBit(const MachineBasicBlock::iterator MI,
277                       AMDGPU::CPol::CPol Bit) const;
278 
279 public:
280 
281   /// Create a cache control for the subtarget \p ST.
282   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
283 
284   /// Update \p MI memory load instruction to bypass any caches up to
285   /// the \p Scope memory scope for address spaces \p
286   /// AddrSpace. Return true iff the instruction was modified.
287   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
288                                      SIAtomicScope Scope,
289                                      SIAtomicAddrSpace AddrSpace) const = 0;
290 
291   /// Update \p MI memory store instruction to bypass any caches up to
292   /// the \p Scope memory scope for address spaces \p
293   /// AddrSpace. Return true iff the instruction was modified.
294   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
295                                       SIAtomicScope Scope,
296                                       SIAtomicAddrSpace AddrSpace) const = 0;
297 
298   /// Update \p MI memory read-modify-write instruction to bypass any caches up
299   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
300   /// iff the instruction was modified.
301   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
302                                     SIAtomicScope Scope,
303                                     SIAtomicAddrSpace AddrSpace) const = 0;
304 
305   /// Update \p MI memory instruction of kind \p Op associated with address
306   /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
307   /// true iff the instruction was modified.
308   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
309                                               SIAtomicAddrSpace AddrSpace,
310                                               SIMemOp Op, bool IsVolatile,
311                                               bool IsNonTemporal) const = 0;
312 
313   /// Inserts any necessary instructions at position \p Pos relative
314   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
315   /// \p Op associated with address spaces \p AddrSpace have completed. Used
316   /// between memory instructions to enforce the order they become visible as
317   /// observed by other memory instructions executing in memory scope \p Scope.
318   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
319   /// address spaces. Returns true iff any instructions inserted.
320   virtual bool insertWait(MachineBasicBlock::iterator &MI,
321                           SIAtomicScope Scope,
322                           SIAtomicAddrSpace AddrSpace,
323                           SIMemOp Op,
324                           bool IsCrossAddrSpaceOrdering,
325                           Position Pos) const = 0;
326 
327   /// Inserts any necessary instructions at position \p Pos relative to
328   /// instruction \p MI to ensure any subsequent memory instructions of this
329   /// thread with address spaces \p AddrSpace will observe the previous memory
330   /// operations by any thread for memory scopes up to memory scope \p Scope.
331   /// Returns true iff any instructions inserted.
332   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
333                              SIAtomicScope Scope,
334                              SIAtomicAddrSpace AddrSpace,
335                              Position Pos) const = 0;
336 
337   /// Inserts any necessary instructions at position \p Pos relative to
338   /// instruction \p MI to ensure previous memory instructions by this thread
339   /// with address spaces \p AddrSpace have completed and can be observed by
340   /// subsequent memory instructions by any thread executing in memory scope \p
341   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
342   /// between address spaces. Returns true iff any instructions inserted.
343   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
344                              SIAtomicScope Scope,
345                              SIAtomicAddrSpace AddrSpace,
346                              bool IsCrossAddrSpaceOrdering,
347                              Position Pos) const = 0;
348 
349   /// Virtual destructor to allow derivations to be deleted.
350   virtual ~SICacheControl() = default;
351 
352 };
353 
354 class SIGfx6CacheControl : public SICacheControl {
355 protected:
356 
357   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
358   /// is modified, false otherwise.
359   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
360     return enableNamedBit(MI, AMDGPU::CPol::GLC);
361   }
362 
363   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
364   /// is modified, false otherwise.
365   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
366     return enableNamedBit(MI, AMDGPU::CPol::SLC);
367   }
368 
369 public:
370 
371   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
372 
373   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
374                              SIAtomicScope Scope,
375                              SIAtomicAddrSpace AddrSpace) const override;
376 
377   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
378                               SIAtomicScope Scope,
379                               SIAtomicAddrSpace AddrSpace) const override;
380 
381   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
382                             SIAtomicScope Scope,
383                             SIAtomicAddrSpace AddrSpace) const override;
384 
385   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
386                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
387                                       bool IsVolatile,
388                                       bool IsNonTemporal) const override;
389 
390   bool insertWait(MachineBasicBlock::iterator &MI,
391                   SIAtomicScope Scope,
392                   SIAtomicAddrSpace AddrSpace,
393                   SIMemOp Op,
394                   bool IsCrossAddrSpaceOrdering,
395                   Position Pos) const override;
396 
397   bool insertAcquire(MachineBasicBlock::iterator &MI,
398                      SIAtomicScope Scope,
399                      SIAtomicAddrSpace AddrSpace,
400                      Position Pos) const override;
401 
402   bool insertRelease(MachineBasicBlock::iterator &MI,
403                      SIAtomicScope Scope,
404                      SIAtomicAddrSpace AddrSpace,
405                      bool IsCrossAddrSpaceOrdering,
406                      Position Pos) const override;
407 };
408 
409 class SIGfx7CacheControl : public SIGfx6CacheControl {
410 public:
411 
412   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
413 
414   bool insertAcquire(MachineBasicBlock::iterator &MI,
415                      SIAtomicScope Scope,
416                      SIAtomicAddrSpace AddrSpace,
417                      Position Pos) const override;
418 
419 };
420 
421 class SIGfx90ACacheControl : public SIGfx7CacheControl {
422 public:
423 
424   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
425 
426   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
427                              SIAtomicScope Scope,
428                              SIAtomicAddrSpace AddrSpace) const override;
429 
430   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
431                               SIAtomicScope Scope,
432                               SIAtomicAddrSpace AddrSpace) const override;
433 
434   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
435                             SIAtomicScope Scope,
436                             SIAtomicAddrSpace AddrSpace) const override;
437 
438   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
439                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
440                                       bool IsVolatile,
441                                       bool IsNonTemporal) const override;
442 
443   bool insertWait(MachineBasicBlock::iterator &MI,
444                   SIAtomicScope Scope,
445                   SIAtomicAddrSpace AddrSpace,
446                   SIMemOp Op,
447                   bool IsCrossAddrSpaceOrdering,
448                   Position Pos) const override;
449 
450   bool insertAcquire(MachineBasicBlock::iterator &MI,
451                      SIAtomicScope Scope,
452                      SIAtomicAddrSpace AddrSpace,
453                      Position Pos) const override;
454 
455   bool insertRelease(MachineBasicBlock::iterator &MI,
456                      SIAtomicScope Scope,
457                      SIAtomicAddrSpace AddrSpace,
458                      bool IsCrossAddrSpaceOrdering,
459                      Position Pos) const override;
460 };
461 
462 class SIGfx10CacheControl : public SIGfx7CacheControl {
463 protected:
464 
465   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
466   /// is modified, false otherwise.
467   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
468     return enableNamedBit(MI, AMDGPU::CPol::DLC);
469   }
470 
471 public:
472 
473   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
474 
475   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
476                              SIAtomicScope Scope,
477                              SIAtomicAddrSpace AddrSpace) const override;
478 
479   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
480                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
481                                       bool IsVolatile,
482                                       bool IsNonTemporal) const override;
483 
484   bool insertWait(MachineBasicBlock::iterator &MI,
485                   SIAtomicScope Scope,
486                   SIAtomicAddrSpace AddrSpace,
487                   SIMemOp Op,
488                   bool IsCrossAddrSpaceOrdering,
489                   Position Pos) const override;
490 
491   bool insertAcquire(MachineBasicBlock::iterator &MI,
492                      SIAtomicScope Scope,
493                      SIAtomicAddrSpace AddrSpace,
494                      Position Pos) const override;
495 };
496 
497 class SIMemoryLegalizer final : public MachineFunctionPass {
498 private:
499 
500   /// Cache Control.
501   std::unique_ptr<SICacheControl> CC = nullptr;
502 
503   /// List of atomic pseudo instructions.
504   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
505 
506   /// Return true iff instruction \p MI is an atomic instruction that
507   /// returns a result.
508   bool isAtomicRet(const MachineInstr &MI) const {
509     return SIInstrInfo::isAtomicRet(MI);
510   }
511 
512   /// Removes all processed atomic pseudo instructions from the current
513   /// function. Returns true if current function is modified, false otherwise.
514   bool removeAtomicPseudoMIs();
515 
516   /// Expands load operation \p MI. Returns true if instructions are
517   /// added/deleted or \p MI is modified, false otherwise.
518   bool expandLoad(const SIMemOpInfo &MOI,
519                   MachineBasicBlock::iterator &MI);
520   /// Expands store operation \p MI. Returns true if instructions are
521   /// added/deleted or \p MI is modified, false otherwise.
522   bool expandStore(const SIMemOpInfo &MOI,
523                    MachineBasicBlock::iterator &MI);
524   /// Expands atomic fence operation \p MI. Returns true if
525   /// instructions are added/deleted or \p MI is modified, false otherwise.
526   bool expandAtomicFence(const SIMemOpInfo &MOI,
527                          MachineBasicBlock::iterator &MI);
528   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
529   /// instructions are added/deleted or \p MI is modified, false otherwise.
530   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
531                                 MachineBasicBlock::iterator &MI);
532 
533 public:
534   static char ID;
535 
536   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
537 
538   void getAnalysisUsage(AnalysisUsage &AU) const override {
539     AU.setPreservesCFG();
540     MachineFunctionPass::getAnalysisUsage(AU);
541   }
542 
543   StringRef getPassName() const override {
544     return PASS_NAME;
545   }
546 
547   bool runOnMachineFunction(MachineFunction &MF) override;
548 };
549 
550 } // end anonymous namespace
551 
552 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
553                                       const char *Msg) const {
554   const Function &Func = MI->getParent()->getParent()->getFunction();
555   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
556   Func.getContext().diagnose(Diag);
557 }
558 
559 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
560 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
561                                SIAtomicAddrSpace InstrAddrSpace) const {
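  // The standard scopes order all atomic address spaces and require cross
  // address space ordering; the "one address space" variants below only order
  // the address spaces accessed by the instruction and do not.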
562   if (SSID == SyncScope::System)
563     return std::make_tuple(SIAtomicScope::SYSTEM,
564                            SIAtomicAddrSpace::ATOMIC,
565                            true);
566   if (SSID == MMI->getAgentSSID())
567     return std::make_tuple(SIAtomicScope::AGENT,
568                            SIAtomicAddrSpace::ATOMIC,
569                            true);
570   if (SSID == MMI->getWorkgroupSSID())
571     return std::make_tuple(SIAtomicScope::WORKGROUP,
572                            SIAtomicAddrSpace::ATOMIC,
573                            true);
574   if (SSID == MMI->getWavefrontSSID())
575     return std::make_tuple(SIAtomicScope::WAVEFRONT,
576                            SIAtomicAddrSpace::ATOMIC,
577                            true);
578   if (SSID == SyncScope::SingleThread)
579     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
580                            SIAtomicAddrSpace::ATOMIC,
581                            true);
582   if (SSID == MMI->getSystemOneAddressSpaceSSID())
583     return std::make_tuple(SIAtomicScope::SYSTEM,
584                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
585                            false);
586   if (SSID == MMI->getAgentOneAddressSpaceSSID())
587     return std::make_tuple(SIAtomicScope::AGENT,
588                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
589                            false);
590   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
591     return std::make_tuple(SIAtomicScope::WORKGROUP,
592                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
593                            false);
594   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
595     return std::make_tuple(SIAtomicScope::WAVEFRONT,
596                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
597                            false);
598   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
599     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
600                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
601                            false);
602   return None;
603 }
604 
605 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
606   if (AS == AMDGPUAS::FLAT_ADDRESS)
607     return SIAtomicAddrSpace::FLAT;
608   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
609     return SIAtomicAddrSpace::GLOBAL;
610   if (AS == AMDGPUAS::LOCAL_ADDRESS)
611     return SIAtomicAddrSpace::LDS;
612   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
613     return SIAtomicAddrSpace::SCRATCH;
614   if (AS == AMDGPUAS::REGION_ADDRESS)
615     return SIAtomicAddrSpace::GDS;
616 
617   return SIAtomicAddrSpace::OTHER;
618 }
619 
620 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
621   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
622 }
623 
624 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
625     const MachineBasicBlock::iterator &MI) const {
626   assert(MI->getNumMemOperands() > 0);
627 
628   SyncScope::ID SSID = SyncScope::SingleThread;
629   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
630   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
631   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
632   bool IsNonTemporal = true;
633   bool IsVolatile = false;
634 
635   // Validator should check whether or not MMOs cover the entire set of
636   // locations accessed by the memory instruction.
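  // Merge information across all memory operands conservatively: the access is
  // volatile if any operand is volatile, nontemporal only if every operand is,
  // and atomic orderings and scopes are combined to the strictest values.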
637   for (const auto &MMO : MI->memoperands()) {
638     IsNonTemporal &= MMO->isNonTemporal();
639     IsVolatile |= MMO->isVolatile();
640     InstrAddrSpace |=
641       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
642     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
643     if (OpOrdering != AtomicOrdering::NotAtomic) {
644       const auto &IsSyncScopeInclusion =
645           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
646       if (!IsSyncScopeInclusion) {
647         reportUnsupported(MI,
648           "Unsupported non-inclusive atomic synchronization scope");
649         return None;
650       }
651 
652       SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
653       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
654       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
655              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
656       FailureOrdering =
657           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
658     }
659   }
660 
661   SIAtomicScope Scope = SIAtomicScope::NONE;
662   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
663   bool IsCrossAddressSpaceOrdering = false;
664   if (Ordering != AtomicOrdering::NotAtomic) {
665     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
666     if (!ScopeOrNone) {
667       reportUnsupported(MI, "Unsupported atomic synchronization scope");
668       return None;
669     }
670     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
671       ScopeOrNone.getValue();
672     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
673         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
674         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
675       reportUnsupported(MI, "Unsupported atomic address space");
676       return None;
677     }
678   }
679   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
680                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
681                      IsNonTemporal);
682 }
683 
684 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
685     const MachineBasicBlock::iterator &MI) const {
686   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
687 
688   if (!(MI->mayLoad() && !MI->mayStore()))
689     return None;
690 
691   // Be conservative if there are no memory operands.
692   if (MI->getNumMemOperands() == 0)
693     return SIMemOpInfo();
694 
695   return constructFromMIWithMMO(MI);
696 }
697 
698 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
699     const MachineBasicBlock::iterator &MI) const {
700   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
701 
702   if (!(!MI->mayLoad() && MI->mayStore()))
703     return None;
704 
705   // Be conservative if there are no memory operands.
706   if (MI->getNumMemOperands() == 0)
707     return SIMemOpInfo();
708 
709   return constructFromMIWithMMO(MI);
710 }
711 
712 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
713     const MachineBasicBlock::iterator &MI) const {
714   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
715 
716   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
717     return None;
718 
719   AtomicOrdering Ordering =
720     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
721 
722   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
723   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
724   if (!ScopeOrNone) {
725     reportUnsupported(MI, "Unsupported atomic synchronization scope");
726     return None;
727   }
728 
729   SIAtomicScope Scope = SIAtomicScope::NONE;
730   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
731   bool IsCrossAddressSpaceOrdering = false;
732   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
733     ScopeOrNone.getValue();
734 
735   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
736       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
737     reportUnsupported(MI, "Unsupported atomic address space");
738     return None;
739   }
740 
741   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
742                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
743 }
744 
745 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
746     const MachineBasicBlock::iterator &MI) const {
747   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
748 
749   if (!(MI->mayLoad() && MI->mayStore()))
750     return None;
751 
752   // Be conservative if there are no memory operands.
753   if (MI->getNumMemOperands() == 0)
754     return SIMemOpInfo();
755 
756   return constructFromMIWithMMO(MI);
757 }
758 
759 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
760   TII = ST.getInstrInfo();
761   IV = getIsaVersion(ST.getCPU());
762   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
763 }
764 
765 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
766                                     AMDGPU::CPol::CPol Bit) const {
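  // The cache-policy (cpol) operand, when present, carries bits such as GLC,
  // SLC and DLC; OR in the requested bit. Instructions without a cpol operand
  // are left unmodified.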
767   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
768   if (!CPol)
769     return false;
770 
771   CPol->setImm(CPol->getImm() | Bit);
772   return true;
773 }
774 
775 /* static */
776 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
777   GCNSubtarget::Generation Generation = ST.getGeneration();
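  // gfx90a needs its own cache control even though it is a GFX9 generation
  // target, so it is checked before the generation-based dispatch below.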
778   if (ST.hasGFX90AInsts())
779     return std::make_unique<SIGfx90ACacheControl>(ST);
780   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
781     return std::make_unique<SIGfx6CacheControl>(ST);
782   if (Generation < AMDGPUSubtarget::GFX10)
783     return std::make_unique<SIGfx7CacheControl>(ST);
784   return std::make_unique<SIGfx10CacheControl>(ST);
785 }
786 
787 bool SIGfx6CacheControl::enableLoadCacheBypass(
788     const MachineBasicBlock::iterator &MI,
789     SIAtomicScope Scope,
790     SIAtomicAddrSpace AddrSpace) const {
791   assert(MI->mayLoad() && !MI->mayStore());
792   bool Changed = false;
793 
794   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
795     switch (Scope) {
796     case SIAtomicScope::SYSTEM:
797     case SIAtomicScope::AGENT:
798       // Set L1 cache policy to MISS_EVICT.
799       // Note: there is no L2 cache bypass policy at the ISA level.
800       Changed |= enableGLCBit(MI);
801       break;
802     case SIAtomicScope::WORKGROUP:
803     case SIAtomicScope::WAVEFRONT:
804     case SIAtomicScope::SINGLETHREAD:
805       // No cache to bypass.
806       break;
807     default:
808       llvm_unreachable("Unsupported synchronization scope");
809     }
810   }
811 
812   /// The scratch address space does not need the global memory caches
813   /// to be bypassed as all memory operations by the same thread are
814   /// sequentially consistent, and no other thread can access scratch
815   /// memory.
816 
817   /// Other address spaces do not have a cache.
818 
819   return Changed;
820 }
821 
822 bool SIGfx6CacheControl::enableStoreCacheBypass(
823     const MachineBasicBlock::iterator &MI,
824     SIAtomicScope Scope,
825     SIAtomicAddrSpace AddrSpace) const {
826   assert(!MI->mayLoad() && MI->mayStore());
827   bool Changed = false;
828 
829   /// The L1 cache is write-through, so it does not need to be bypassed. There is
830   /// no bypass control for the L2 cache at the ISA level.
831 
832   return Changed;
833 }
834 
835 bool SIGfx6CacheControl::enableRMWCacheBypass(
836     const MachineBasicBlock::iterator &MI,
837     SIAtomicScope Scope,
838     SIAtomicAddrSpace AddrSpace) const {
839   assert(MI->mayLoad() && MI->mayStore());
840   bool Changed = false;
841 
842   /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
843   /// bypassed, and the GLC bit is instead used to indicate if they are
844   /// return or no-return.
845   /// Note: there is no L2 cache coherent bypass control at the ISA level.
846 
847   return Changed;
848 }
849 
850 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
851     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
852     bool IsVolatile, bool IsNonTemporal) const {
853   // Only handle load and store, not atomic read-modify-write instructions. The
854   // latter use glc to indicate if the atomic returns a result and so must not
855   // be used for cache control.
856   assert(MI->mayLoad() ^ MI->mayStore());
857 
858   // Only update load and store, not LLVM IR atomic read-modify-write
859   // instructions. The latter are always marked as volatile, so they cannot be
860   // handled sensibly here without pessimizing all atomics. They also do not
861   // support the nontemporal attribute.
862   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
863 
864   bool Changed = false;
865 
866   if (IsVolatile) {
867     // Set L1 cache policy to be MISS_EVICT for load instructions
868     // and MISS_LRU for store instructions.
869     // Note: there is no L2 cache bypass policy at the ISA level.
870     if (Op == SIMemOp::LOAD)
871       Changed |= enableGLCBit(MI);
872 
873     // Ensure operation has completed at system scope to cause all volatile
874     // operations to be visible outside the program in a global order. Do not
875     // request cross address space as only the global address space can be
876     // observable outside the program, so no need to cause a waitcnt for LDS
877     // address space operations.
878     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
879                           Position::AFTER);
880 
881     return Changed;
882   }
883 
884   if (IsNonTemporal) {
885     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
886     // for both loads and stores, and the L2 cache policy to STREAM.
887     Changed |= enableGLCBit(MI);
888     Changed |= enableSLCBit(MI);
889     return Changed;
890   }
891 
892   return Changed;
893 }
894 
895 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
896                                     SIAtomicScope Scope,
897                                     SIAtomicAddrSpace AddrSpace,
898                                     SIMemOp Op,
899                                     bool IsCrossAddrSpaceOrdering,
900                                     Position Pos) const {
901   bool Changed = false;
902 
903   MachineBasicBlock &MBB = *MI->getParent();
904   DebugLoc DL = MI->getDebugLoc();
905 
906   if (Pos == Position::AFTER)
907     ++MI;
908 
909   bool VMCnt = false;
910   bool LGKMCnt = false;
911 
912   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
913       SIAtomicAddrSpace::NONE) {
914     switch (Scope) {
915     case SIAtomicScope::SYSTEM:
916     case SIAtomicScope::AGENT:
917       VMCnt |= true;
918       break;
919     case SIAtomicScope::WORKGROUP:
920     case SIAtomicScope::WAVEFRONT:
921     case SIAtomicScope::SINGLETHREAD:
922       // The L1 cache keeps all memory operations in order for
923       // wavefronts in the same work-group.
924       break;
925     default:
926       llvm_unreachable("Unsupported synchronization scope");
927     }
928   }
929 
930   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
931     switch (Scope) {
932     case SIAtomicScope::SYSTEM:
933     case SIAtomicScope::AGENT:
934     case SIAtomicScope::WORKGROUP:
935       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
936       // not needed as LDS operations for all waves are executed in a total
937       // global ordering as observed by all waves. Required if also
938       // synchronizing with global/GDS memory as LDS operations could be
939       // reordered with respect to later global/GDS memory operations of the
940       // same wave.
941       LGKMCnt |= IsCrossAddrSpaceOrdering;
942       break;
943     case SIAtomicScope::WAVEFRONT:
944     case SIAtomicScope::SINGLETHREAD:
945       // The LDS keeps all memory operations in order for
946       // the same wavefront.
947       break;
948     default:
949       llvm_unreachable("Unsupported synchronization scope");
950     }
951   }
952 
953   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
954     switch (Scope) {
955     case SIAtomicScope::SYSTEM:
956     case SIAtomicScope::AGENT:
957       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
958       // is not needed as GDS operations for all waves are executed in a total
959       // global ordering as observed by all waves. Required if also
960       // synchronizing with global/LDS memory as GDS operations could be
961       // reordered with respect to later global/LDS memory operations of the
962       // same wave.
963       LGKMCnt |= IsCrossAddrSpaceOrdering;
964       break;
965     case SIAtomicScope::WORKGROUP:
966     case SIAtomicScope::WAVEFRONT:
967     case SIAtomicScope::SINGLETHREAD:
968       // The GDS keeps all memory operations in order for
969       // the same work-group.
970       break;
971     default:
972       llvm_unreachable("Unsupported synchronization scope");
973     }
974   }
975 
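  // Build the combined S_WAITCNT immediate: a count of 0 forces a wait on that
  // counter, while leaving a counter at its maximum mask value means it is not
  // waited on.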
976   if (VMCnt || LGKMCnt) {
977     unsigned WaitCntImmediate =
978       AMDGPU::encodeWaitcnt(IV,
979                             VMCnt ? 0 : getVmcntBitMask(IV),
980                             getExpcntBitMask(IV),
981                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
982     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
983     Changed = true;
984   }
985 
986   if (Pos == Position::AFTER)
987     --MI;
988 
989   return Changed;
990 }
991 
992 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
993                                        SIAtomicScope Scope,
994                                        SIAtomicAddrSpace AddrSpace,
995                                        Position Pos) const {
996   if (!InsertCacheInv)
997     return false;
998 
999   bool Changed = false;
1000 
1001   MachineBasicBlock &MBB = *MI->getParent();
1002   DebugLoc DL = MI->getDebugLoc();
1003 
1004   if (Pos == Position::AFTER)
1005     ++MI;
1006 
1007   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1008     switch (Scope) {
1009     case SIAtomicScope::SYSTEM:
1010     case SIAtomicScope::AGENT:
1011       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1012       Changed = true;
1013       break;
1014     case SIAtomicScope::WORKGROUP:
1015     case SIAtomicScope::WAVEFRONT:
1016     case SIAtomicScope::SINGLETHREAD:
1017       // No cache to invalidate.
1018       break;
1019     default:
1020       llvm_unreachable("Unsupported synchronization scope");
1021     }
1022   }
1023 
1024   /// The scratch address space does not need the global memory cache
1025   /// to be flushed as all memory operations by the same thread are
1026   /// sequentially consistent, and no other thread can access scratch
1027   /// memory.
1028 
1029   /// Other address spaces do not have a cache.
1030 
1031   if (Pos == Position::AFTER)
1032     --MI;
1033 
1034   return Changed;
1035 }
1036 
1037 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1038                                        SIAtomicScope Scope,
1039                                        SIAtomicAddrSpace AddrSpace,
1040                                        bool IsCrossAddrSpaceOrdering,
1041                                        Position Pos) const {
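  // On these targets a release only needs to wait for earlier memory operations
  // to complete; no cache writeback instruction is inserted (contrast with
  // SIGfx90ACacheControl::insertRelease, which also emits BUFFER_WBL2).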
1042   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1043                     IsCrossAddrSpaceOrdering, Pos);
1044 }
1045 
1046 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1047                                        SIAtomicScope Scope,
1048                                        SIAtomicAddrSpace AddrSpace,
1049                                        Position Pos) const {
1050   if (!InsertCacheInv)
1051     return false;
1052 
1053   bool Changed = false;
1054 
1055   MachineBasicBlock &MBB = *MI->getParent();
1056   DebugLoc DL = MI->getDebugLoc();
1057 
1058   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1059 
1060   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1061                                     ? AMDGPU::BUFFER_WBINVL1
1062                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1063 
1064   if (Pos == Position::AFTER)
1065     ++MI;
1066 
1067   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1068     switch (Scope) {
1069     case SIAtomicScope::SYSTEM:
1070     case SIAtomicScope::AGENT:
1071       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1072       Changed = true;
1073       break;
1074     case SIAtomicScope::WORKGROUP:
1075     case SIAtomicScope::WAVEFRONT:
1076     case SIAtomicScope::SINGLETHREAD:
1077       // No cache to invalidate.
1078       break;
1079     default:
1080       llvm_unreachable("Unsupported synchronization scope");
1081     }
1082   }
1083 
1084   /// The scratch address space does not need the global memory cache
1085   /// to be flushed as all memory operations by the same thread are
1086   /// sequentially consistent, and no other thread can access scratch
1087   /// memory.
1088 
1089   /// Other address spaces do not have a cache.
1090 
1091   if (Pos == Position::AFTER)
1092     --MI;
1093 
1094   return Changed;
1095 }
1096 
1097 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1098     const MachineBasicBlock::iterator &MI,
1099     SIAtomicScope Scope,
1100     SIAtomicAddrSpace AddrSpace) const {
1101   assert(MI->mayLoad() && !MI->mayStore());
1102   bool Changed = false;
1103 
1104   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1105     switch (Scope) {
1106     case SIAtomicScope::SYSTEM:
1107     case SIAtomicScope::AGENT:
1108       // Set the L1 cache policy to MISS_LRU.
1109       // Note: there is no L2 cache bypass policy at the ISA level.
1110       Changed |= enableGLCBit(MI);
1111       break;
1112     case SIAtomicScope::WORKGROUP:
1113       // In threadgroup split mode the waves of a work-group can be executing on
1114       // different CUs. Therefore need to bypass the L1 which is per CU.
1115       // Otherwise in non-threadgroup split mode all waves of a work-group are
1116       // on the same CU, and so the L1 does not need to be bypassed.
1117       if (ST.isTgSplitEnabled())
1118         Changed |= enableGLCBit(MI);
1119       break;
1120     case SIAtomicScope::WAVEFRONT:
1121     case SIAtomicScope::SINGLETHREAD:
1122       // No cache to bypass.
1123       break;
1124     default:
1125       llvm_unreachable("Unsupported synchronization scope");
1126     }
1127   }
1128 
1129   /// The scratch address space does not need the global memory caches
1130   /// to be bypassed as all memory operations by the same thread are
1131   /// sequentially consistent, and no other thread can access scratch
1132   /// memory.
1133 
1134   /// Other address spaces do not have a cache.
1135 
1136   return Changed;
1137 }
1138 
1139 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1140     const MachineBasicBlock::iterator &MI,
1141     SIAtomicScope Scope,
1142     SIAtomicAddrSpace AddrSpace) const {
1143   assert(!MI->mayLoad() && MI->mayStore());
1144   bool Changed = false;
1145 
1146   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1147     switch (Scope) {
1148     case SIAtomicScope::SYSTEM:
1149     case SIAtomicScope::AGENT:
1150       /// Do not set glc for store atomic operations as they implicitly write
1151       /// through the L1 cache.
1152       break;
1153     case SIAtomicScope::WORKGROUP:
1154     case SIAtomicScope::WAVEFRONT:
1155     case SIAtomicScope::SINGLETHREAD:
1156       // No cache to bypass. Store atomics implicitly write through the L1
1157       // cache.
1158       break;
1159     default:
1160       llvm_unreachable("Unsupported synchronization scope");
1161     }
1162   }
1163 
1164   /// The scratch address space does not need the global memory caches
1165   /// to be bypassed as all memory operations by the same thread are
1166   /// sequentially consistent, and no other thread can access scratch
1167   /// memory.
1168 
1169   /// Other address spaces do not have a cache.
1170 
1171   return Changed;
1172 }
1173 
1174 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1175     const MachineBasicBlock::iterator &MI,
1176     SIAtomicScope Scope,
1177     SIAtomicAddrSpace AddrSpace) const {
1178   assert(MI->mayLoad() && MI->mayStore());
1179   bool Changed = false;
1180 
1181   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1182     switch (Scope) {
1183     case SIAtomicScope::SYSTEM:
1184     case SIAtomicScope::AGENT:
1185       /// Do not set glc for RMW atomic operations as they implicitly bypass
1186       /// the L1 cache, and the glc bit is instead used to indicate if they are
1187       /// return or no-return.
1188       break;
1189     case SIAtomicScope::WORKGROUP:
1190     case SIAtomicScope::WAVEFRONT:
1191     case SIAtomicScope::SINGLETHREAD:
1192       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1193       break;
1194     default:
1195       llvm_unreachable("Unsupported synchronization scope");
1196     }
1197   }
1198 
1199   return Changed;
1200 }
1201 
1202 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1203     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1204     bool IsVolatile, bool IsNonTemporal) const {
1205   // Only handle load and store, not atomic read-modify-write instructions. The
1206   // latter use glc to indicate if the atomic returns a result and so must not
1207   // be used for cache control.
1208   assert(MI->mayLoad() ^ MI->mayStore());
1209 
1210   // Only update load and store, not LLVM IR atomic read-modify-write
1211   // instructions. The latter are always marked as volatile, so they cannot be
1212   // handled sensibly here without pessimizing all atomics. They also do not
1213   // support the nontemporal attribute.
1214   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1215 
1216   bool Changed = false;
1217 
1218   if (IsVolatile) {
1219     // Set L1 cache policy to be MISS_EVICT for load instructions
1220     // and MISS_LRU for store instructions.
1221     // Note: there is no L2 cache bypass policy at the ISA level.
1222     if (Op == SIMemOp::LOAD)
1223       Changed |= enableGLCBit(MI);
1224 
1225     // Ensure operation has completed at system scope to cause all volatile
1226     // operations to be visible outside the program in a global order. Do not
1227     // request cross address space as only the global address space can be
1228     // observable outside the program, so no need to cause a waitcnt for LDS
1229     // address space operations.
1230     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1231                           Position::AFTER);
1232 
1233     return Changed;
1234   }
1235 
1236   if (IsNonTemporal) {
1237     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1238     // for both loads and stores, and the L2 cache policy to STREAM.
1239     Changed |= enableGLCBit(MI);
1240     Changed |= enableSLCBit(MI);
1241     return Changed;
1242   }
1243 
1244   return Changed;
1245 }
1246 
1247 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1248                                       SIAtomicScope Scope,
1249                                       SIAtomicAddrSpace AddrSpace,
1250                                       SIMemOp Op,
1251                                       bool IsCrossAddrSpaceOrdering,
1252                                       Position Pos) const {
1253   if (ST.isTgSplitEnabled()) {
1254     // In threadgroup split mode the waves of a work-group can be executing on
1255     // different CUs. Therefore need to wait for global or GDS memory operations
1256     // to complete to ensure they are visible to waves in the other CUs.
1257     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1258     // the same CU, so no need to wait for global memory as all waves in the
1259     // work-group access the same L1, nor wait for GDS as accesses are ordered
1260     // on a CU.
1261     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1262                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1263         (Scope == SIAtomicScope::WORKGROUP)) {
1264       // Same as GFX7 using agent scope.
1265       Scope = SIAtomicScope::AGENT;
1266     }
1267     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1268     // LDS memory operations.
1269     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1270   }
1271   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1272                                         IsCrossAddrSpaceOrdering, Pos);
1273 }
1274 
1275 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1276                                          SIAtomicScope Scope,
1277                                          SIAtomicAddrSpace AddrSpace,
1278                                          Position Pos) const {
1279   if (!InsertCacheInv)
1280     return false;
1281 
1282   bool Changed = false;
1283 
1284   MachineBasicBlock &MBB = *MI->getParent();
1285   DebugLoc DL = MI->getDebugLoc();
1286 
1287   if (Pos == Position::AFTER)
1288     ++MI;
1289 
1290   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1291     switch (Scope) {
1292     case SIAtomicScope::SYSTEM:
1293       // Ensures that following loads will not see stale remote VMEM data or
1294       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1295       // CC will never be stale due to the local memory probes.
1296       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1297       // Inserting an "S_WAITCNT vmcnt(0)" after is not required because the
1298       // hardware does not reorder memory operations by the same wave with
1299       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1300       // remove any cache lines of earlier writes by the same wave and ensures
1301       // later reads by the same wave will refetch the cache lines.
1302       Changed = true;
1303       break;
1304     case SIAtomicScope::AGENT:
1305       // Same as GFX7.
1306       break;
1307     case SIAtomicScope::WORKGROUP:
1308       // In threadgroup split mode the waves of a work-group can be executing on
1309       // different CUs. Therefore need to invalidate the L1 which is per CU.
1310       // Otherwise in non-threadgroup split mode all waves of a work-group are
1311       // on the same CU, and so the L1 does not need to be invalidated.
1312       if (ST.isTgSplitEnabled()) {
1313         // Same as GFX7 using agent scope.
1314         Scope = SIAtomicScope::AGENT;
1315       }
1316       break;
1317     case SIAtomicScope::WAVEFRONT:
1318     case SIAtomicScope::SINGLETHREAD:
1319       // Same as GFX7.
1320       break;
1321     default:
1322       llvm_unreachable("Unsupported synchronization scope");
1323     }
1324   }
1325 
1326   /// The scratch address space does not need the global memory cache
1327   /// to be flushed as all memory operations by the same thread are
1328   /// sequentially consistent, and no other thread can access scratch
1329   /// memory.
1330 
1331   /// Other address spaces do not have a cache.
1332 
1333   if (Pos == Position::AFTER)
1334     --MI;
1335 
1336   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1337 
1338   return Changed;
1339 }
1340 
1341 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1342                                          SIAtomicScope Scope,
1343                                          SIAtomicAddrSpace AddrSpace,
1344                                          bool IsCrossAddrSpaceOrdering,
1345                                          Position Pos) const {
1346   bool Changed = false;
1347 
1348   MachineBasicBlock &MBB = *MI->getParent();
1349   DebugLoc DL = MI->getDebugLoc();
1350 
1351   if (Pos == Position::AFTER)
1352     ++MI;
1353 
1354   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1355     switch (Scope) {
1356     case SIAtomicScope::SYSTEM:
1357       // Inserting an "S_WAITCNT vmcnt(0)" before is not required because the
1358       // hardware does not reorder memory operations by the same wave with
1359       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1360       // to initiate writeback of any dirty cache lines of earlier writes by the
1361       // same wave. An "S_WAITCNT vmcnt(0)" is needed after to ensure the
1362       // writeback has completed.
1363       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
1364       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1365       // vmcnt(0)" needed by the "BUFFER_WBL2".
1366       Changed = true;
1367       break;
1368     case SIAtomicScope::AGENT:
1369     case SIAtomicScope::WORKGROUP:
1370     case SIAtomicScope::WAVEFRONT:
1371     case SIAtomicScope::SINGLETHREAD:
1372       // Same as GFX7.
1373       break;
1374     default:
1375       llvm_unreachable("Unsupported synchronization scope");
1376     }
1377   }
1378 
1379   if (Pos == Position::AFTER)
1380     --MI;
1381 
1382   Changed |=
1383       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1384                                         IsCrossAddrSpaceOrdering, Pos);
1385 
1386   return Changed;
1387 }
1388 
1389 bool SIGfx10CacheControl::enableLoadCacheBypass(
1390     const MachineBasicBlock::iterator &MI,
1391     SIAtomicScope Scope,
1392     SIAtomicAddrSpace AddrSpace) const {
1393   assert(MI->mayLoad() && !MI->mayStore());
1394   bool Changed = false;
1395 
1396   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1397     switch (Scope) {
1398     case SIAtomicScope::SYSTEM:
1399     case SIAtomicScope::AGENT:
1400       // Set the L0 and L1 cache policies to MISS_EVICT.
1401       // Note: there is no L2 cache coherent bypass control at the ISA level.
1402       Changed |= enableGLCBit(MI);
1403       Changed |= enableDLCBit(MI);
1404       break;
1405     case SIAtomicScope::WORKGROUP:
1406       // In WGP mode the waves of a work-group can be executing on either CU of
1407       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1408       // CU mode all waves of a work-group are on the same CU, and so the L0
1409       // does not need to be bypassed.
1410       if (!ST.isCuModeEnabled())
1411         Changed |= enableGLCBit(MI);
1412       break;
1413     case SIAtomicScope::WAVEFRONT:
1414     case SIAtomicScope::SINGLETHREAD:
1415       // No cache to bypass.
1416       break;
1417     default:
1418       llvm_unreachable("Unsupported synchronization scope");
1419     }
1420   }
1421 
1422   /// The scratch address space does not need the global memory caches
1423   /// to be bypassed as all memory operations by the same thread are
1424   /// sequentially consistent, and no other thread can access scratch
1425   /// memory.
1426 
1427   /// Other address spaces do not have a cache.
1428 
1429   return Changed;
1430 }
1431 
1432 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1433     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1434     bool IsVolatile, bool IsNonTemporal) const {
1435 
1436   // Only handle load and store, not atomic read-modify-write instructions.
1437   // The latter use glc to indicate whether the atomic returns a result, so
1438   // glc must not be used for cache control here.
1439   assert(MI->mayLoad() ^ MI->mayStore());
1440 
1441   // Only update load and store, not LLVM IR atomic read-modify-write
1442   // instructions. The latter are always marked as volatile, so they cannot be
1443   // handled here without pessimizing all atomics. They also do not support
1444   // the nontemporal attribute.
1445   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1446 
1447   bool Changed = false;
1448 
1449   if (IsVolatile) {
1450     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1451     // and MISS_LRU for store instructions.
1452     // Note: there is no L2 cache coherent bypass control at the ISA level.
1453     if (Op == SIMemOp::LOAD) {
1454       Changed |= enableGLCBit(MI);
1455       Changed |= enableDLCBit(MI);
1456     }
1457 
1458     // Ensure operation has completed at system scope to cause all volatile
1459     // operations to be visible outside the program in a global order. Do not
1460     // request cross address space as only the global address space can be
1461     // observable outside the program, so no need to cause a waitcnt for LDS
1462     // address space operations.
1463     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1464                           Position::AFTER);
1465     return Changed;
1466   }
1467 
1468   if (IsNonTemporal) {
1469     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1470     // and L2 cache policy to STREAM.
1471     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1472     // to MISS_EVICT and the L2 cache policy to STREAM.
1473     if (Op == SIMemOp::STORE)
1474       Changed |= enableGLCBit(MI);
1475     Changed |= enableSLCBit(MI);
1476 
1477     return Changed;
1478   }
1479 
1480   return Changed;
1481 }
1482 
1483 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1484                                      SIAtomicScope Scope,
1485                                      SIAtomicAddrSpace AddrSpace,
1486                                      SIMemOp Op,
1487                                      bool IsCrossAddrSpaceOrdering,
1488                                      Position Pos) const {
1489   bool Changed = false;
1490 
1491   MachineBasicBlock &MBB = *MI->getParent();
1492   DebugLoc DL = MI->getDebugLoc();
1493 
1494   if (Pos == Position::AFTER)
1495     ++MI;
1496 
1497   bool VMCnt = false;
1498   bool VSCnt = false;
1499   bool LGKMCnt = false;
1500 
1501   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1502       SIAtomicAddrSpace::NONE) {
1503     switch (Scope) {
1504     case SIAtomicScope::SYSTEM:
1505     case SIAtomicScope::AGENT:
1506       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1507         VMCnt |= true;
1508       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1509         VSCnt |= true;
1510       break;
1511     case SIAtomicScope::WORKGROUP:
1512       // In WGP mode the waves of a work-group can be executing on either CU of
1513       // the WGP. Therefore need to wait for operations to complete to ensure
1514       // they are visible to waves in the other CU as the L0 is per CU.
1515       // Otherwise in CU mode all waves of a work-group are on the same CU,
1516       // which shares the same L0.
1517       if (!ST.isCuModeEnabled()) {
1518         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1519           VMCnt |= true;
1520         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1521           VSCnt |= true;
1522       }
1523       break;
1524     case SIAtomicScope::WAVEFRONT:
1525     case SIAtomicScope::SINGLETHREAD:
1526       // The L0 cache keeps all memory operations in order for
1527       // work-items in the same wavefront.
1528       break;
1529     default:
1530       llvm_unreachable("Unsupported synchronization scope");
1531     }
1532   }
1533 
1534   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1535     switch (Scope) {
1536     case SIAtomicScope::SYSTEM:
1537     case SIAtomicScope::AGENT:
1538     case SIAtomicScope::WORKGROUP:
1539       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1540       // not needed as LDS operations for all waves are executed in a total
1541       // global ordering as observed by all waves. Required if also
1542       // synchronizing with global/GDS memory as LDS operations could be
1543       // reordered with respect to later global/GDS memory operations of the
1544       // same wave.
1545       LGKMCnt |= IsCrossAddrSpaceOrdering;
1546       break;
1547     case SIAtomicScope::WAVEFRONT:
1548     case SIAtomicScope::SINGLETHREAD:
1549       // The LDS keeps all memory operations in order for
1550       // the same wavefront.
1551       break;
1552     default:
1553       llvm_unreachable("Unsupported synchronization scope");
1554     }
1555   }
1556 
1557   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1558     switch (Scope) {
1559     case SIAtomicScope::SYSTEM:
1560     case SIAtomicScope::AGENT:
1561       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1562       // is not needed as GDS operations for all waves are executed in a total
1563       // global ordering as observed by all waves. Required if also
1564       // synchronizing with global/LDS memory as GDS operations could be
1565       // reordered with respect to later global/LDS memory operations of the
1566       // same wave.
1567       LGKMCnt |= IsCrossAddrSpaceOrdering;
1568       break;
1569     case SIAtomicScope::WORKGROUP:
1570     case SIAtomicScope::WAVEFRONT:
1571     case SIAtomicScope::SINGLETHREAD:
1572       // The GDS keeps all memory operations in order for
1573       // the same work-group.
1574       break;
1575     default:
1576       llvm_unreachable("Unsupported synchronization scope");
1577     }
1578   }
1579 
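  // encodeWaitcnt packs the individual counters into the single S_WAITCNT
  // immediate: a counter that must be drained is given 0, while a counter
  // that does not need waiting is given its full bitmask (e.g.
  // getVmcntBitMask(IV)), which means "do not wait on this counter".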
1580   if (VMCnt || LGKMCnt) {
1581     unsigned WaitCntImmediate =
1582       AMDGPU::encodeWaitcnt(IV,
1583                             VMCnt ? 0 : getVmcntBitMask(IV),
1584                             getExpcntBitMask(IV),
1585                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1586     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1587     Changed = true;
1588   }
1589 
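  // gfx10 splits the legacy vector-memory counter: vmcnt covers loads while
  // vscnt covers stores, so store completion needs a separate
  // S_WAITCNT_VSCNT. The SGPR_NULL operand plus the immediate 0 below waits
  // until the outstanding-store count drains to zero.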
1590   if (VSCnt) {
1591     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1592       .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1593       .addImm(0);
1594     Changed = true;
1595   }
1596 
1597   if (Pos == Position::AFTER)
1598     --MI;
1599 
1600   return Changed;
1601 }
1602 
1603 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1604                                         SIAtomicScope Scope,
1605                                         SIAtomicAddrSpace AddrSpace,
1606                                         Position Pos) const {
1607   if (!InsertCacheInv)
1608     return false;
1609 
1610   bool Changed = false;
1611 
1612   MachineBasicBlock &MBB = *MI->getParent();
1613   DebugLoc DL = MI->getDebugLoc();
1614 
1615   if (Pos == Position::AFTER)
1616     ++MI;
1617 
1618   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1619     switch (Scope) {
1620     case SIAtomicScope::SYSTEM:
1621     case SIAtomicScope::AGENT:
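      // gfx10 exposes two invalidatable vector cache levels here: the per-CU
      // L0 and the shared GL1. Both are invalidated so later loads by this
      // wave re-fetch data made visible at agent or system scope.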
1622       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1623       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1624       Changed = true;
1625       break;
1626     case SIAtomicScope::WORKGROUP:
1627       // In WGP mode the waves of a work-group can be executing on either CU of
1628       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1629       // in CU mode all waves of a work-group are on the same CU, and so the
1630       // L0 does not need to be invalidated.
1631       if (!ST.isCuModeEnabled()) {
1632         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1633         Changed = true;
1634       }
1635       break;
1636     case SIAtomicScope::WAVEFRONT:
1637     case SIAtomicScope::SINGLETHREAD:
1638       // No cache to invalidate.
1639       break;
1640     default:
1641       llvm_unreachable("Unsupported synchronization scope");
1642     }
1643   }
1644 
1645   /// The scratch address space does not need the global memory cache
1646   /// to be flushed as all memory operations by the same thread are
1647   /// sequentially consistent, and no other thread can access scratch
1648   /// memory.
1649 
1650   /// Other address spaces do not have a cache.
1651 
1652   if (Pos == Position::AFTER)
1653     --MI;
1654 
1655   return Changed;
1656 }
1657 
1658 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
1659   if (AtomicPseudoMIs.empty())
1660     return false;
1661 
1662   for (auto &MI : AtomicPseudoMIs)
1663     MI->eraseFromParent();
1664 
1665   AtomicPseudoMIs.clear();
1666   return true;
1667 }
1668 
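// Summary of the mapping applied below (derived from the code itself):
// monotonic/acquire/seq_cst atomic loads bypass caches up to the requested
// scope; seq_cst additionally waits for all prior memory operations before
// the load; acquire and seq_cst wait for the load itself to complete and then
// invalidate caches so stale data is not read afterwards.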
1669 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
1670                                    MachineBasicBlock::iterator &MI) {
1671   assert(MI->mayLoad() && !MI->mayStore());
1672 
1673   bool Changed = false;
1674 
1675   if (MOI.isAtomic()) {
1676     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1677         MOI.getOrdering() == AtomicOrdering::Acquire ||
1678         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1679       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
1680                                            MOI.getOrderingAddrSpace());
1681     }
1682 
1683     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1684       Changed |= CC->insertWait(MI, MOI.getScope(),
1685                                 MOI.getOrderingAddrSpace(),
1686                                 SIMemOp::LOAD | SIMemOp::STORE,
1687                                 MOI.getIsCrossAddressSpaceOrdering(),
1688                                 Position::BEFORE);
1689 
1690     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1691         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1692       Changed |= CC->insertWait(MI, MOI.getScope(),
1693                                 MOI.getInstrAddrSpace(),
1694                                 SIMemOp::LOAD,
1695                                 MOI.getIsCrossAddressSpaceOrdering(),
1696                                 Position::AFTER);
1697       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1698                                    MOI.getOrderingAddrSpace(),
1699                                    Position::AFTER);
1700     }
1701 
1702     return Changed;
1703   }
1704 
1705   // Atomic instructions already bypass caches to the scope specified by the
1706   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1707   // need additional treatment.
1708   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
1709                                                 SIMemOp::LOAD, MOI.isVolatile(),
1710                                                 MOI.isNonTemporal());
1711   return Changed;
1712 }
1713 
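// Summary of the mapping applied below (derived from the code itself):
// monotonic/release/seq_cst atomic stores bypass caches up to the requested
// scope, and release/seq_cst stores are preceded by a release (a wait plus
// any needed cache write-back) so prior writes are visible before the store.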
1714 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
1715                                     MachineBasicBlock::iterator &MI) {
1716   assert(!MI->mayLoad() && MI->mayStore());
1717 
1718   bool Changed = false;
1719 
1720   if (MOI.isAtomic()) {
1721     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1722         MOI.getOrdering() == AtomicOrdering::Release ||
1723         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1724       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
1725                                             MOI.getOrderingAddrSpace());
1726     }
1727 
1728     if (MOI.getOrdering() == AtomicOrdering::Release ||
1729         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1730       Changed |= CC->insertRelease(MI, MOI.getScope(),
1731                                    MOI.getOrderingAddrSpace(),
1732                                    MOI.getIsCrossAddressSpaceOrdering(),
1733                                    Position::BEFORE);
1734 
1735     return Changed;
1736   }
1737 
1738   // Atomic instructions already bypass caches to the scope specified by the
1739   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1740   // need additional treatment.
1741   Changed |= CC->enableVolatileAndOrNonTemporal(
1742       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
1743       MOI.isNonTemporal());
1744   return Changed;
1745 }
1746 
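// Summary of the mapping applied below (derived from the code itself): all of
// acquire/release/acq_rel/seq_cst fences insert a release at the fence
// position, and acquire/acq_rel/seq_cst fences additionally insert an acquire
// (cache invalidate) there, using the fence's scope and ordering address
// space.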
1747 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
1748                                           MachineBasicBlock::iterator &MI) {
1749   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
1750 
1751   AtomicPseudoMIs.push_back(MI);
1752   bool Changed = false;
1753 
1754   if (MOI.isAtomic()) {
1755     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1756         MOI.getOrdering() == AtomicOrdering::Release ||
1757         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1758         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1759       /// TODO: This relies on a barrier always generating a waitcnt
1760       /// for LDS to ensure it is not reordered with the completion of
1761       /// the preceding LDS operations. If the barrier had a memory
1762       /// ordering and memory scope, then the library would not need to
1763       /// generate a fence. Support for barrier could be added in this
1764       /// file; SIInsertWaitcnt.cpp could then stop unconditionally
1765       /// adding S_WAITCNT before an S_BARRIER.
1766       Changed |= CC->insertRelease(MI, MOI.getScope(),
1767                                    MOI.getOrderingAddrSpace(),
1768                                    MOI.getIsCrossAddressSpaceOrdering(),
1769                                    Position::BEFORE);
1770 
1771     // TODO: If both release and invalidate are happening they could be combined
1772     // to use the single "BUFFER_WBINV*" instruction. This could be done by
1773     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
1774     // track cache invalidate and write back instructions.
1775 
1776     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1777         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1778         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1779       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1780                                    MOI.getOrderingAddrSpace(),
1781                                    Position::BEFORE);
1782 
1783     return Changed;
1784   }
1785 
1786   return Changed;
1787 }
1788 
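// Summary of the mapping applied below (derived from the code itself): cache
// bypass is enabled for every atomic ordering; release-like orderings
// (including a seq_cst failure ordering) get a release before the
// instruction; acquire-like orderings (including acquire/seq_cst failure
// orderings) get a wait on the instruction's completion followed by an
// acquire after it.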
1789 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
1790   MachineBasicBlock::iterator &MI) {
1791   assert(MI->mayLoad() && MI->mayStore());
1792 
1793   bool Changed = false;
1794 
1795   if (MOI.isAtomic()) {
1796     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1797         MOI.getOrdering() == AtomicOrdering::Acquire ||
1798         MOI.getOrdering() == AtomicOrdering::Release ||
1799         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1800         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1801       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
1802                                           MOI.getInstrAddrSpace());
1803     }
1804 
1805     if (MOI.getOrdering() == AtomicOrdering::Release ||
1806         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1807         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1808         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
1809       Changed |= CC->insertRelease(MI, MOI.getScope(),
1810                                    MOI.getOrderingAddrSpace(),
1811                                    MOI.getIsCrossAddressSpaceOrdering(),
1812                                    Position::BEFORE);
1813 
1814     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1815         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1816         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1817         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
1818         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
1819       Changed |= CC->insertWait(MI, MOI.getScope(),
1820                                 MOI.getInstrAddrSpace(),
1821                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
1822                                                    SIMemOp::STORE,
1823                                 MOI.getIsCrossAddressSpaceOrdering(),
1824                                 Position::AFTER);
1825       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1826                                    MOI.getOrderingAddrSpace(),
1827                                    Position::AFTER);
1828     }
1829 
1830     return Changed;
1831   }
1832 
1833   return Changed;
1834 }
1835 
1836 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
1837   bool Changed = false;
1838 
1839   SIMemOpAccess MOA(MF);
1840   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
1841 
1842   for (auto &MBB : MF) {
1843     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
1844 
1845       // Unbundle instructions after the post-RA scheduler.
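      // The legalizer inspects and annotates individual memory instructions,
      // so bundles containing memory operations are dissolved first: each
      // bundled instruction is detached from its predecessor, the
      // internal-read flag (only meaningful inside a bundle) is cleared from
      // its register operands, and the BUNDLE header is erased.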
1846       if (MI->isBundle() && MI->mayLoadOrStore()) {
1847         MachineBasicBlock::instr_iterator II(MI->getIterator());
1848         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
1849              I != E && I->isBundledWithPred(); ++I) {
1850           I->unbundleFromPred();
1851           for (MachineOperand &MO : I->operands())
1852             if (MO.isReg())
1853               MO.setIsInternalRead(false);
1854         }
1855 
1856         MI->eraseFromParent();
1857         MI = II->getIterator();
1858       }
1859 
1860       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
1861         continue;
1862 
1863       if (const auto &MOI = MOA.getLoadInfo(MI))
1864         Changed |= expandLoad(MOI.getValue(), MI);
1865       else if (const auto &MOI = MOA.getStoreInfo(MI))
1866         Changed |= expandStore(MOI.getValue(), MI);
1867       else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
1868         Changed |= expandAtomicFence(MOI.getValue(), MI);
1869       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
1870         Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
1871     }
1872   }
1873 
1874   Changed |= removeAtomicPseudoMIs();
1875   return Changed;
1876 }
1877 
1878 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
1879 
1880 char SIMemoryLegalizer::ID = 0;
1881 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
1882 
1883 FunctionPass *llvm::createSIMemoryLegalizerPass() {
1884   return new SIMemoryLegalizer();
1885 }
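// Illustrative usage (a sketch, not a quote of the target's pipeline code):
// a GCN pass configuration would typically schedule this pass after the
// post-RA scheduler, e.g.
//
//   addPass(createSIMemoryLegalizerPass());
//
// which matches the assumption above that post-RA bundles may still be
// present when runOnMachineFunction executes.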
1886