1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/ADT/StringExtras.h"
22 #include "llvm/CodeGen/MachineBasicBlock.h"
23 #include "llvm/CodeGen/MachineFunctionPass.h"
24 #include "llvm/IR/DiagnosticInfo.h"
25 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
26 #include "llvm/Support/AtomicOrdering.h"
27 #include "llvm/TargetParser/TargetParser.h"
28
29 using namespace llvm;
30 using namespace llvm::AMDGPU;
31
32 #define DEBUG_TYPE "si-memory-legalizer"
33 #define PASS_NAME "SI Memory Legalizer"
34
35 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
36 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
37 cl::desc("Use this to skip inserting cache invalidating instructions."));
38
39 namespace {
40
41 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
42
43 /// Memory operation flags. Can be ORed together.
44 enum class SIMemOp {
45 NONE = 0u,
46 LOAD = 1u << 0,
47 STORE = 1u << 1,
48 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
49 };
50
51 /// Position to insert a new instruction relative to an existing
52 /// instruction.
53 enum class Position {
54 BEFORE,
55 AFTER
56 };
57
58 /// The atomic synchronization scopes supported by the AMDGPU target.
59 enum class SIAtomicScope {
60 NONE,
61 SINGLETHREAD,
62 WAVEFRONT,
63 WORKGROUP,
64 AGENT,
65 SYSTEM
66 };
67
68 /// The distinct address spaces supported by the AMDGPU target for
69 /// atomic memory operation. Can be ORed together.
70 enum class SIAtomicAddrSpace {
71 NONE = 0u,
72 GLOBAL = 1u << 0,
73 LDS = 1u << 1,
74 SCRATCH = 1u << 2,
75 GDS = 1u << 3,
76 OTHER = 1u << 4,
77
78 /// The address spaces that can be accessed by a FLAT instruction.
79 FLAT = GLOBAL | LDS | SCRATCH,
80
81 /// The address spaces that support atomic instructions.
82 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
83
84 /// All address spaces.
85 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
86
87 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
88 };
89
90 class SIMemOpInfo final {
91 private:
92
93 friend class SIMemOpAccess;
94
95 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
96 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
97 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
98 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
99 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
100 bool IsCrossAddressSpaceOrdering = false;
101 bool IsVolatile = false;
102 bool IsNonTemporal = false;
103 bool IsLastUse = false;
104
SIMemOpInfo(AtomicOrdering Ordering=AtomicOrdering::SequentiallyConsistent,SIAtomicScope Scope=SIAtomicScope::SYSTEM,SIAtomicAddrSpace OrderingAddrSpace=SIAtomicAddrSpace::ATOMIC,SIAtomicAddrSpace InstrAddrSpace=SIAtomicAddrSpace::ALL,bool IsCrossAddressSpaceOrdering=true,AtomicOrdering FailureOrdering=AtomicOrdering::SequentiallyConsistent,bool IsVolatile=false,bool IsNonTemporal=false,bool IsLastUse=false)105 SIMemOpInfo(
106 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
107 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
108 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
109 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
110 bool IsCrossAddressSpaceOrdering = true,
111 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
112 bool IsVolatile = false, bool IsNonTemporal = false,
113 bool IsLastUse = false)
114 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
115 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
116 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
117 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
118 IsLastUse(IsLastUse) {
119
120 if (Ordering == AtomicOrdering::NotAtomic) {
121 assert(Scope == SIAtomicScope::NONE &&
122 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
123 !IsCrossAddressSpaceOrdering &&
124 FailureOrdering == AtomicOrdering::NotAtomic);
125 return;
126 }
127
128 assert(Scope != SIAtomicScope::NONE &&
129 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130 SIAtomicAddrSpace::NONE &&
131 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
132 SIAtomicAddrSpace::NONE);
133
134 // There is also no cross address space ordering if the ordering
135 // address space is the same as the instruction address space and
136 // only contains a single address space.
137 if ((OrderingAddrSpace == InstrAddrSpace) &&
138 isPowerOf2_32(uint32_t(InstrAddrSpace)))
139 this->IsCrossAddressSpaceOrdering = false;
140
141 // Limit the scope to the maximum supported by the instruction's address
142 // spaces.
143 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
144 SIAtomicAddrSpace::NONE) {
145 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
146 } else if ((InstrAddrSpace &
147 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
148 SIAtomicAddrSpace::NONE) {
149 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
150 } else if ((InstrAddrSpace &
151 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
152 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
153 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
154 }
155 }
156
157 public:
158 /// \returns Atomic synchronization scope of the machine instruction used to
159 /// create this SIMemOpInfo.
getScope() const160 SIAtomicScope getScope() const {
161 return Scope;
162 }
163
164 /// \returns Ordering constraint of the machine instruction used to
165 /// create this SIMemOpInfo.
getOrdering() const166 AtomicOrdering getOrdering() const {
167 return Ordering;
168 }
169
170 /// \returns Failure ordering constraint of the machine instruction used to
171 /// create this SIMemOpInfo.
getFailureOrdering() const172 AtomicOrdering getFailureOrdering() const {
173 return FailureOrdering;
174 }
175
176 /// \returns The address spaces be accessed by the machine
177 /// instruction used to create this SIMemOpInfo.
getInstrAddrSpace() const178 SIAtomicAddrSpace getInstrAddrSpace() const {
179 return InstrAddrSpace;
180 }
181
182 /// \returns The address spaces that must be ordered by the machine
183 /// instruction used to create this SIMemOpInfo.
getOrderingAddrSpace() const184 SIAtomicAddrSpace getOrderingAddrSpace() const {
185 return OrderingAddrSpace;
186 }
187
188 /// \returns Return true iff memory ordering of operations on
189 /// different address spaces is required.
getIsCrossAddressSpaceOrdering() const190 bool getIsCrossAddressSpaceOrdering() const {
191 return IsCrossAddressSpaceOrdering;
192 }
193
194 /// \returns True if memory access of the machine instruction used to
195 /// create this SIMemOpInfo is volatile, false otherwise.
isVolatile() const196 bool isVolatile() const {
197 return IsVolatile;
198 }
199
200 /// \returns True if memory access of the machine instruction used to
201 /// create this SIMemOpInfo is nontemporal, false otherwise.
isNonTemporal() const202 bool isNonTemporal() const {
203 return IsNonTemporal;
204 }
205
206 /// \returns True if memory access of the machine instruction used to
207 /// create this SIMemOpInfo is last use, false otherwise.
isLastUse() const208 bool isLastUse() const { return IsLastUse; }
209
210 /// \returns True if ordering constraint of the machine instruction used to
211 /// create this SIMemOpInfo is unordered or higher, false otherwise.
isAtomic() const212 bool isAtomic() const {
213 return Ordering != AtomicOrdering::NotAtomic;
214 }
215
216 };
217
218 class SIMemOpAccess final {
219 private:
220 const AMDGPUMachineModuleInfo *MMI = nullptr;
221
222 /// Reports unsupported message \p Msg for \p MI to LLVM context.
223 void reportUnsupported(const MachineBasicBlock::iterator &MI,
224 const char *Msg) const;
225
226 /// Inspects the target synchronization scope \p SSID and determines
227 /// the SI atomic scope it corresponds to, the address spaces it
228 /// covers, and whether the memory ordering applies between address
229 /// spaces.
230 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
231 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
232
233 /// \return Return a bit set of the address spaces accessed by \p AS.
234 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
235
236 /// \returns Info constructed from \p MI, which has at least machine memory
237 /// operand.
238 std::optional<SIMemOpInfo>
239 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
240
241 public:
242 /// Construct class to support accessing the machine memory operands
243 /// of instructions in the machine function \p MF.
244 SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);
245
246 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
247 std::optional<SIMemOpInfo>
248 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
249
250 /// \returns Store info if \p MI is a store operation, "std::nullopt"
251 /// otherwise.
252 std::optional<SIMemOpInfo>
253 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
254
255 /// \returns Atomic fence info if \p MI is an atomic fence operation,
256 /// "std::nullopt" otherwise.
257 std::optional<SIMemOpInfo>
258 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
259
260 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
261 /// rmw operation, "std::nullopt" otherwise.
262 std::optional<SIMemOpInfo>
263 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
264 };
265
266 class SICacheControl {
267 protected:
268
269 /// AMDGPU subtarget info.
270 const GCNSubtarget &ST;
271
272 /// Instruction info.
273 const SIInstrInfo *TII = nullptr;
274
275 IsaVersion IV;
276
277 /// Whether to insert cache invalidating instructions.
278 bool InsertCacheInv;
279
280 SICacheControl(const GCNSubtarget &ST);
281
282 /// Sets named bit \p BitName to "true" if present in instruction \p MI.
283 /// \returns Returns true if \p MI is modified, false otherwise.
284 bool enableNamedBit(const MachineBasicBlock::iterator MI,
285 AMDGPU::CPol::CPol Bit) const;
286
287 public:
288
289 /// Create a cache control for the subtarget \p ST.
290 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
291
292 /// Update \p MI memory load instruction to bypass any caches up to
293 /// the \p Scope memory scope for address spaces \p
294 /// AddrSpace. Return true iff the instruction was modified.
295 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
296 SIAtomicScope Scope,
297 SIAtomicAddrSpace AddrSpace) const = 0;
298
299 /// Update \p MI memory store instruction to bypass any caches up to
300 /// the \p Scope memory scope for address spaces \p
301 /// AddrSpace. Return true iff the instruction was modified.
302 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
303 SIAtomicScope Scope,
304 SIAtomicAddrSpace AddrSpace) const = 0;
305
306 /// Update \p MI memory read-modify-write instruction to bypass any caches up
307 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
308 /// iff the instruction was modified.
309 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
310 SIAtomicScope Scope,
311 SIAtomicAddrSpace AddrSpace) const = 0;
312
313 /// Update \p MI memory instruction of kind \p Op associated with address
314 /// spaces \p AddrSpace to indicate it is volatile and/or
315 /// nontemporal/last-use. Return true iff the instruction was modified.
316 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
317 SIAtomicAddrSpace AddrSpace,
318 SIMemOp Op, bool IsVolatile,
319 bool IsNonTemporal,
320 bool IsLastUse = false) const = 0;
321
expandSystemScopeStore(MachineBasicBlock::iterator & MI) const322 virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
323 return false;
324 };
325
326 /// Inserts any necessary instructions at position \p Pos relative
327 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
328 /// \p Op associated with address spaces \p AddrSpace have completed. Used
329 /// between memory instructions to enforce the order they become visible as
330 /// observed by other memory instructions executing in memory scope \p Scope.
331 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
332 /// address spaces. Returns true iff any instructions inserted.
333 virtual bool insertWait(MachineBasicBlock::iterator &MI,
334 SIAtomicScope Scope,
335 SIAtomicAddrSpace AddrSpace,
336 SIMemOp Op,
337 bool IsCrossAddrSpaceOrdering,
338 Position Pos) const = 0;
339
340 /// Inserts any necessary instructions at position \p Pos relative to
341 /// instruction \p MI to ensure any subsequent memory instructions of this
342 /// thread with address spaces \p AddrSpace will observe the previous memory
343 /// operations by any thread for memory scopes up to memory scope \p Scope .
344 /// Returns true iff any instructions inserted.
345 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
346 SIAtomicScope Scope,
347 SIAtomicAddrSpace AddrSpace,
348 Position Pos) const = 0;
349
350 /// Inserts any necessary instructions at position \p Pos relative to
351 /// instruction \p MI to ensure previous memory instructions by this thread
352 /// with address spaces \p AddrSpace have completed and can be observed by
353 /// subsequent memory instructions by any thread executing in memory scope \p
354 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
355 /// between address spaces. Returns true iff any instructions inserted.
356 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
357 SIAtomicScope Scope,
358 SIAtomicAddrSpace AddrSpace,
359 bool IsCrossAddrSpaceOrdering,
360 Position Pos) const = 0;
361
362 /// Virtual destructor to allow derivations to be deleted.
363 virtual ~SICacheControl() = default;
364
tryForceStoreSC0SC1(const SIMemOpInfo & MOI,MachineBasicBlock::iterator & MI) const365 virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
366 MachineBasicBlock::iterator &MI) const {
367 return false;
368 }
369 };
370
371 class SIGfx6CacheControl : public SICacheControl {
372 protected:
373
374 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
375 /// is modified, false otherwise.
enableGLCBit(const MachineBasicBlock::iterator & MI) const376 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
377 return enableNamedBit(MI, AMDGPU::CPol::GLC);
378 }
379
380 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
381 /// is modified, false otherwise.
enableSLCBit(const MachineBasicBlock::iterator & MI) const382 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
383 return enableNamedBit(MI, AMDGPU::CPol::SLC);
384 }
385
386 public:
387
SIGfx6CacheControl(const GCNSubtarget & ST)388 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
389
390 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
391 SIAtomicScope Scope,
392 SIAtomicAddrSpace AddrSpace) const override;
393
394 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
395 SIAtomicScope Scope,
396 SIAtomicAddrSpace AddrSpace) const override;
397
398 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
399 SIAtomicScope Scope,
400 SIAtomicAddrSpace AddrSpace) const override;
401
402 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
403 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
404 bool IsVolatile, bool IsNonTemporal,
405 bool IsLastUse) const override;
406
407 bool insertWait(MachineBasicBlock::iterator &MI,
408 SIAtomicScope Scope,
409 SIAtomicAddrSpace AddrSpace,
410 SIMemOp Op,
411 bool IsCrossAddrSpaceOrdering,
412 Position Pos) const override;
413
414 bool insertAcquire(MachineBasicBlock::iterator &MI,
415 SIAtomicScope Scope,
416 SIAtomicAddrSpace AddrSpace,
417 Position Pos) const override;
418
419 bool insertRelease(MachineBasicBlock::iterator &MI,
420 SIAtomicScope Scope,
421 SIAtomicAddrSpace AddrSpace,
422 bool IsCrossAddrSpaceOrdering,
423 Position Pos) const override;
424 };
425
426 class SIGfx7CacheControl : public SIGfx6CacheControl {
427 public:
428
SIGfx7CacheControl(const GCNSubtarget & ST)429 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
430
431 bool insertAcquire(MachineBasicBlock::iterator &MI,
432 SIAtomicScope Scope,
433 SIAtomicAddrSpace AddrSpace,
434 Position Pos) const override;
435
436 };
437
438 class SIGfx90ACacheControl : public SIGfx7CacheControl {
439 public:
440
SIGfx90ACacheControl(const GCNSubtarget & ST)441 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
442
443 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
444 SIAtomicScope Scope,
445 SIAtomicAddrSpace AddrSpace) const override;
446
447 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
448 SIAtomicScope Scope,
449 SIAtomicAddrSpace AddrSpace) const override;
450
451 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
452 SIAtomicScope Scope,
453 SIAtomicAddrSpace AddrSpace) const override;
454
455 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
456 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
457 bool IsVolatile, bool IsNonTemporal,
458 bool IsLastUse) const override;
459
460 bool insertWait(MachineBasicBlock::iterator &MI,
461 SIAtomicScope Scope,
462 SIAtomicAddrSpace AddrSpace,
463 SIMemOp Op,
464 bool IsCrossAddrSpaceOrdering,
465 Position Pos) const override;
466
467 bool insertAcquire(MachineBasicBlock::iterator &MI,
468 SIAtomicScope Scope,
469 SIAtomicAddrSpace AddrSpace,
470 Position Pos) const override;
471
472 bool insertRelease(MachineBasicBlock::iterator &MI,
473 SIAtomicScope Scope,
474 SIAtomicAddrSpace AddrSpace,
475 bool IsCrossAddrSpaceOrdering,
476 Position Pos) const override;
477 };
478
479 class SIGfx940CacheControl : public SIGfx90ACacheControl {
480 protected:
481
482 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
483 /// is modified, false otherwise.
enableSC0Bit(const MachineBasicBlock::iterator & MI) const484 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
485 return enableNamedBit(MI, AMDGPU::CPol::SC0);
486 }
487
488 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
489 /// is modified, false otherwise.
enableSC1Bit(const MachineBasicBlock::iterator & MI) const490 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
491 return enableNamedBit(MI, AMDGPU::CPol::SC1);
492 }
493
494 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
495 /// is modified, false otherwise.
enableNTBit(const MachineBasicBlock::iterator & MI) const496 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
497 return enableNamedBit(MI, AMDGPU::CPol::NT);
498 }
499
500 public:
501
SIGfx940CacheControl(const GCNSubtarget & ST)502 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
503
504 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
505 SIAtomicScope Scope,
506 SIAtomicAddrSpace AddrSpace) const override;
507
508 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
509 SIAtomicScope Scope,
510 SIAtomicAddrSpace AddrSpace) const override;
511
512 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
513 SIAtomicScope Scope,
514 SIAtomicAddrSpace AddrSpace) const override;
515
516 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
517 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
518 bool IsVolatile, bool IsNonTemporal,
519 bool IsLastUse) const override;
520
521 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
522 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
523
524 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
525 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
526 Position Pos) const override;
527
tryForceStoreSC0SC1(const SIMemOpInfo & MOI,MachineBasicBlock::iterator & MI) const528 bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
529 MachineBasicBlock::iterator &MI) const override {
530 bool Changed = false;
531 if (ST.hasForceStoreSC0SC1() &&
532 (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
533 SIAtomicAddrSpace::GLOBAL |
534 SIAtomicAddrSpace::OTHER)) !=
535 SIAtomicAddrSpace::NONE) {
536 Changed |= enableSC0Bit(MI);
537 Changed |= enableSC1Bit(MI);
538 }
539 return Changed;
540 }
541 };
542
543 class SIGfx10CacheControl : public SIGfx7CacheControl {
544 protected:
545
546 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
547 /// is modified, false otherwise.
enableDLCBit(const MachineBasicBlock::iterator & MI) const548 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
549 return enableNamedBit(MI, AMDGPU::CPol::DLC);
550 }
551
552 public:
553
SIGfx10CacheControl(const GCNSubtarget & ST)554 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
555
556 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
557 SIAtomicScope Scope,
558 SIAtomicAddrSpace AddrSpace) const override;
559
560 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
561 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
562 bool IsVolatile, bool IsNonTemporal,
563 bool IsLastUse) const override;
564
565 bool insertWait(MachineBasicBlock::iterator &MI,
566 SIAtomicScope Scope,
567 SIAtomicAddrSpace AddrSpace,
568 SIMemOp Op,
569 bool IsCrossAddrSpaceOrdering,
570 Position Pos) const override;
571
572 bool insertAcquire(MachineBasicBlock::iterator &MI,
573 SIAtomicScope Scope,
574 SIAtomicAddrSpace AddrSpace,
575 Position Pos) const override;
576 };
577
578 class SIGfx11CacheControl : public SIGfx10CacheControl {
579 public:
SIGfx11CacheControl(const GCNSubtarget & ST)580 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
581
582 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
583 SIAtomicScope Scope,
584 SIAtomicAddrSpace AddrSpace) const override;
585
586 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
587 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
588 bool IsVolatile, bool IsNonTemporal,
589 bool IsLastUse) const override;
590 };
591
592 class SIGfx12CacheControl : public SIGfx11CacheControl {
593 protected:
594 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
595 // \returns Returns true if \p MI is modified, false otherwise.
596 bool setTH(const MachineBasicBlock::iterator MI,
597 AMDGPU::CPol::CPol Value) const;
598 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
599 // MI. \returns Returns true if \p MI is modified, false otherwise.
600 bool setScope(const MachineBasicBlock::iterator MI,
601 AMDGPU::CPol::CPol Value) const;
602
603 // Stores with system scope (SCOPE_SYS) need to wait for:
604 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
605 // - non-returning-atomics - wait for STORECNT==0
606 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
607 // since it does not distinguish atomics-with-return from regular stores.
608 // There is no need to wait if memory is cached (mtype != UC).
609 bool
610 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
611
612 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
613 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
614
615 public:
SIGfx12CacheControl(const GCNSubtarget & ST)616 SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
617
618 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
619 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
620 bool IsCrossAddrSpaceOrdering, Position Pos) const override;
621
622 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
623 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
624
625 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
626 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
627 bool IsVolatile, bool IsNonTemporal,
628 bool IsLastUse) const override;
629
630 bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
631
632 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
633 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
634 Position Pos) const override;
635
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const636 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
637 SIAtomicScope Scope,
638 SIAtomicAddrSpace AddrSpace) const override {
639 return setAtomicScope(MI, Scope, AddrSpace);
640 }
641
enableStoreCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const642 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
643 SIAtomicScope Scope,
644 SIAtomicAddrSpace AddrSpace) const override {
645 return setAtomicScope(MI, Scope, AddrSpace);
646 }
647
enableRMWCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const648 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
649 SIAtomicScope Scope,
650 SIAtomicAddrSpace AddrSpace) const override {
651 return setAtomicScope(MI, Scope, AddrSpace);
652 }
653 };
654
655 class SIMemoryLegalizer final : public MachineFunctionPass {
656 private:
657
658 /// Cache Control.
659 std::unique_ptr<SICacheControl> CC = nullptr;
660
661 /// List of atomic pseudo instructions.
662 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
663
664 /// Return true iff instruction \p MI is a atomic instruction that
665 /// returns a result.
isAtomicRet(const MachineInstr & MI) const666 bool isAtomicRet(const MachineInstr &MI) const {
667 return SIInstrInfo::isAtomicRet(MI);
668 }
669
670 /// Removes all processed atomic pseudo instructions from the current
671 /// function. Returns true if current function is modified, false otherwise.
672 bool removeAtomicPseudoMIs();
673
674 /// Expands load operation \p MI. Returns true if instructions are
675 /// added/deleted or \p MI is modified, false otherwise.
676 bool expandLoad(const SIMemOpInfo &MOI,
677 MachineBasicBlock::iterator &MI);
678 /// Expands store operation \p MI. Returns true if instructions are
679 /// added/deleted or \p MI is modified, false otherwise.
680 bool expandStore(const SIMemOpInfo &MOI,
681 MachineBasicBlock::iterator &MI);
682 /// Expands atomic fence operation \p MI. Returns true if
683 /// instructions are added/deleted or \p MI is modified, false otherwise.
684 bool expandAtomicFence(const SIMemOpInfo &MOI,
685 MachineBasicBlock::iterator &MI);
686 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
687 /// instructions are added/deleted or \p MI is modified, false otherwise.
688 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
689 MachineBasicBlock::iterator &MI);
690
691 public:
692 static char ID;
693
SIMemoryLegalizer()694 SIMemoryLegalizer() : MachineFunctionPass(ID) {}
695
getAnalysisUsage(AnalysisUsage & AU) const696 void getAnalysisUsage(AnalysisUsage &AU) const override {
697 AU.setPreservesCFG();
698 MachineFunctionPass::getAnalysisUsage(AU);
699 }
700
getPassName() const701 StringRef getPassName() const override {
702 return PASS_NAME;
703 }
704
705 bool runOnMachineFunction(MachineFunction &MF) override;
706 };
707
708 static const StringMap<SIAtomicAddrSpace> ASNames = {{
709 {"global", SIAtomicAddrSpace::GLOBAL},
710 {"local", SIAtomicAddrSpace::LDS},
711 }};
712
diagnoseUnknownMMRAASName(const MachineInstr & MI,StringRef AS)713 void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
714 const MachineFunction *MF = MI.getMF();
715 const Function &Fn = MF->getFunction();
716 SmallString<128> Str;
717 raw_svector_ostream OS(Str);
718 OS << "unknown address space '" << AS << "'; expected one of ";
719 ListSeparator LS;
720 for (const auto &[Name, Val] : ASNames)
721 OS << LS << '\'' << Name << '\'';
722 DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning);
723 Fn.getContext().diagnose(BadTag);
724 }
725
726 /// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
727 /// If this tag isn't present, or if it has no meaningful values, returns \p
728 /// Default. Otherwise returns all the address spaces concerned by the MMRA.
getFenceAddrSpaceMMRA(const MachineInstr & MI,SIAtomicAddrSpace Default)729 static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
730 SIAtomicAddrSpace Default) {
731 static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
732
733 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
734 if (!MMRA)
735 return Default;
736
737 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
738 for (const auto &[Prefix, Suffix] : MMRA) {
739 if (Prefix != FenceASPrefix)
740 continue;
741
742 if (auto It = ASNames.find(Suffix); It != ASNames.end())
743 Result |= It->second;
744 else
745 diagnoseUnknownMMRAASName(MI, Suffix);
746 }
747
748 return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
749 }
750
751 } // end anonymous namespace
752
reportUnsupported(const MachineBasicBlock::iterator & MI,const char * Msg) const753 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
754 const char *Msg) const {
755 const Function &Func = MI->getParent()->getParent()->getFunction();
756 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
757 Func.getContext().diagnose(Diag);
758 }
759
760 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
toSIAtomicScope(SyncScope::ID SSID,SIAtomicAddrSpace InstrAddrSpace) const761 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
762 SIAtomicAddrSpace InstrAddrSpace) const {
763 if (SSID == SyncScope::System)
764 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
765 if (SSID == MMI->getAgentSSID())
766 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
767 if (SSID == MMI->getWorkgroupSSID())
768 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
769 true);
770 if (SSID == MMI->getWavefrontSSID())
771 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
772 true);
773 if (SSID == SyncScope::SingleThread)
774 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
775 true);
776 if (SSID == MMI->getSystemOneAddressSpaceSSID())
777 return std::tuple(SIAtomicScope::SYSTEM,
778 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
779 if (SSID == MMI->getAgentOneAddressSpaceSSID())
780 return std::tuple(SIAtomicScope::AGENT,
781 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
782 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
783 return std::tuple(SIAtomicScope::WORKGROUP,
784 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
785 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
786 return std::tuple(SIAtomicScope::WAVEFRONT,
787 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
788 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
789 return std::tuple(SIAtomicScope::SINGLETHREAD,
790 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
791 return std::nullopt;
792 }
793
toSIAtomicAddrSpace(unsigned AS) const794 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
795 if (AS == AMDGPUAS::FLAT_ADDRESS)
796 return SIAtomicAddrSpace::FLAT;
797 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
798 return SIAtomicAddrSpace::GLOBAL;
799 if (AS == AMDGPUAS::LOCAL_ADDRESS)
800 return SIAtomicAddrSpace::LDS;
801 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
802 return SIAtomicAddrSpace::SCRATCH;
803 if (AS == AMDGPUAS::REGION_ADDRESS)
804 return SIAtomicAddrSpace::GDS;
805
806 return SIAtomicAddrSpace::OTHER;
807 }
808
SIMemOpAccess(const AMDGPUMachineModuleInfo & MMI_)809 SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
810 : MMI(&MMI_) {}
811
constructFromMIWithMMO(const MachineBasicBlock::iterator & MI) const812 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
813 const MachineBasicBlock::iterator &MI) const {
814 assert(MI->getNumMemOperands() > 0);
815
816 SyncScope::ID SSID = SyncScope::SingleThread;
817 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
818 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
819 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
820 bool IsNonTemporal = true;
821 bool IsVolatile = false;
822 bool IsLastUse = false;
823
824 // Validator should check whether or not MMOs cover the entire set of
825 // locations accessed by the memory instruction.
826 for (const auto &MMO : MI->memoperands()) {
827 IsNonTemporal &= MMO->isNonTemporal();
828 IsVolatile |= MMO->isVolatile();
829 IsLastUse |= MMO->getFlags() & MOLastUse;
830 InstrAddrSpace |=
831 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
832 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
833 if (OpOrdering != AtomicOrdering::NotAtomic) {
834 const auto &IsSyncScopeInclusion =
835 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
836 if (!IsSyncScopeInclusion) {
837 reportUnsupported(MI,
838 "Unsupported non-inclusive atomic synchronization scope");
839 return std::nullopt;
840 }
841
842 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
843 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
844 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
845 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
846 FailureOrdering =
847 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
848 }
849 }
850
851 SIAtomicScope Scope = SIAtomicScope::NONE;
852 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
853 bool IsCrossAddressSpaceOrdering = false;
854 if (Ordering != AtomicOrdering::NotAtomic) {
855 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
856 if (!ScopeOrNone) {
857 reportUnsupported(MI, "Unsupported atomic synchronization scope");
858 return std::nullopt;
859 }
860 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
861 *ScopeOrNone;
862 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
863 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
864 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
865 reportUnsupported(MI, "Unsupported atomic address space");
866 return std::nullopt;
867 }
868 }
869 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
870 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
871 IsNonTemporal, IsLastUse);
872 }
873
874 std::optional<SIMemOpInfo>
getLoadInfo(const MachineBasicBlock::iterator & MI) const875 SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
876 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
877
878 if (!(MI->mayLoad() && !MI->mayStore()))
879 return std::nullopt;
880
881 // Be conservative if there are no memory operands.
882 if (MI->getNumMemOperands() == 0)
883 return SIMemOpInfo();
884
885 return constructFromMIWithMMO(MI);
886 }
887
888 std::optional<SIMemOpInfo>
getStoreInfo(const MachineBasicBlock::iterator & MI) const889 SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
890 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
891
892 if (!(!MI->mayLoad() && MI->mayStore()))
893 return std::nullopt;
894
895 // Be conservative if there are no memory operands.
896 if (MI->getNumMemOperands() == 0)
897 return SIMemOpInfo();
898
899 return constructFromMIWithMMO(MI);
900 }
901
902 std::optional<SIMemOpInfo>
getAtomicFenceInfo(const MachineBasicBlock::iterator & MI) const903 SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
904 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
905
906 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
907 return std::nullopt;
908
909 AtomicOrdering Ordering =
910 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
911
912 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
913 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
914 if (!ScopeOrNone) {
915 reportUnsupported(MI, "Unsupported atomic synchronization scope");
916 return std::nullopt;
917 }
918
919 SIAtomicScope Scope = SIAtomicScope::NONE;
920 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
921 bool IsCrossAddressSpaceOrdering = false;
922 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
923 *ScopeOrNone;
924
925 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
926 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
927 reportUnsupported(MI, "Unsupported atomic address space");
928 return std::nullopt;
929 }
930
931 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
932 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
933 }
934
getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator & MI) const935 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
936 const MachineBasicBlock::iterator &MI) const {
937 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
938
939 if (!(MI->mayLoad() && MI->mayStore()))
940 return std::nullopt;
941
942 // Be conservative if there are no memory operands.
943 if (MI->getNumMemOperands() == 0)
944 return SIMemOpInfo();
945
946 return constructFromMIWithMMO(MI);
947 }
948
SICacheControl(const GCNSubtarget & ST)949 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
950 TII = ST.getInstrInfo();
951 IV = getIsaVersion(ST.getCPU());
952 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
953 }
954
enableNamedBit(const MachineBasicBlock::iterator MI,AMDGPU::CPol::CPol Bit) const955 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
956 AMDGPU::CPol::CPol Bit) const {
957 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
958 if (!CPol)
959 return false;
960
961 CPol->setImm(CPol->getImm() | Bit);
962 return true;
963 }
964
965 /* static */
create(const GCNSubtarget & ST)966 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
967 GCNSubtarget::Generation Generation = ST.getGeneration();
968 if (ST.hasGFX940Insts())
969 return std::make_unique<SIGfx940CacheControl>(ST);
970 if (ST.hasGFX90AInsts())
971 return std::make_unique<SIGfx90ACacheControl>(ST);
972 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
973 return std::make_unique<SIGfx6CacheControl>(ST);
974 if (Generation < AMDGPUSubtarget::GFX10)
975 return std::make_unique<SIGfx7CacheControl>(ST);
976 if (Generation < AMDGPUSubtarget::GFX11)
977 return std::make_unique<SIGfx10CacheControl>(ST);
978 if (Generation < AMDGPUSubtarget::GFX12)
979 return std::make_unique<SIGfx11CacheControl>(ST);
980 return std::make_unique<SIGfx12CacheControl>(ST);
981 }
982
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const983 bool SIGfx6CacheControl::enableLoadCacheBypass(
984 const MachineBasicBlock::iterator &MI,
985 SIAtomicScope Scope,
986 SIAtomicAddrSpace AddrSpace) const {
987 assert(MI->mayLoad() && !MI->mayStore());
988 bool Changed = false;
989
990 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
991 switch (Scope) {
992 case SIAtomicScope::SYSTEM:
993 case SIAtomicScope::AGENT:
994 // Set L1 cache policy to MISS_EVICT.
995 // Note: there is no L2 cache bypass policy at the ISA level.
996 Changed |= enableGLCBit(MI);
997 break;
998 case SIAtomicScope::WORKGROUP:
999 case SIAtomicScope::WAVEFRONT:
1000 case SIAtomicScope::SINGLETHREAD:
1001 // No cache to bypass.
1002 break;
1003 default:
1004 llvm_unreachable("Unsupported synchronization scope");
1005 }
1006 }
1007
1008 /// The scratch address space does not need the global memory caches
1009 /// to be bypassed as all memory operations by the same thread are
1010 /// sequentially consistent, and no other thread can access scratch
1011 /// memory.
1012
1013 /// Other address spaces do not have a cache.
1014
1015 return Changed;
1016 }
1017
enableStoreCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1018 bool SIGfx6CacheControl::enableStoreCacheBypass(
1019 const MachineBasicBlock::iterator &MI,
1020 SIAtomicScope Scope,
1021 SIAtomicAddrSpace AddrSpace) const {
1022 assert(!MI->mayLoad() && MI->mayStore());
1023 bool Changed = false;
1024
1025 /// The L1 cache is write through so does not need to be bypassed. There is no
1026 /// bypass control for the L2 cache at the isa level.
1027
1028 return Changed;
1029 }
1030
enableRMWCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1031 bool SIGfx6CacheControl::enableRMWCacheBypass(
1032 const MachineBasicBlock::iterator &MI,
1033 SIAtomicScope Scope,
1034 SIAtomicAddrSpace AddrSpace) const {
1035 assert(MI->mayLoad() && MI->mayStore());
1036 bool Changed = false;
1037
1038 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
1039 /// bypassed, and the GLC bit is instead used to indicate if they are
1040 /// return or no-return.
1041 /// Note: there is no L2 cache coherent bypass control at the ISA level.
1042
1043 return Changed;
1044 }
1045
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal,bool IsLastUse=false) const1046 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1047 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1048 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1049 // Only handle load and store, not atomic read-modify-write insructions. The
1050 // latter use glc to indicate if the atomic returns a result and so must not
1051 // be used for cache control.
1052 assert(MI->mayLoad() ^ MI->mayStore());
1053
1054 // Only update load and store, not LLVM IR atomic read-modify-write
1055 // instructions. The latter are always marked as volatile so cannot sensibly
1056 // handle it as do not want to pessimize all atomics. Also they do not support
1057 // the nontemporal attribute.
1058 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1059
1060 bool Changed = false;
1061
1062 if (IsVolatile) {
1063 // Set L1 cache policy to be MISS_EVICT for load instructions
1064 // and MISS_LRU for store instructions.
1065 // Note: there is no L2 cache bypass policy at the ISA level.
1066 if (Op == SIMemOp::LOAD)
1067 Changed |= enableGLCBit(MI);
1068
1069 // Ensure operation has completed at system scope to cause all volatile
1070 // operations to be visible outside the program in a global order. Do not
1071 // request cross address space as only the global address space can be
1072 // observable outside the program, so no need to cause a waitcnt for LDS
1073 // address space operations.
1074 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1075 Position::AFTER);
1076
1077 return Changed;
1078 }
1079
1080 if (IsNonTemporal) {
1081 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1082 // for both loads and stores, and the L2 cache policy to STREAM.
1083 Changed |= enableGLCBit(MI);
1084 Changed |= enableSLCBit(MI);
1085 return Changed;
1086 }
1087
1088 return Changed;
1089 }
1090
insertWait(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsCrossAddrSpaceOrdering,Position Pos) const1091 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1092 SIAtomicScope Scope,
1093 SIAtomicAddrSpace AddrSpace,
1094 SIMemOp Op,
1095 bool IsCrossAddrSpaceOrdering,
1096 Position Pos) const {
1097 bool Changed = false;
1098
1099 MachineBasicBlock &MBB = *MI->getParent();
1100 DebugLoc DL = MI->getDebugLoc();
1101
1102 if (Pos == Position::AFTER)
1103 ++MI;
1104
1105 bool VMCnt = false;
1106 bool LGKMCnt = false;
1107
1108 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1109 SIAtomicAddrSpace::NONE) {
1110 switch (Scope) {
1111 case SIAtomicScope::SYSTEM:
1112 case SIAtomicScope::AGENT:
1113 VMCnt |= true;
1114 break;
1115 case SIAtomicScope::WORKGROUP:
1116 case SIAtomicScope::WAVEFRONT:
1117 case SIAtomicScope::SINGLETHREAD:
1118 // The L1 cache keeps all memory operations in order for
1119 // wavefronts in the same work-group.
1120 break;
1121 default:
1122 llvm_unreachable("Unsupported synchronization scope");
1123 }
1124 }
1125
1126 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1127 switch (Scope) {
1128 case SIAtomicScope::SYSTEM:
1129 case SIAtomicScope::AGENT:
1130 case SIAtomicScope::WORKGROUP:
1131 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1132 // not needed as LDS operations for all waves are executed in a total
1133 // global ordering as observed by all waves. Required if also
1134 // synchronizing with global/GDS memory as LDS operations could be
1135 // reordered with respect to later global/GDS memory operations of the
1136 // same wave.
1137 LGKMCnt |= IsCrossAddrSpaceOrdering;
1138 break;
1139 case SIAtomicScope::WAVEFRONT:
1140 case SIAtomicScope::SINGLETHREAD:
1141 // The LDS keeps all memory operations in order for
1142 // the same wavefront.
1143 break;
1144 default:
1145 llvm_unreachable("Unsupported synchronization scope");
1146 }
1147 }
1148
1149 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1150 switch (Scope) {
1151 case SIAtomicScope::SYSTEM:
1152 case SIAtomicScope::AGENT:
1153 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1154 // is not needed as GDS operations for all waves are executed in a total
1155 // global ordering as observed by all waves. Required if also
1156 // synchronizing with global/LDS memory as GDS operations could be
1157 // reordered with respect to later global/LDS memory operations of the
1158 // same wave.
1159 LGKMCnt |= IsCrossAddrSpaceOrdering;
1160 break;
1161 case SIAtomicScope::WORKGROUP:
1162 case SIAtomicScope::WAVEFRONT:
1163 case SIAtomicScope::SINGLETHREAD:
1164 // The GDS keeps all memory operations in order for
1165 // the same work-group.
1166 break;
1167 default:
1168 llvm_unreachable("Unsupported synchronization scope");
1169 }
1170 }
1171
1172 if (VMCnt || LGKMCnt) {
1173 unsigned WaitCntImmediate =
1174 AMDGPU::encodeWaitcnt(IV,
1175 VMCnt ? 0 : getVmcntBitMask(IV),
1176 getExpcntBitMask(IV),
1177 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1178 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1179 .addImm(WaitCntImmediate);
1180 Changed = true;
1181 }
1182
1183 if (Pos == Position::AFTER)
1184 --MI;
1185
1186 return Changed;
1187 }
1188
insertAcquire(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,Position Pos) const1189 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1190 SIAtomicScope Scope,
1191 SIAtomicAddrSpace AddrSpace,
1192 Position Pos) const {
1193 if (!InsertCacheInv)
1194 return false;
1195
1196 bool Changed = false;
1197
1198 MachineBasicBlock &MBB = *MI->getParent();
1199 DebugLoc DL = MI->getDebugLoc();
1200
1201 if (Pos == Position::AFTER)
1202 ++MI;
1203
1204 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1205 switch (Scope) {
1206 case SIAtomicScope::SYSTEM:
1207 case SIAtomicScope::AGENT:
1208 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1209 Changed = true;
1210 break;
1211 case SIAtomicScope::WORKGROUP:
1212 case SIAtomicScope::WAVEFRONT:
1213 case SIAtomicScope::SINGLETHREAD:
1214 // No cache to invalidate.
1215 break;
1216 default:
1217 llvm_unreachable("Unsupported synchronization scope");
1218 }
1219 }
1220
1221 /// The scratch address space does not need the global memory cache
1222 /// to be flushed as all memory operations by the same thread are
1223 /// sequentially consistent, and no other thread can access scratch
1224 /// memory.
1225
1226 /// Other address spaces do not have a cache.
1227
1228 if (Pos == Position::AFTER)
1229 --MI;
1230
1231 return Changed;
1232 }
1233
insertRelease(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,bool IsCrossAddrSpaceOrdering,Position Pos) const1234 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1235 SIAtomicScope Scope,
1236 SIAtomicAddrSpace AddrSpace,
1237 bool IsCrossAddrSpaceOrdering,
1238 Position Pos) const {
1239 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1240 IsCrossAddrSpaceOrdering, Pos);
1241 }
1242
insertAcquire(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,Position Pos) const1243 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1244 SIAtomicScope Scope,
1245 SIAtomicAddrSpace AddrSpace,
1246 Position Pos) const {
1247 if (!InsertCacheInv)
1248 return false;
1249
1250 bool Changed = false;
1251
1252 MachineBasicBlock &MBB = *MI->getParent();
1253 DebugLoc DL = MI->getDebugLoc();
1254
1255 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1256
1257 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1258 ? AMDGPU::BUFFER_WBINVL1
1259 : AMDGPU::BUFFER_WBINVL1_VOL;
1260
1261 if (Pos == Position::AFTER)
1262 ++MI;
1263
1264 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1265 switch (Scope) {
1266 case SIAtomicScope::SYSTEM:
1267 case SIAtomicScope::AGENT:
1268 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1269 Changed = true;
1270 break;
1271 case SIAtomicScope::WORKGROUP:
1272 case SIAtomicScope::WAVEFRONT:
1273 case SIAtomicScope::SINGLETHREAD:
1274 // No cache to invalidate.
1275 break;
1276 default:
1277 llvm_unreachable("Unsupported synchronization scope");
1278 }
1279 }
1280
1281 /// The scratch address space does not need the global memory cache
1282 /// to be flushed as all memory operations by the same thread are
1283 /// sequentially consistent, and no other thread can access scratch
1284 /// memory.
1285
1286 /// Other address spaces do not have a cache.
1287
1288 if (Pos == Position::AFTER)
1289 --MI;
1290
1291 return Changed;
1292 }
1293
enableLoadCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1294 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1295 const MachineBasicBlock::iterator &MI,
1296 SIAtomicScope Scope,
1297 SIAtomicAddrSpace AddrSpace) const {
1298 assert(MI->mayLoad() && !MI->mayStore());
1299 bool Changed = false;
1300
1301 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1302 switch (Scope) {
1303 case SIAtomicScope::SYSTEM:
1304 case SIAtomicScope::AGENT:
1305 // Set the L1 cache policy to MISS_LRU.
1306 // Note: there is no L2 cache bypass policy at the ISA level.
1307 Changed |= enableGLCBit(MI);
1308 break;
1309 case SIAtomicScope::WORKGROUP:
1310 // In threadgroup split mode the waves of a work-group can be executing on
1311 // different CUs. Therefore need to bypass the L1 which is per CU.
1312 // Otherwise in non-threadgroup split mode all waves of a work-group are
1313 // on the same CU, and so the L1 does not need to be bypassed.
1314 if (ST.isTgSplitEnabled())
1315 Changed |= enableGLCBit(MI);
1316 break;
1317 case SIAtomicScope::WAVEFRONT:
1318 case SIAtomicScope::SINGLETHREAD:
1319 // No cache to bypass.
1320 break;
1321 default:
1322 llvm_unreachable("Unsupported synchronization scope");
1323 }
1324 }
1325
1326 /// The scratch address space does not need the global memory caches
1327 /// to be bypassed as all memory operations by the same thread are
1328 /// sequentially consistent, and no other thread can access scratch
1329 /// memory.
1330
1331 /// Other address spaces do not have a cache.
1332
1333 return Changed;
1334 }
1335
enableStoreCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1336 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1337 const MachineBasicBlock::iterator &MI,
1338 SIAtomicScope Scope,
1339 SIAtomicAddrSpace AddrSpace) const {
1340 assert(!MI->mayLoad() && MI->mayStore());
1341 bool Changed = false;
1342
1343 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1344 switch (Scope) {
1345 case SIAtomicScope::SYSTEM:
1346 case SIAtomicScope::AGENT:
1347 /// Do not set glc for store atomic operations as they implicitly write
1348 /// through the L1 cache.
1349 break;
1350 case SIAtomicScope::WORKGROUP:
1351 case SIAtomicScope::WAVEFRONT:
1352 case SIAtomicScope::SINGLETHREAD:
1353 // No cache to bypass. Store atomics implicitly write through the L1
1354 // cache.
1355 break;
1356 default:
1357 llvm_unreachable("Unsupported synchronization scope");
1358 }
1359 }
1360
1361 /// The scratch address space does not need the global memory caches
1362 /// to be bypassed as all memory operations by the same thread are
1363 /// sequentially consistent, and no other thread can access scratch
1364 /// memory.
1365
1366 /// Other address spaces do not have a cache.
1367
1368 return Changed;
1369 }
1370
enableRMWCacheBypass(const MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace) const1371 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1372 const MachineBasicBlock::iterator &MI,
1373 SIAtomicScope Scope,
1374 SIAtomicAddrSpace AddrSpace) const {
1375 assert(MI->mayLoad() && MI->mayStore());
1376 bool Changed = false;
1377
1378 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1379 switch (Scope) {
1380 case SIAtomicScope::SYSTEM:
1381 case SIAtomicScope::AGENT:
1382 /// Do not set glc for RMW atomic operations as they implicitly bypass
1383 /// the L1 cache, and the glc bit is instead used to indicate if they are
1384 /// return or no-return.
1385 break;
1386 case SIAtomicScope::WORKGROUP:
1387 case SIAtomicScope::WAVEFRONT:
1388 case SIAtomicScope::SINGLETHREAD:
1389 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1390 break;
1391 default:
1392 llvm_unreachable("Unsupported synchronization scope");
1393 }
1394 }
1395
1396 return Changed;
1397 }
1398
enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator & MI,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsVolatile,bool IsNonTemporal,bool IsLastUse=false) const1399 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1400 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1401 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1402 // Only handle load and store, not atomic read-modify-write insructions. The
1403 // latter use glc to indicate if the atomic returns a result and so must not
1404 // be used for cache control.
1405 assert(MI->mayLoad() ^ MI->mayStore());
1406
1407 // Only update load and store, not LLVM IR atomic read-modify-write
1408 // instructions. The latter are always marked as volatile so cannot sensibly
1409 // handle it as do not want to pessimize all atomics. Also they do not support
1410 // the nontemporal attribute.
1411 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1412
1413 bool Changed = false;
1414
1415 if (IsVolatile) {
1416 // Set L1 cache policy to be MISS_EVICT for load instructions
1417 // and MISS_LRU for store instructions.
1418 // Note: there is no L2 cache bypass policy at the ISA level.
1419 if (Op == SIMemOp::LOAD)
1420 Changed |= enableGLCBit(MI);
1421
1422 // Ensure operation has completed at system scope to cause all volatile
1423 // operations to be visible outside the program in a global order. Do not
1424 // request cross address space as only the global address space can be
1425 // observable outside the program, so no need to cause a waitcnt for LDS
1426 // address space operations.
1427 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1428 Position::AFTER);
1429
1430 return Changed;
1431 }
1432
1433 if (IsNonTemporal) {
1434 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1435 // for both loads and stores, and the L2 cache policy to STREAM.
1436 Changed |= enableGLCBit(MI);
1437 Changed |= enableSLCBit(MI);
1438 return Changed;
1439 }
1440
1441 return Changed;
1442 }
1443
insertWait(MachineBasicBlock::iterator & MI,SIAtomicScope Scope,SIAtomicAddrSpace AddrSpace,SIMemOp Op,bool IsCrossAddrSpaceOrdering,Position Pos) const1444 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1445 SIAtomicScope Scope,
1446 SIAtomicAddrSpace AddrSpace,
1447 SIMemOp Op,
1448 bool IsCrossAddrSpaceOrdering,
1449 Position Pos) const {
1450 if (ST.isTgSplitEnabled()) {
1451 // In threadgroup split mode the waves of a work-group can be executing on
1452 // different CUs. Therefore need to wait for global or GDS memory operations
1453 // to complete to ensure they are visible to waves in the other CUs.
1454 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1455     // the same CU, so there is no need to wait for global memory as all waves
1456     // in the work-group access the same L1, nor to wait for GDS as accesses
1457     // are ordered on a CU.
1458 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1459 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1460 (Scope == SIAtomicScope::WORKGROUP)) {
1461 // Same as GFX7 using agent scope.
1462 Scope = SIAtomicScope::AGENT;
1463 }
1464 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1465 // LDS memory operations.
1466 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1467 }
1468 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1469 IsCrossAddrSpaceOrdering, Pos);
1470 }
1471
1472 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1473 SIAtomicScope Scope,
1474 SIAtomicAddrSpace AddrSpace,
1475 Position Pos) const {
1476 if (!InsertCacheInv)
1477 return false;
1478
1479 bool Changed = false;
1480
1481 MachineBasicBlock &MBB = *MI->getParent();
1482 DebugLoc DL = MI->getDebugLoc();
1483
1484 if (Pos == Position::AFTER)
1485 ++MI;
1486
1487 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1488 switch (Scope) {
1489 case SIAtomicScope::SYSTEM:
1490 // Ensures that following loads will not see stale remote VMEM data or
1491 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1492 // CC will never be stale due to the local memory probes.
1493 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1494 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1495 // hardware does not reorder memory operations by the same wave with
1496 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1497 // remove any cache lines of earlier writes by the same wave and ensures
1498 // later reads by the same wave will refetch the cache lines.
1499 Changed = true;
1500 break;
1501 case SIAtomicScope::AGENT:
1502 // Same as GFX7.
1503 break;
1504 case SIAtomicScope::WORKGROUP:
1505 // In threadgroup split mode the waves of a work-group can be executing on
1506 // different CUs. Therefore need to invalidate the L1 which is per CU.
1507 // Otherwise in non-threadgroup split mode all waves of a work-group are
1508 // on the same CU, and so the L1 does not need to be invalidated.
1509 if (ST.isTgSplitEnabled()) {
1510 // Same as GFX7 using agent scope.
1511 Scope = SIAtomicScope::AGENT;
1512 }
1513 break;
1514 case SIAtomicScope::WAVEFRONT:
1515 case SIAtomicScope::SINGLETHREAD:
1516 // Same as GFX7.
1517 break;
1518 default:
1519 llvm_unreachable("Unsupported synchronization scope");
1520 }
1521 }
1522
1523 /// The scratch address space does not need the global memory cache
1524 /// to be flushed as all memory operations by the same thread are
1525 /// sequentially consistent, and no other thread can access scratch
1526 /// memory.
1527
1528 /// Other address spaces do not have a cache.
1529
1530 if (Pos == Position::AFTER)
1531 --MI;
1532
1533 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1534
1535 return Changed;
1536 }
1537
1538 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1539 SIAtomicScope Scope,
1540 SIAtomicAddrSpace AddrSpace,
1541 bool IsCrossAddrSpaceOrdering,
1542 Position Pos) const {
1543 bool Changed = false;
1544
1545 MachineBasicBlock &MBB = *MI->getParent();
1546 const DebugLoc &DL = MI->getDebugLoc();
1547
1548 if (Pos == Position::AFTER)
1549 ++MI;
1550
1551 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1552 switch (Scope) {
1553 case SIAtomicScope::SYSTEM:
1554 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1555 // hardware does not reorder memory operations by the same wave with
1556 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1557 // to initiate writeback of any dirty cache lines of earlier writes by the
1558 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1559 // writeback has completed.
1560 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1561 // Set SC bits to indicate system scope.
1562 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1563 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1564 // vmcnt(0)" needed by the "BUFFER_WBL2".
1565 Changed = true;
1566 break;
1567 case SIAtomicScope::AGENT:
1568 case SIAtomicScope::WORKGROUP:
1569 case SIAtomicScope::WAVEFRONT:
1570 case SIAtomicScope::SINGLETHREAD:
1571 // Same as GFX7.
1572 break;
1573 default:
1574 llvm_unreachable("Unsupported synchronization scope");
1575 }
1576 }
1577
1578 if (Pos == Position::AFTER)
1579 --MI;
1580
1581 Changed |=
1582 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1583 IsCrossAddrSpaceOrdering, Pos);
1584
1585 return Changed;
1586 }
1587
1588 bool SIGfx940CacheControl::enableLoadCacheBypass(
1589 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1590 SIAtomicAddrSpace AddrSpace) const {
1591 assert(MI->mayLoad() && !MI->mayStore());
1592 bool Changed = false;
1593
1594 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1595 switch (Scope) {
1596 case SIAtomicScope::SYSTEM:
1597 // Set SC bits to indicate system scope.
1598 Changed |= enableSC0Bit(MI);
1599 Changed |= enableSC1Bit(MI);
1600 break;
1601 case SIAtomicScope::AGENT:
1602 // Set SC bits to indicate agent scope.
1603 Changed |= enableSC1Bit(MI);
1604 break;
1605 case SIAtomicScope::WORKGROUP:
1606 // In threadgroup split mode the waves of a work-group can be executing on
1607 // different CUs. Therefore need to bypass the L1 which is per CU.
1608 // Otherwise in non-threadgroup split mode all waves of a work-group are
1609 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1610 // bits to indicate work-group scope will do this automatically.
1611 Changed |= enableSC0Bit(MI);
1612 break;
1613 case SIAtomicScope::WAVEFRONT:
1614 case SIAtomicScope::SINGLETHREAD:
1615 // Leave SC bits unset to indicate wavefront scope.
1616 break;
1617 default:
1618 llvm_unreachable("Unsupported synchronization scope");
1619 }
1620 }
1621
1622 /// The scratch address space does not need the global memory caches
1623 /// to be bypassed as all memory operations by the same thread are
1624 /// sequentially consistent, and no other thread can access scratch
1625 /// memory.
1626
1627 /// Other address spaces do not have a cache.
1628
1629 return Changed;
1630 }
1631
1632 bool SIGfx940CacheControl::enableStoreCacheBypass(
1633 const MachineBasicBlock::iterator &MI,
1634 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1635 assert(!MI->mayLoad() && MI->mayStore());
1636 bool Changed = false;
1637
1638 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1639 switch (Scope) {
1640 case SIAtomicScope::SYSTEM:
1641 // Set SC bits to indicate system scope.
1642 Changed |= enableSC0Bit(MI);
1643 Changed |= enableSC1Bit(MI);
1644 break;
1645 case SIAtomicScope::AGENT:
1646 // Set SC bits to indicate agent scope.
1647 Changed |= enableSC1Bit(MI);
1648 break;
1649 case SIAtomicScope::WORKGROUP:
1650 // Set SC bits to indicate workgroup scope.
1651 Changed |= enableSC0Bit(MI);
1652 break;
1653 case SIAtomicScope::WAVEFRONT:
1654 case SIAtomicScope::SINGLETHREAD:
1655 // Leave SC bits unset to indicate wavefront scope.
1656 break;
1657 default:
1658 llvm_unreachable("Unsupported synchronization scope");
1659 }
1660 }
1661
1662 /// The scratch address space does not need the global memory caches
1663 /// to be bypassed as all memory operations by the same thread are
1664 /// sequentially consistent, and no other thread can access scratch
1665 /// memory.
1666
1667 /// Other address spaces do not have a cache.
1668
1669 return Changed;
1670 }
1671
1672 bool SIGfx940CacheControl::enableRMWCacheBypass(
1673 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1674 SIAtomicAddrSpace AddrSpace) const {
1675 assert(MI->mayLoad() && MI->mayStore());
1676 bool Changed = false;
1677
1678 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1679 switch (Scope) {
1680 case SIAtomicScope::SYSTEM:
1681 // Set SC1 bit to indicate system scope.
1682 Changed |= enableSC1Bit(MI);
1683 break;
1684 case SIAtomicScope::AGENT:
1685 case SIAtomicScope::WORKGROUP:
1686 case SIAtomicScope::WAVEFRONT:
1687 case SIAtomicScope::SINGLETHREAD:
1688 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1689 // to indicate system or agent scope. The SC0 bit is used to indicate if
1690 // they are return or no-return. Leave SC1 bit unset to indicate agent
1691 // scope.
1692 break;
1693 default:
1694 llvm_unreachable("Unsupported synchronization scope");
1695 }
1696 }
1697
1698 return Changed;
1699 }
1700
1701 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1702 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1703 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1704   // Only handle load and store, not atomic read-modify-write instructions. The
1705   // latter use glc to indicate if the atomic returns a result, so that bit must
1706   // not be reused for cache control.
1707 assert(MI->mayLoad() ^ MI->mayStore());
1708
1709   // Only update load and store, not LLVM IR atomic read-modify-write
1710   // instructions. The latter are always marked as volatile, so handling the
1711   // volatile flag for them would pessimize all atomics. They also do not
1712   // support the nontemporal attribute.
1713 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1714
1715 bool Changed = false;
1716
1717 if (IsVolatile) {
1718 // Set SC bits to indicate system scope.
1719 Changed |= enableSC0Bit(MI);
1720 Changed |= enableSC1Bit(MI);
1721
1722 // Ensure operation has completed at system scope to cause all volatile
1723 // operations to be visible outside the program in a global order. Do not
1724 // request cross address space as only the global address space can be
1725 // observable outside the program, so no need to cause a waitcnt for LDS
1726 // address space operations.
1727 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1728 Position::AFTER);
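    // Illustrative result (a sketch): a volatile global load on GFX940 becomes
    // roughly
    //   global_load_dword v0, v[0:1], off sc0 sc1
    //   s_waitcnt vmcnt(0)
    // forcing the access to system scope and then waiting for it to complete.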
1729
1730 return Changed;
1731 }
1732
1733 if (IsNonTemporal) {
1734 Changed |= enableNTBit(MI);
1735 return Changed;
1736 }
1737
1738 return Changed;
1739 }
1740
1741 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1742 SIAtomicScope Scope,
1743 SIAtomicAddrSpace AddrSpace,
1744 Position Pos) const {
1745 if (!InsertCacheInv)
1746 return false;
1747
1748 bool Changed = false;
1749
1750 MachineBasicBlock &MBB = *MI->getParent();
1751 DebugLoc DL = MI->getDebugLoc();
1752
1753 if (Pos == Position::AFTER)
1754 ++MI;
1755
1756 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1757 switch (Scope) {
1758 case SIAtomicScope::SYSTEM:
1759 // Ensures that following loads will not see stale remote VMEM data or
1760 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1761 // CC will never be stale due to the local memory probes.
1762 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1763 // Set SC bits to indicate system scope.
1764 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1765 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1766 // hardware does not reorder memory operations by the same wave with
1767 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1768 // remove any cache lines of earlier writes by the same wave and ensures
1769 // later reads by the same wave will refetch the cache lines.
1770 Changed = true;
1771 break;
1772 case SIAtomicScope::AGENT:
1773       // Ensures that following loads will not see stale remote data or local
1774 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1775 // due to the memory probes.
1776 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1777 // Set SC bits to indicate agent scope.
1778 .addImm(AMDGPU::CPol::SC1);
1779       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1780       // does not reorder memory operations with respect to the preceding buffer
1781       // invalidate. The invalidate is guaranteed to remove any cache lines of
1782       // earlier writes and ensures later reads will refetch the cache lines.
1783 Changed = true;
1784 break;
1785 case SIAtomicScope::WORKGROUP:
1786 // In threadgroup split mode the waves of a work-group can be executing on
1787 // different CUs. Therefore need to invalidate the L1 which is per CU.
1788 // Otherwise in non-threadgroup split mode all waves of a work-group are
1789 // on the same CU, and so the L1 does not need to be invalidated.
1790 if (ST.isTgSplitEnabled()) {
1791         // Ensures L1 is invalidated if in threadgroup split mode. In
1792         // non-threadgroup split mode it is a NOP, but there is no point
1793         // generating it when we know we are not in that mode.
1794 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1795 // Set SC bits to indicate work-group scope.
1796 .addImm(AMDGPU::CPol::SC0);
1797         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1798         // does not reorder memory operations with respect to the preceding buffer
1799         // invalidate. The invalidate is guaranteed to remove any cache lines of
1800         // earlier writes and ensures later reads will refetch the cache lines.
1801 Changed = true;
1802 }
1803 break;
1804 case SIAtomicScope::WAVEFRONT:
1805 case SIAtomicScope::SINGLETHREAD:
1806 // Could generate "BUFFER_INV" but it would do nothing as there are no
1807 // caches to invalidate.
1808 break;
1809 default:
1810 llvm_unreachable("Unsupported synchronization scope");
1811 }
1812 }
1813
1814 /// The scratch address space does not need the global memory cache
1815 /// to be flushed as all memory operations by the same thread are
1816 /// sequentially consistent, and no other thread can access scratch
1817 /// memory.
1818
1819 /// Other address spaces do not have a cache.
1820
1821 if (Pos == Position::AFTER)
1822 --MI;
1823
1824 return Changed;
1825 }
1826
1827 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1828 SIAtomicScope Scope,
1829 SIAtomicAddrSpace AddrSpace,
1830 bool IsCrossAddrSpaceOrdering,
1831 Position Pos) const {
1832 bool Changed = false;
1833
1834 MachineBasicBlock &MBB = *MI->getParent();
1835 DebugLoc DL = MI->getDebugLoc();
1836
1837 if (Pos == Position::AFTER)
1838 ++MI;
1839
1840 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1841 switch (Scope) {
1842 case SIAtomicScope::SYSTEM:
1843 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1844 // hardware does not reorder memory operations by the same wave with
1845 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1846 // to initiate writeback of any dirty cache lines of earlier writes by the
1847 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1848 // writeback has completed.
1849 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1850 // Set SC bits to indicate system scope.
1851 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1852 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1853 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1854 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1855 Changed = true;
1856 break;
1857 case SIAtomicScope::AGENT:
1858 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1859 // Set SC bits to indicate agent scope.
1860 .addImm(AMDGPU::CPol::SC1);
1861
1862 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1863 // SIAtomicScope::AGENT, the following insertWait will generate the
1864 // required "S_WAITCNT vmcnt(0)".
1865 Changed = true;
1866 break;
1867 case SIAtomicScope::WORKGROUP:
1868 case SIAtomicScope::WAVEFRONT:
1869 case SIAtomicScope::SINGLETHREAD:
1870 // Do not generate "BUFFER_WBL2" as there are no caches it would
1871 // writeback, and would require an otherwise unnecessary
1872 // "S_WAITCNT vmcnt(0)".
1873 break;
1874 default:
1875 llvm_unreachable("Unsupported synchronization scope");
1876 }
1877 }
1878
1879 if (Pos == Position::AFTER)
1880 --MI;
1881
1882   // Insert the S_WAITCNT needed by any "BUFFER_WBL2" above, as well as any
1883   // other required S_WAITCNT.
1884 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1885 IsCrossAddrSpaceOrdering, Pos);
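  // Illustrative expansion for an agent-scope release (a sketch):
  //   buffer_wbl2 sc1
  //   s_waitcnt vmcnt(0)
  // where the s_waitcnt comes from the insertWait call above.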
1886
1887 return Changed;
1888 }
1889
1890 bool SIGfx10CacheControl::enableLoadCacheBypass(
1891 const MachineBasicBlock::iterator &MI,
1892 SIAtomicScope Scope,
1893 SIAtomicAddrSpace AddrSpace) const {
1894 assert(MI->mayLoad() && !MI->mayStore());
1895 bool Changed = false;
1896
1897 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1898 switch (Scope) {
1899 case SIAtomicScope::SYSTEM:
1900 case SIAtomicScope::AGENT:
1901 // Set the L0 and L1 cache policies to MISS_EVICT.
1902 // Note: there is no L2 cache coherent bypass control at the ISA level.
1903 Changed |= enableGLCBit(MI);
1904 Changed |= enableDLCBit(MI);
1905 break;
1906 case SIAtomicScope::WORKGROUP:
1907 // In WGP mode the waves of a work-group can be executing on either CU of
1908 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1909 // CU mode all waves of a work-group are on the same CU, and so the L0
1910 // does not need to be bypassed.
1911 if (!ST.isCuModeEnabled())
1912 Changed |= enableGLCBit(MI);
1913 break;
1914 case SIAtomicScope::WAVEFRONT:
1915 case SIAtomicScope::SINGLETHREAD:
1916 // No cache to bypass.
1917 break;
1918 default:
1919 llvm_unreachable("Unsupported synchronization scope");
1920 }
1921 }
1922
1923 /// The scratch address space does not need the global memory caches
1924 /// to be bypassed as all memory operations by the same thread are
1925 /// sequentially consistent, and no other thread can access scratch
1926 /// memory.
1927
1928 /// Other address spaces do not have a cache.
1929
1930 return Changed;
1931 }
1932
1933 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1934 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1935 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1936
1937   // Only handle load and store, not atomic read-modify-write instructions. The
1938   // latter use glc to indicate if the atomic returns a result, so that bit must
1939   // not be reused for cache control.
1940 assert(MI->mayLoad() ^ MI->mayStore());
1941
1942   // Only update load and store, not LLVM IR atomic read-modify-write
1943   // instructions. The latter are always marked as volatile, so handling the
1944   // volatile flag for them would pessimize all atomics. They also do not
1945   // support the nontemporal attribute.
1946 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1947
1948 bool Changed = false;
1949
1950 if (IsVolatile) {
1951 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1952 // and MISS_LRU for store instructions.
1953 // Note: there is no L2 cache coherent bypass control at the ISA level.
1954 if (Op == SIMemOp::LOAD) {
1955 Changed |= enableGLCBit(MI);
1956 Changed |= enableDLCBit(MI);
1957 }
1958
1959 // Ensure operation has completed at system scope to cause all volatile
1960 // operations to be visible outside the program in a global order. Do not
1961 // request cross address space as only the global address space can be
1962 // observable outside the program, so no need to cause a waitcnt for LDS
1963 // address space operations.
1964 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1965 Position::AFTER);
1966 return Changed;
1967 }
1968
1969 if (IsNonTemporal) {
1970 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1971 // and L2 cache policy to STREAM.
1972 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1973 // to MISS_EVICT and the L2 cache policy to STREAM.
1974 if (Op == SIMemOp::STORE)
1975 Changed |= enableGLCBit(MI);
1976 Changed |= enableSLCBit(MI);
1977
1978 return Changed;
1979 }
1980
1981 return Changed;
1982 }
1983
1984 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1985 SIAtomicScope Scope,
1986 SIAtomicAddrSpace AddrSpace,
1987 SIMemOp Op,
1988 bool IsCrossAddrSpaceOrdering,
1989 Position Pos) const {
1990 bool Changed = false;
1991
1992 MachineBasicBlock &MBB = *MI->getParent();
1993 DebugLoc DL = MI->getDebugLoc();
1994
1995 if (Pos == Position::AFTER)
1996 ++MI;
1997
1998 bool VMCnt = false;
1999 bool VSCnt = false;
2000 bool LGKMCnt = false;
2001
2002 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2003 SIAtomicAddrSpace::NONE) {
2004 switch (Scope) {
2005 case SIAtomicScope::SYSTEM:
2006 case SIAtomicScope::AGENT:
2007 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2008 VMCnt |= true;
2009 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2010 VSCnt |= true;
2011 break;
2012 case SIAtomicScope::WORKGROUP:
2013 // In WGP mode the waves of a work-group can be executing on either CU of
2014 // the WGP. Therefore need to wait for operations to complete to ensure
2015 // they are visible to waves in the other CU as the L0 is per CU.
2016       // Otherwise, in CU mode all waves of a work-group are on the same CU
2017       // and so share the same L0.
2018 if (!ST.isCuModeEnabled()) {
2019 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2020 VMCnt |= true;
2021 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2022 VSCnt |= true;
2023 }
2024 break;
2025 case SIAtomicScope::WAVEFRONT:
2026 case SIAtomicScope::SINGLETHREAD:
2027 // The L0 cache keeps all memory operations in order for
2028 // work-items in the same wavefront.
2029 break;
2030 default:
2031 llvm_unreachable("Unsupported synchronization scope");
2032 }
2033 }
2034
2035 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2036 switch (Scope) {
2037 case SIAtomicScope::SYSTEM:
2038 case SIAtomicScope::AGENT:
2039 case SIAtomicScope::WORKGROUP:
2040 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2041 // not needed as LDS operations for all waves are executed in a total
2042 // global ordering as observed by all waves. Required if also
2043 // synchronizing with global/GDS memory as LDS operations could be
2044 // reordered with respect to later global/GDS memory operations of the
2045 // same wave.
2046 LGKMCnt |= IsCrossAddrSpaceOrdering;
2047 break;
2048 case SIAtomicScope::WAVEFRONT:
2049 case SIAtomicScope::SINGLETHREAD:
2050 // The LDS keeps all memory operations in order for
2051 // the same wavefront.
2052 break;
2053 default:
2054 llvm_unreachable("Unsupported synchronization scope");
2055 }
2056 }
2057
2058 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2059 switch (Scope) {
2060 case SIAtomicScope::SYSTEM:
2061 case SIAtomicScope::AGENT:
2062       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
2063 // is not needed as GDS operations for all waves are executed in a total
2064 // global ordering as observed by all waves. Required if also
2065 // synchronizing with global/LDS memory as GDS operations could be
2066 // reordered with respect to later global/LDS memory operations of the
2067 // same wave.
2068 LGKMCnt |= IsCrossAddrSpaceOrdering;
2069 break;
2070 case SIAtomicScope::WORKGROUP:
2071 case SIAtomicScope::WAVEFRONT:
2072 case SIAtomicScope::SINGLETHREAD:
2073 // The GDS keeps all memory operations in order for
2074 // the same work-group.
2075 break;
2076 default:
2077 llvm_unreachable("Unsupported synchronization scope");
2078 }
2079 }
2080
2081 if (VMCnt || LGKMCnt) {
2082 unsigned WaitCntImmediate =
2083 AMDGPU::encodeWaitcnt(IV,
2084 VMCnt ? 0 : getVmcntBitMask(IV),
2085 getExpcntBitMask(IV),
2086 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2087 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2088 .addImm(WaitCntImmediate);
2089 Changed = true;
2090 }
2091
2092 if (VSCnt) {
2093 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2094 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2095 .addImm(0);
2096 Changed = true;
2097 }
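  // Illustrative output when VMCnt, LGKMCnt and VSCnt are all required (a
  // sketch):
  //   s_waitcnt vmcnt(0) lgkmcnt(0)
  //   s_waitcnt_vscnt null, 0x0
  // The _soft forms are pseudo instructions that are turned into real waitcnts
  // later in the pipeline.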
2098
2099 if (Pos == Position::AFTER)
2100 --MI;
2101
2102 return Changed;
2103 }
2104
2105 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2106 SIAtomicScope Scope,
2107 SIAtomicAddrSpace AddrSpace,
2108 Position Pos) const {
2109 if (!InsertCacheInv)
2110 return false;
2111
2112 bool Changed = false;
2113
2114 MachineBasicBlock &MBB = *MI->getParent();
2115 DebugLoc DL = MI->getDebugLoc();
2116
2117 if (Pos == Position::AFTER)
2118 ++MI;
2119
2120 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2121 switch (Scope) {
2122 case SIAtomicScope::SYSTEM:
2123 case SIAtomicScope::AGENT:
2124       // The order of invalidates matters here. We must invalidate "outer in"
2125 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2126 // invalidated.
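      // Sketch of the emitted sequence:
      //   buffer_gl1_inv
      //   buffer_gl0_inv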
2127 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2128 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2129 Changed = true;
2130 break;
2131 case SIAtomicScope::WORKGROUP:
2132 // In WGP mode the waves of a work-group can be executing on either CU of
2133 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2134       // in CU mode all waves of a work-group are on the same CU, and so the
2135 // L0 does not need to be invalidated.
2136 if (!ST.isCuModeEnabled()) {
2137 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2138 Changed = true;
2139 }
2140 break;
2141 case SIAtomicScope::WAVEFRONT:
2142 case SIAtomicScope::SINGLETHREAD:
2143 // No cache to invalidate.
2144 break;
2145 default:
2146 llvm_unreachable("Unsupported synchronization scope");
2147 }
2148 }
2149
2150 /// The scratch address space does not need the global memory cache
2151 /// to be flushed as all memory operations by the same thread are
2152 /// sequentially consistent, and no other thread can access scratch
2153 /// memory.
2154
2155 /// Other address spaces do not have a cache.
2156
2157 if (Pos == Position::AFTER)
2158 --MI;
2159
2160 return Changed;
2161 }
2162
2163 bool SIGfx11CacheControl::enableLoadCacheBypass(
2164 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2165 SIAtomicAddrSpace AddrSpace) const {
2166 assert(MI->mayLoad() && !MI->mayStore());
2167 bool Changed = false;
2168
2169 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2170 switch (Scope) {
2171 case SIAtomicScope::SYSTEM:
2172 case SIAtomicScope::AGENT:
2173 // Set the L0 and L1 cache policies to MISS_EVICT.
2174 // Note: there is no L2 cache coherent bypass control at the ISA level.
2175 Changed |= enableGLCBit(MI);
2176 break;
2177 case SIAtomicScope::WORKGROUP:
2178 // In WGP mode the waves of a work-group can be executing on either CU of
2179 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2180 // CU mode all waves of a work-group are on the same CU, and so the L0
2181 // does not need to be bypassed.
2182 if (!ST.isCuModeEnabled())
2183 Changed |= enableGLCBit(MI);
2184 break;
2185 case SIAtomicScope::WAVEFRONT:
2186 case SIAtomicScope::SINGLETHREAD:
2187 // No cache to bypass.
2188 break;
2189 default:
2190 llvm_unreachable("Unsupported synchronization scope");
2191 }
2192 }
2193
2194 /// The scratch address space does not need the global memory caches
2195 /// to be bypassed as all memory operations by the same thread are
2196 /// sequentially consistent, and no other thread can access scratch
2197 /// memory.
2198
2199 /// Other address spaces do not have a cache.
2200
2201 return Changed;
2202 }
2203
2204 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2205 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2206 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2207
2208   // Only handle load and store, not atomic read-modify-write instructions. The
2209   // latter use glc to indicate if the atomic returns a result, so that bit must
2210   // not be reused for cache control.
2211 assert(MI->mayLoad() ^ MI->mayStore());
2212
2213   // Only update load and store, not LLVM IR atomic read-modify-write
2214   // instructions. The latter are always marked as volatile, so handling the
2215   // volatile flag for them would pessimize all atomics. They also do not
2216   // support the nontemporal attribute.
2217 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2218
2219 bool Changed = false;
2220
2221 if (IsVolatile) {
2222 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2223 // and MISS_LRU for store instructions.
2224 // Note: there is no L2 cache coherent bypass control at the ISA level.
2225 if (Op == SIMemOp::LOAD)
2226 Changed |= enableGLCBit(MI);
2227
2228 // Set MALL NOALLOC for load and store instructions.
2229 Changed |= enableDLCBit(MI);
2230
2231 // Ensure operation has completed at system scope to cause all volatile
2232 // operations to be visible outside the program in a global order. Do not
2233 // request cross address space as only the global address space can be
2234 // observable outside the program, so no need to cause a waitcnt for LDS
2235 // address space operations.
2236 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2237 Position::AFTER);
2238 return Changed;
2239 }
2240
2241 if (IsNonTemporal) {
2242 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2243 // and L2 cache policy to STREAM.
2244 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2245 // to MISS_EVICT and the L2 cache policy to STREAM.
2246 if (Op == SIMemOp::STORE)
2247 Changed |= enableGLCBit(MI);
2248 Changed |= enableSLCBit(MI);
2249
2250 // Set MALL NOALLOC for load and store instructions.
2251 Changed |= enableDLCBit(MI);
2252 return Changed;
2253 }
2254
2255 return Changed;
2256 }
2257
2258 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2259 AMDGPU::CPol::CPol Value) const {
2260 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2261 if (!CPol)
2262 return false;
2263
2264 uint64_t NewTH = Value & AMDGPU::CPol::TH;
2265 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2266 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2267 return true;
2268 }
2269
2270 return false;
2271 }
2272
2273 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2274 AMDGPU::CPol::CPol Value) const {
2275 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2276 if (!CPol)
2277 return false;
2278
2279 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2280 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2281 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2282 return true;
2283 }
2284
2285 return false;
2286 }
2287
2288 bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2289 const MachineBasicBlock::iterator MI) const {
2290 // TODO: implement flag for frontend to give us a hint not to insert waits.
2291
2292 MachineBasicBlock &MBB = *MI->getParent();
2293 const DebugLoc &DL = MI->getDebugLoc();
2294
2295 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2296 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2297 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2298 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2299 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
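  // Illustrative emitted sequence (a sketch):
  //   s_wait_loadcnt 0x0
  //   s_wait_samplecnt 0x0
  //   s_wait_bvhcnt 0x0
  //   s_wait_kmcnt 0x0
  //   s_wait_storecnt 0x0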
2300
2301 return true;
2302 }
2303
2304 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2305 SIAtomicScope Scope,
2306 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2307 bool IsCrossAddrSpaceOrdering,
2308 Position Pos) const {
2309 bool Changed = false;
2310
2311 MachineBasicBlock &MBB = *MI->getParent();
2312 DebugLoc DL = MI->getDebugLoc();
2313
2314 bool LOADCnt = false;
2315 bool DSCnt = false;
2316 bool STORECnt = false;
2317
2318 if (Pos == Position::AFTER)
2319 ++MI;
2320
2321 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2322 SIAtomicAddrSpace::NONE) {
2323 switch (Scope) {
2324 case SIAtomicScope::SYSTEM:
2325 case SIAtomicScope::AGENT:
2326 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2327 LOADCnt |= true;
2328 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2329 STORECnt |= true;
2330 break;
2331 case SIAtomicScope::WORKGROUP:
2332 // In WGP mode the waves of a work-group can be executing on either CU of
2333 // the WGP. Therefore need to wait for operations to complete to ensure
2334 // they are visible to waves in the other CU as the L0 is per CU.
2335       // Otherwise, in CU mode all waves of a work-group are on the same CU
2336       // and so share the same L0.
2337 if (!ST.isCuModeEnabled()) {
2338 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2339 LOADCnt |= true;
2340 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2341 STORECnt |= true;
2342 }
2343 break;
2344 case SIAtomicScope::WAVEFRONT:
2345 case SIAtomicScope::SINGLETHREAD:
2346 // The L0 cache keeps all memory operations in order for
2347 // work-items in the same wavefront.
2348 break;
2349 default:
2350 llvm_unreachable("Unsupported synchronization scope");
2351 }
2352 }
2353
2354 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2355 switch (Scope) {
2356 case SIAtomicScope::SYSTEM:
2357 case SIAtomicScope::AGENT:
2358 case SIAtomicScope::WORKGROUP:
2359 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2360 // not needed as LDS operations for all waves are executed in a total
2361 // global ordering as observed by all waves. Required if also
2362 // synchronizing with global/GDS memory as LDS operations could be
2363 // reordered with respect to later global/GDS memory operations of the
2364 // same wave.
2365 DSCnt |= IsCrossAddrSpaceOrdering;
2366 break;
2367 case SIAtomicScope::WAVEFRONT:
2368 case SIAtomicScope::SINGLETHREAD:
2369 // The LDS keeps all memory operations in order for
2370 // the same wavefront.
2371 break;
2372 default:
2373 llvm_unreachable("Unsupported synchronization scope");
2374 }
2375 }
2376
2377 if (LOADCnt) {
2378 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2379 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2380 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2381 Changed = true;
2382 }
2383
2384 if (STORECnt) {
2385 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2386 Changed = true;
2387 }
2388
2389 if (DSCnt) {
2390 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2391 Changed = true;
2392 }
2393
2394 if (Pos == Position::AFTER)
2395 --MI;
2396
2397 return Changed;
2398 }
2399
2400 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2401 SIAtomicScope Scope,
2402 SIAtomicAddrSpace AddrSpace,
2403 Position Pos) const {
2404 if (!InsertCacheInv)
2405 return false;
2406
2407 MachineBasicBlock &MBB = *MI->getParent();
2408 DebugLoc DL = MI->getDebugLoc();
2409
2410 /// The scratch address space does not need the global memory cache
2411 /// to be flushed as all memory operations by the same thread are
2412 /// sequentially consistent, and no other thread can access scratch
2413 /// memory.
2414
2415 /// Other address spaces do not have a cache.
2416 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2417 return false;
2418
2419 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2420 switch (Scope) {
2421 case SIAtomicScope::SYSTEM:
2422 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2423 break;
2424 case SIAtomicScope::AGENT:
2425 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2426 break;
2427 case SIAtomicScope::WORKGROUP:
2428 // In WGP mode the waves of a work-group can be executing on either CU of
2429 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2430 // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2431 // the L0 does not need to be invalidated.
2432 if (ST.isCuModeEnabled())
2433 return false;
2434
2435 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2436 break;
2437 case SIAtomicScope::WAVEFRONT:
2438 case SIAtomicScope::SINGLETHREAD:
2439 // No cache to invalidate.
2440 return false;
2441 default:
2442 llvm_unreachable("Unsupported synchronization scope");
2443 }
2444
2445 if (Pos == Position::AFTER)
2446 ++MI;
2447
2448 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
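  // For example, an agent-scope acquire emits roughly "global_inv
  // scope:SCOPE_DEV" (a sketch; the printed scope operand follows ScopeImm).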
2449
2450 if (Pos == Position::AFTER)
2451 --MI;
2452
2453 return true;
2454 }
2455
2456 bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2457 SIAtomicScope Scope,
2458 SIAtomicAddrSpace AddrSpace,
2459 bool IsCrossAddrSpaceOrdering,
2460 Position Pos) const {
2461 MachineBasicBlock &MBB = *MI->getParent();
2462 DebugLoc DL = MI->getDebugLoc();
2463
2464 // The scratch address space does not need the global memory cache
2465 // writeback as all memory operations by the same thread are
2466 // sequentially consistent, and no other thread can access scratch
2467 // memory.
2468
2469 // Other address spaces do not have a cache.
2470 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2471 return false;
2472
2473 if (Pos == Position::AFTER)
2474 ++MI;
2475
2476 // GLOBAL_WB is always needed, even for write-through caches, as it
2477 // additionally ensures all operations have reached the desired cache level.
2478 bool SkipWB = false;
2479 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2480 switch (Scope) {
2481 case SIAtomicScope::SYSTEM:
2482 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2483 break;
2484 case SIAtomicScope::AGENT:
2485 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2486 break;
2487 case SIAtomicScope::WORKGROUP:
2488 // In WGP mode the waves of a work-group can be executing on either CU of
2489 // the WGP. Therefore we need to ensure all operations have reached L1,
2490 // hence the SCOPE_SE WB.
2491     // For CU mode, we need operations to reach L0, so the wait is enough;
2492     // there is no way for an operation to report completion without reaching
2493     // at least L0.
2494 if (ST.isCuModeEnabled())
2495 SkipWB = true;
2496 else
2497 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2498 break;
2499 case SIAtomicScope::WAVEFRONT:
2500 case SIAtomicScope::SINGLETHREAD:
2501 // No cache to invalidate.
2502 return false;
2503 default:
2504 llvm_unreachable("Unsupported synchronization scope");
2505 }
2506
2507 if (!SkipWB)
2508 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm);
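  // Illustrative expansion for a system-scope release (a sketch):
  //   global_wb scope:SCOPE_SYS
  // followed by the waits emitted by the insertWait call below.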
2509
2510 if (Pos == Position::AFTER)
2511 --MI;
2512
2513 // We always have to wait for previous memory operations (load/store) to
2514 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2515 // we of course need to wait for that as well.
2516 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2517 IsCrossAddrSpaceOrdering, Pos);
2518
2519 return true;
2520 }
2521
2522 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2523 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2524 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2525
2526 // Only handle load and store, not atomic read-modify-write instructions.
2527 assert(MI->mayLoad() ^ MI->mayStore());
2528
2529   // Only update load and store, not LLVM IR atomic read-modify-write
2530   // instructions. The latter are always marked as volatile, so handling the
2531   // volatile flag for them would pessimize all atomics. They also do not
2532   // support the nontemporal attribute.
2533 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2534
2535 bool Changed = false;
2536
2537 if (IsLastUse) {
2538 // Set last-use hint.
2539 Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2540 } else if (IsNonTemporal) {
2541 // Set non-temporal hint for all cache levels.
2542 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2543 }
2544
2545 if (IsVolatile) {
2546 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2547
2548 if (Op == SIMemOp::STORE)
2549 Changed |= insertWaitsBeforeSystemScopeStore(MI);
2550
2551 // Ensure operation has completed at system scope to cause all volatile
2552 // operations to be visible outside the program in a global order. Do not
2553 // request cross address space as only the global address space can be
2554 // observable outside the program, so no need to cause a waitcnt for LDS
2555 // address space operations.
2556 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2557 Position::AFTER);
2558 }
2559
2560 return Changed;
2561 }
2562
2563 bool SIGfx12CacheControl::expandSystemScopeStore(
2564 MachineBasicBlock::iterator &MI) const {
2565 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2566 if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2567 return insertWaitsBeforeSystemScopeStore(MI);
2568
2569 return false;
2570 }
2571
2572 bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2573 SIAtomicScope Scope,
2574 SIAtomicAddrSpace AddrSpace) const {
2575 bool Changed = false;
2576
2577 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2578 switch (Scope) {
2579 case SIAtomicScope::SYSTEM:
2580 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2581 break;
2582 case SIAtomicScope::AGENT:
2583 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2584 break;
2585 case SIAtomicScope::WORKGROUP:
2586       // In workgroup mode, SCOPE_SE is needed as waves can execute on
2587 // different CUs that access different L0s.
2588 if (!ST.isCuModeEnabled())
2589 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2590 break;
2591 case SIAtomicScope::WAVEFRONT:
2592 case SIAtomicScope::SINGLETHREAD:
2593 // No cache to bypass.
2594 break;
2595 default:
2596 llvm_unreachable("Unsupported synchronization scope");
2597 }
2598 }
2599
2600 // The scratch address space does not need the global memory caches
2601 // to be bypassed as all memory operations by the same thread are
2602 // sequentially consistent, and no other thread can access scratch
2603 // memory.
2604
2605 // Other address spaces do not have a cache.
2606
2607 return Changed;
2608 }
2609
2610 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2611 if (AtomicPseudoMIs.empty())
2612 return false;
2613
2614 for (auto &MI : AtomicPseudoMIs)
2615 MI->eraseFromParent();
2616
2617 AtomicPseudoMIs.clear();
2618 return true;
2619 }
2620
2621 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2622 MachineBasicBlock::iterator &MI) {
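  // For example (a sketch), a sequentially consistent agent-scope load is
  // expanded below into: a wait before the load, the load itself with
  // cache-bypass bits set, a wait for the load to complete, and a cache
  // invalidate, in that order.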
2623 assert(MI->mayLoad() && !MI->mayStore());
2624
2625 bool Changed = false;
2626
2627 if (MOI.isAtomic()) {
2628 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2629 MOI.getOrdering() == AtomicOrdering::Acquire ||
2630 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2631 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2632 MOI.getOrderingAddrSpace());
2633 }
2634
2635 if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2636 Changed |= CC->insertWait(MI, MOI.getScope(),
2637 MOI.getOrderingAddrSpace(),
2638 SIMemOp::LOAD | SIMemOp::STORE,
2639 MOI.getIsCrossAddressSpaceOrdering(),
2640 Position::BEFORE);
2641
2642 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2643 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2644 Changed |= CC->insertWait(MI, MOI.getScope(),
2645 MOI.getInstrAddrSpace(),
2646 SIMemOp::LOAD,
2647 MOI.getIsCrossAddressSpaceOrdering(),
2648 Position::AFTER);
2649 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2650 MOI.getOrderingAddrSpace(),
2651 Position::AFTER);
2652 }
2653
2654 return Changed;
2655 }
2656
2657 // Atomic instructions already bypass caches to the scope specified by the
2658 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2659 // instructions need additional treatment.
2660 Changed |= CC->enableVolatileAndOrNonTemporal(
2661 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2662 MOI.isNonTemporal(), MOI.isLastUse());
2663
2664 return Changed;
2665 }
2666
2667 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2668 MachineBasicBlock::iterator &MI) {
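  // For example (a sketch), a release store at agent scope is expanded below
  // into a release sequence (waits and, on some targets, a cache writeback)
  // inserted before the store, with cache-bypass bits set on the store itself.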
2669 assert(!MI->mayLoad() && MI->mayStore());
2670
2671 bool Changed = false;
2672
2673 if (MOI.isAtomic()) {
2674 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2675 MOI.getOrdering() == AtomicOrdering::Release ||
2676 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2677 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2678 MOI.getOrderingAddrSpace());
2679 }
2680
2681 if (MOI.getOrdering() == AtomicOrdering::Release ||
2682 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2683 Changed |= CC->insertRelease(MI, MOI.getScope(),
2684 MOI.getOrderingAddrSpace(),
2685 MOI.getIsCrossAddressSpaceOrdering(),
2686 Position::BEFORE);
2687
2688 return Changed;
2689 }
2690
2691 // Atomic instructions already bypass caches to the scope specified by the
2692 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2693 // need additional treatment.
2694 Changed |= CC->enableVolatileAndOrNonTemporal(
2695 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2696 MOI.isNonTemporal());
2697
2698   // GFX12 specific: the scope (desired coherence domain in the cache
2699   // hierarchy) is an instruction field; do not confuse it with the atomic scope.
2700 Changed |= CC->expandSystemScopeStore(MI);
2701 return Changed;
2702 }
2703
2704 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2705 MachineBasicBlock::iterator &MI) {
2706 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
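  // For example (a sketch), a seq_cst agent-scope fence is expanded into the
  // release half (waits plus, on some targets, a cache writeback) followed by
  // the acquire half (a cache invalidate); the ATOMIC_FENCE pseudo itself is
  // deleted later by removeAtomicPseudoMIs().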
2707
2708 AtomicPseudoMIs.push_back(MI);
2709 bool Changed = false;
2710
2711 // Refine fenced address space based on MMRAs.
2712 //
2713 // TODO: Should we support this MMRA on other atomic operations?
2714 auto OrderingAddrSpace =
2715 getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
2716
2717 if (MOI.isAtomic()) {
2718 if (MOI.getOrdering() == AtomicOrdering::Acquire)
2719 Changed |= CC->insertWait(
2720 MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2721 MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE);
2722
2723 if (MOI.getOrdering() == AtomicOrdering::Release ||
2724 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2725 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2726       /// TODO: This relies on a barrier always generating a waitcnt
2727       /// for LDS to ensure it is not reordered with the completion of
2728       /// the preceding LDS operations. If the barrier had a memory
2729       /// ordering and memory scope, then the library would not need to
2730       /// generate a fence. Could add support in this file for
2731       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2732       /// adding S_WAITCNT before an S_BARRIER.
2733 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2734 MOI.getIsCrossAddressSpaceOrdering(),
2735 Position::BEFORE);
2736
2737 // TODO: If both release and invalidate are happening they could be combined
2738 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2739 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2740 // track cache invalidate and write back instructions.
2741
2742 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2743 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2744 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2745 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2746 Position::BEFORE);
2747
2748 return Changed;
2749 }
2750
2751 return Changed;
2752 }
2753
2754 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2755 MachineBasicBlock::iterator &MI) {
2756 assert(MI->mayLoad() && MI->mayStore());
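  // For example (a sketch), an acq_rel agent-scope atomic RMW is expanded below
  // into a release sequence before the instruction, cache-bypass bits on the
  // instruction itself, and a wait plus cache invalidate after it.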
2757
2758 bool Changed = false;
2759
2760 if (MOI.isAtomic()) {
2761 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2762 MOI.getOrdering() == AtomicOrdering::Acquire ||
2763 MOI.getOrdering() == AtomicOrdering::Release ||
2764 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2765 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2766 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2767 MOI.getInstrAddrSpace());
2768 }
2769
2770 if (MOI.getOrdering() == AtomicOrdering::Release ||
2771 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2772 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2773 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2774 Changed |= CC->insertRelease(MI, MOI.getScope(),
2775 MOI.getOrderingAddrSpace(),
2776 MOI.getIsCrossAddressSpaceOrdering(),
2777 Position::BEFORE);
2778
2779 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2780 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2781 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2782 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2783 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2784 Changed |= CC->insertWait(MI, MOI.getScope(),
2785 MOI.getInstrAddrSpace(),
2786 isAtomicRet(*MI) ? SIMemOp::LOAD :
2787 SIMemOp::STORE,
2788 MOI.getIsCrossAddressSpaceOrdering(),
2789 Position::AFTER);
2790 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2791 MOI.getOrderingAddrSpace(),
2792 Position::AFTER);
2793 }
2794
2795 return Changed;
2796 }
2797
2798 return Changed;
2799 }
2800
2801 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2802 bool Changed = false;
2803
2804 const MachineModuleInfo &MMI =
2805 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2806
2807 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
2808 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2809
2810 for (auto &MBB : MF) {
2811 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2812
2813 // Unbundle instructions after the post-RA scheduler.
2814 if (MI->isBundle() && MI->mayLoadOrStore()) {
2815 MachineBasicBlock::instr_iterator II(MI->getIterator());
2816 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2817 I != E && I->isBundledWithPred(); ++I) {
2818 I->unbundleFromPred();
2819 for (MachineOperand &MO : I->operands())
2820 if (MO.isReg())
2821 MO.setIsInternalRead(false);
2822 }
2823
2824 MI->eraseFromParent();
2825 MI = II->getIterator();
2826 }
2827
2828 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2829 continue;
2830
2831 if (const auto &MOI = MOA.getLoadInfo(MI))
2832 Changed |= expandLoad(*MOI, MI);
2833 else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2834 Changed |= expandStore(*MOI, MI);
2835 Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2836 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2837 Changed |= expandAtomicFence(*MOI, MI);
2838 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2839 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2840 }
2841 }
2842
2843 Changed |= removeAtomicPseudoMIs();
2844 return Changed;
2845 }
2846
2847 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2848
2849 char SIMemoryLegalizer::ID = 0;
2850 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2851
2852 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2853 return new SIMemoryLegalizer();
2854 }
2855