1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
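/// For example (illustrative): before the result VGPR of a buffer_load is
/// read, the pass inserts an s_waitcnt vmcnt(N), with N chosen so that the
/// load is guaranteed to have completed.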
15 ///
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
23 //
24 //===----------------------------------------------------------------------===//
25
26 #include "AMDGPU.h"
27 #include "GCNSubtarget.h"
28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/MapVector.h"
32 #include "llvm/ADT/PostOrderIterator.h"
33 #include "llvm/ADT/Sequence.h"
34 #include "llvm/Analysis/AliasAnalysis.h"
35 #include "llvm/CodeGen/MachineLoopInfo.h"
36 #include "llvm/CodeGen/MachinePostDominators.h"
37 #include "llvm/InitializePasses.h"
38 #include "llvm/Support/DebugCounter.h"
39 #include "llvm/TargetParser/TargetParser.h"
40 using namespace llvm;
41
42 #define DEBUG_TYPE "si-insert-waitcnts"
43
44 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
45 "Force emit s_waitcnt expcnt(0) instrs");
46 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
47 "Force emit s_waitcnt lgkmcnt(0) instrs");
48 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
49 "Force emit s_waitcnt vmcnt(0) instrs");
50
51 static cl::opt<bool> ForceEmitZeroFlag(
52 "amdgpu-waitcnt-forcezero",
53 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
54 cl::init(false), cl::Hidden);
55
56 namespace {
57 // Class of object that encapsulates the latest instruction counter score
58 // associated with each operand. Used for determining whether an
59 // s_waitcnt instruction needs to be emitted.
60
61 enum InstCounterType {
62 LOAD_CNT = 0, // VMcnt prior to gfx12.
63   DS_CNT,     // LGKMcnt prior to gfx12.
64 EXP_CNT, //
65 STORE_CNT, // VScnt in gfx10/gfx11.
66 NUM_NORMAL_INST_CNTS,
67 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
68 BVH_CNT, // gfx12+ only.
69 KM_CNT, // gfx12+ only.
70 NUM_EXTENDED_INST_CNTS,
71 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
72 };
73 } // namespace
74
75 namespace llvm {
76 template <> struct enum_iteration_traits<InstCounterType> {
77 static constexpr bool is_iterable = true;
78 };
79 } // namespace llvm
80
81 namespace {
82 // Return an iterator over all counters between LOAD_CNT (the first counter)
83 // and \c MaxCounter (exclusive, default value yields an enumeration over
84 // all counters).
85 auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
86 return enum_seq(LOAD_CNT, MaxCounter);
87 }
88
89 using RegInterval = std::pair<int, int>;
90
91 struct HardwareLimits {
92 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
93 unsigned ExpcntMax;
94 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
95 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
96 unsigned SamplecntMax; // gfx12+ only.
97 unsigned BvhcntMax; // gfx12+ only.
98 unsigned KmcntMax; // gfx12+ only.
99 };
100
101 struct RegisterEncoding {
102 unsigned VGPR0;
103 unsigned VGPRL;
104 unsigned SGPR0;
105 unsigned SGPRL;
106 };
107
108 enum WaitEventType {
109 VMEM_ACCESS, // vector-memory read & write
110 VMEM_READ_ACCESS, // vector-memory read
111 VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
112 VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
113 VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
114 SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
115 LDS_ACCESS, // lds read & write
116 GDS_ACCESS, // gds read & write
117 SQ_MESSAGE, // send message
118 SMEM_ACCESS, // scalar-memory read & write
119 EXP_GPR_LOCK, // export holding on its data src
120 GDS_GPR_LOCK, // GDS holding on its data and addr src
121 EXP_POS_ACCESS, // write to export position
122 EXP_PARAM_ACCESS, // write to export parameter
123 VMW_GPR_LOCK, // vector-memory write holding on its data src
124 EXP_LDS_ACCESS, // read by ldsdir counting as export
125 NUM_WAIT_EVENTS,
126 };
127
128 // The mapping is:
129 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
130 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
131 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
132 // We reserve a fixed number of VGPR slots in the scoring tables for
133 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
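// For example (illustrative, following getRegInterval below): an SGPR with
// hardware register index N is tracked at slot NUM_ALL_VGPRS + N, after all
// real and extra VGPR slots.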
134 enum RegisterMapping {
135 SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
136 AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
137 SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
138 NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
139 // Artificial register slots to track LDS writes into specific LDS locations
140 // if a location is known. When slots are exhausted or the location is
141 // unknown, use the first slot. The first slot is also always updated, in
142 // addition to the known location's slot, to properly generate waits if a
143 // dependent instruction's location is unknown.
144 EXTRA_VGPR_LDS = 0,
145 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
146 };
147
148 // Enumerate different types of result-returning VMEM operations. Although
149 // s_waitcnt orders them all with a single vmcnt counter, in the absence of
150 // s_waitcnt only instructions of the same VmemType are guaranteed to write
151 // their results in order -- so there is no need to insert an s_waitcnt between
152 // two instructions of the same type that write the same vgpr.
153 enum VmemType {
154 // BUF instructions and MIMG instructions without a sampler.
155 VMEM_NOSAMPLER,
156 // MIMG instructions with a sampler.
157 VMEM_SAMPLER,
158 // BVH instructions
159 VMEM_BVH,
160 NUM_VMEM_TYPES
161 };
162
163 // Maps values of InstCounterType to the instruction that waits on that
164 // counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
165 // returns true.
166 static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
167 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
168 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
169 AMDGPU::S_WAIT_KMCNT};
170
171 static bool updateVMCntOnly(const MachineInstr &Inst) {
172 return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
173 SIInstrInfo::isFLATScratch(Inst);
174 }
175
176 #ifndef NDEBUG
177 static bool isNormalMode(InstCounterType MaxCounter) {
178 return MaxCounter == NUM_NORMAL_INST_CNTS;
179 }
180 #endif // NDEBUG
181
182 VmemType getVmemType(const MachineInstr &Inst) {
183 assert(updateVMCntOnly(Inst));
184 if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
185 !SIInstrInfo::isVSAMPLE(Inst))
186 return VMEM_NOSAMPLER;
187 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
188 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
189 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
190 // We have to make an additional check for isVSAMPLE here since some
191 // instructions don't have a sampler, but are still classified as sampler
192 // instructions for the purposes of e.g. waitcnt.
193 return BaseInfo->BVH ? VMEM_BVH
194 : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER
195 : VMEM_NOSAMPLER;
196 }
197
198 unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
199 switch (T) {
200 case LOAD_CNT:
201 return Wait.LoadCnt;
202 case EXP_CNT:
203 return Wait.ExpCnt;
204 case DS_CNT:
205 return Wait.DsCnt;
206 case STORE_CNT:
207 return Wait.StoreCnt;
208 case SAMPLE_CNT:
209 return Wait.SampleCnt;
210 case BVH_CNT:
211 return Wait.BvhCnt;
212 case KM_CNT:
213 return Wait.KmCnt;
214 default:
215 llvm_unreachable("bad InstCounterType");
216 }
217 }
218
219 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
220 unsigned &WC = getCounterRef(Wait, T);
221 WC = std::min(WC, Count);
222 }
223
224 void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
225 getCounterRef(Wait, T) = ~0u;
226 }
227
228 unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
229 return getCounterRef(Wait, T);
230 }
231
232 // Mapping from event to counter according to the table masks.
233 InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
234 for (auto T : inst_counter_types()) {
235 if (masks[T] & (1 << E))
236 return T;
237 }
238 llvm_unreachable("event type has no associated counter");
239 }
240
241 // This object maintains the current score brackets of each wait counter, and
242 // a per-register scoreboard for each wait counter.
243 //
244 // We also maintain the latest score for every event type that can change the
245 // waitcnt in order to know if there are multiple types of events within
246 // the brackets. When multiple event types are pending within the brackets,
247 // the wait count may be decremented out of order, so we need to insert an
248 // "s_waitcnt 0" before use.
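//
// Illustrative example: after three VMEM loads are issued, the LOAD_CNT
// bracket spans [LB, LB+3]. Applying a wait of vmcnt(1) raises LB to UB-1,
// i.e. at most one of those loads is still outstanding.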
249 class WaitcntBrackets {
250 public:
251 WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
252 HardwareLimits Limits, RegisterEncoding Encoding,
253 const unsigned *WaitEventMaskForInst,
254 InstCounterType SmemAccessCounter)
255 : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
256 Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
257 SmemAccessCounter(SmemAccessCounter) {}
258
259 unsigned getWaitCountMax(InstCounterType T) const {
260 switch (T) {
261 case LOAD_CNT:
262 return Limits.LoadcntMax;
263 case DS_CNT:
264 return Limits.DscntMax;
265 case EXP_CNT:
266 return Limits.ExpcntMax;
267 case STORE_CNT:
268 return Limits.StorecntMax;
269 case SAMPLE_CNT:
270 return Limits.SamplecntMax;
271 case BVH_CNT:
272 return Limits.BvhcntMax;
273 case KM_CNT:
274 return Limits.KmcntMax;
275 default:
276 break;
277 }
278 return 0;
279 }
280
281 unsigned getScoreLB(InstCounterType T) const {
282 assert(T < NUM_INST_CNTS);
283 return ScoreLBs[T];
284 }
285
286 unsigned getScoreUB(InstCounterType T) const {
287 assert(T < NUM_INST_CNTS);
288 return ScoreUBs[T];
289 }
290
291 unsigned getScoreRange(InstCounterType T) const {
292 return getScoreUB(T) - getScoreLB(T);
293 }
294
295 unsigned getRegScore(int GprNo, InstCounterType T) const {
296 if (GprNo < NUM_ALL_VGPRS) {
297 return VgprScores[T][GprNo];
298 }
299 assert(T == SmemAccessCounter);
300 return SgprScores[GprNo - NUM_ALL_VGPRS];
301 }
302
303 bool merge(const WaitcntBrackets &Other);
304
305 RegInterval getRegInterval(const MachineInstr *MI,
306 const MachineRegisterInfo *MRI,
307 const SIRegisterInfo *TRI, unsigned OpNo) const;
308
309 bool counterOutOfOrder(InstCounterType T) const;
310 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
311 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
312 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
313 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
314 void applyWaitcnt(InstCounterType T, unsigned Count);
315 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
316 const MachineRegisterInfo *MRI, WaitEventType E,
317 MachineInstr &MI);
318
319 unsigned hasPendingEvent() const { return PendingEvents; }
320 unsigned hasPendingEvent(WaitEventType E) const {
321 return PendingEvents & (1 << E);
322 }
323 unsigned hasPendingEvent(InstCounterType T) const {
324 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
325 assert((HasPending != 0) == (getScoreRange(T) != 0));
326 return HasPending;
327 }
328
329 bool hasMixedPendingEvents(InstCounterType T) const {
330 unsigned Events = hasPendingEvent(T);
331 // Return true if more than one bit is set in Events.
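// (Events & (Events - 1)) clears the lowest set bit, so the result is
// nonzero iff at least two event bits are set,
// e.g. 0b0110 & 0b0101 == 0b0100.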
332 return Events & (Events - 1);
333 }
334
335 bool hasPendingFlat() const {
336 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
337 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
338 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
339 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
340 }
341
342 void setPendingFlat() {
343 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
344 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
345 }
346
347 // Return true if there might be pending writes to the specified vgpr by VMEM
348 // instructions with types different from V.
349 bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
350 assert(GprNo < NUM_ALL_VGPRS);
351 return VgprVmemTypes[GprNo] & ~(1 << V);
352 }
353
354 void clearVgprVmemTypes(int GprNo) {
355 assert(GprNo < NUM_ALL_VGPRS);
356 VgprVmemTypes[GprNo] = 0;
357 }
358
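// Model an unknown number of outstanding stores at function entry or
// return: bump the STORE_CNT upper bound by the maximum counter value and
// mark all store events as pending.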
359 void setStateOnFunctionEntryOrReturn() {
360 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
361 PendingEvents |= WaitEventMaskForInst[STORE_CNT];
362 }
363
364 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
365 return LDSDMAStores;
366 }
367
368 void print(raw_ostream &);
369 void dump() { print(dbgs()); }
370
371 private:
372 struct MergeInfo {
373 unsigned OldLB;
374 unsigned OtherLB;
375 unsigned MyShift;
376 unsigned OtherShift;
377 };
378 static bool mergeScore(const MergeInfo &M, unsigned &Score,
379 unsigned OtherScore);
380
381 void setScoreLB(InstCounterType T, unsigned Val) {
382 assert(T < NUM_INST_CNTS);
383 ScoreLBs[T] = Val;
384 }
385
386 void setScoreUB(InstCounterType T, unsigned Val) {
387 assert(T < NUM_INST_CNTS);
388 ScoreUBs[T] = Val;
389
390 if (T != EXP_CNT)
391 return;
392
393 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
394 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
395 }
396
397 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
398 if (GprNo < NUM_ALL_VGPRS) {
399 VgprUB = std::max(VgprUB, GprNo);
400 VgprScores[T][GprNo] = Val;
401 } else {
402 assert(T == SmemAccessCounter);
403 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
404 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
405 }
406 }
407
408 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
409 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
410 unsigned OpNo, unsigned Val);
411
412 const GCNSubtarget *ST = nullptr;
413 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
414 HardwareLimits Limits = {};
415 RegisterEncoding Encoding = {};
416 const unsigned *WaitEventMaskForInst;
417 InstCounterType SmemAccessCounter;
418 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
419 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
420 unsigned PendingEvents = 0;
421 // Remember the last flat memory operation.
422 unsigned LastFlat[NUM_INST_CNTS] = {0};
423 // wait_cnt scores for every vgpr.
424 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
425 int VgprUB = -1;
426 int SgprUB = -1;
427 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
428 // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
429 // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
430 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
431 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
432 // write to each vgpr.
433 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
434 // Store representative LDS DMA operations. The only useful info here is
435 // alias info. One store is kept per unique AAInfo.
436 SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
437 };
438
439 // This abstracts the logic for generating and updating S_WAIT* instructions
440 // away from the analysis that determines where they are needed. This was
441 // done because the set of counters and instructions for waiting on them
442 // underwent a major shift with gfx12, sufficiently so that having this
443 // abstraction allows the main analysis logic to be simpler than it would
444 // otherwise have had to become.
445 class WaitcntGenerator {
446 protected:
447 const GCNSubtarget *ST = nullptr;
448 const SIInstrInfo *TII = nullptr;
449 AMDGPU::IsaVersion IV;
450 InstCounterType MaxCounter;
451 bool OptNone;
452
453 public:
454 WaitcntGenerator() = default;
455 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
456 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
457 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
458 OptNone(MF.getFunction().hasOptNone() ||
459 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
460
461 // Return true if the current function should be compiled with no
462 // optimization.
463 bool isOptNone() const { return OptNone; }
464
465 // Edits an existing sequence of wait count instructions according
466 // to an incoming Waitcnt value, which is itself updated to reflect
467 // any new wait count instructions which may need to be generated by
468 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
469 // were made.
470 //
471 // This editing will usually merely update operands, but it may also
472 // delete instructions if the incoming Wait value indicates they are not
473 // needed. It may also remove existing instructions for which a wait
474 // is needed if it can be determined that it is better to generate new
475 // instructions later, as can happen on gfx12.
476 virtual bool
477 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
478 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
479 MachineBasicBlock::instr_iterator It) const = 0;
480
481 // Transform a soft waitcnt into a normal one.
482 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
483
484 // Generates new wait count instructions according to the value of
485 // Wait, returning true if any new instructions were created.
486 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
487 MachineBasicBlock::instr_iterator It,
488 AMDGPU::Waitcnt Wait) = 0;
489
490 // Returns an array of bit masks which can be used to map values in
491 // WaitEventType to corresponding counter values in InstCounterType.
492 virtual const unsigned *getWaitEventMask() const = 0;
493
494 // Returns a new waitcnt with all counters except VScnt set to 0. If
495 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
496 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
497
498 virtual ~WaitcntGenerator() = default;
499
500 // Create a mask value from the initializer list of wait event types.
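// For example, eventMask({SMEM_ACCESS, SQ_MESSAGE}) evaluates to
// (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE).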
501 static constexpr unsigned
502 eventMask(std::initializer_list<WaitEventType> Events) {
503 unsigned Mask = 0;
504 for (auto &E : Events)
505 Mask |= 1 << E;
506
507 return Mask;
508 }
509 };
510
511 class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
512 public:
513 WaitcntGeneratorPreGFX12() = default;
514 WaitcntGeneratorPreGFX12(const MachineFunction &MF)
515 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
516
517 bool
518 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
519 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
520 MachineBasicBlock::instr_iterator It) const override;
521
522 bool createNewWaitcnt(MachineBasicBlock &Block,
523 MachineBasicBlock::instr_iterator It,
524 AMDGPU::Waitcnt Wait) override;
525
526 const unsigned *getWaitEventMask() const override {
527 assert(ST);
528
529 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
530 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
531 VMEM_BVH_READ_ACCESS}),
532 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
533 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
534 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
535 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
536 0,
537 0,
538 0};
539
540 return WaitEventMaskForInstPreGFX12;
541 }
542
543 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
544 };
545
546 class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
547 public:
548 WaitcntGeneratorGFX12Plus() = default;
549 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
550 InstCounterType MaxCounter)
551 : WaitcntGenerator(MF, MaxCounter) {}
552
553 bool
554 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
555 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
556 MachineBasicBlock::instr_iterator It) const override;
557
558 bool createNewWaitcnt(MachineBasicBlock &Block,
559 MachineBasicBlock::instr_iterator It,
560 AMDGPU::Waitcnt Wait) override;
561
562 const unsigned *getWaitEventMask() const override {
563 assert(ST);
564
565 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
566 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
567 eventMask({LDS_ACCESS, GDS_ACCESS}),
568 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
569 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
570 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
571 eventMask({VMEM_SAMPLER_READ_ACCESS}),
572 eventMask({VMEM_BVH_READ_ACCESS}),
573 eventMask({SMEM_ACCESS, SQ_MESSAGE})};
574
575 return WaitEventMaskForInstGFX12Plus;
576 }
577
578 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
579 };
580
581 class SIInsertWaitcnts : public MachineFunctionPass {
582 private:
583 const GCNSubtarget *ST = nullptr;
584 const SIInstrInfo *TII = nullptr;
585 const SIRegisterInfo *TRI = nullptr;
586 const MachineRegisterInfo *MRI = nullptr;
587
588 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
589 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
590 MachineLoopInfo *MLI;
591 MachinePostDominatorTree *PDT;
592 AliasAnalysis *AA = nullptr;
593
594 struct BlockInfo {
595 std::unique_ptr<WaitcntBrackets> Incoming;
596 bool Dirty = true;
597 };
598
599 InstCounterType SmemAccessCounter;
600
601 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
602
603 // ForceEmitZeroWaitcnts: force all waitcnt instrs to be s_waitcnt 0
604 // because of amdgpu-waitcnt-forcezero flag
605 bool ForceEmitZeroWaitcnts;
606 bool ForceEmitWaitcnt[NUM_INST_CNTS];
607
608 // In any given run of this pass, WCG will point to one of these two
609 // generator objects, which must have been re-initialised before use
610 // from a value made using a subtarget constructor.
611 WaitcntGeneratorPreGFX12 WCGPreGFX12;
612 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
613
614 WaitcntGenerator *WCG = nullptr;
615
616 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
617 // message.
618 DenseSet<MachineInstr *> ReleaseVGPRInsts;
619
620 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
621
622 public:
623 static char ID;
624
625 SIInsertWaitcnts() : MachineFunctionPass(ID) {
626 (void)ForceExpCounter;
627 (void)ForceLgkmCounter;
628 (void)ForceVMCounter;
629 }
630
631 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
632 bool isPreheaderToFlush(MachineBasicBlock &MBB,
633 WaitcntBrackets &ScoreBrackets);
634 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
635 bool runOnMachineFunction(MachineFunction &MF) override;
636
637 StringRef getPassName() const override {
638 return "SI insert wait instructions";
639 }
640
641 void getAnalysisUsage(AnalysisUsage &AU) const override {
642 AU.setPreservesCFG();
643 AU.addRequired<MachineLoopInfoWrapperPass>();
644 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
645 AU.addUsedIfAvailable<AAResultsWrapperPass>();
646 AU.addPreserved<AAResultsWrapperPass>();
647 MachineFunctionPass::getAnalysisUsage(AU);
648 }
649
650 bool isForceEmitWaitcnt() const {
651 for (auto T : inst_counter_types())
652 if (ForceEmitWaitcnt[T])
653 return true;
654 return false;
655 }
656
657 void setForceEmitWaitcnt() {
658 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
659 // For debug builds, get the debug counter info and adjust if need be
660 #ifndef NDEBUG
661 if (DebugCounter::isCounterSet(ForceExpCounter) &&
662 DebugCounter::shouldExecute(ForceExpCounter)) {
663 ForceEmitWaitcnt[EXP_CNT] = true;
664 } else {
665 ForceEmitWaitcnt[EXP_CNT] = false;
666 }
667
668 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
669 DebugCounter::shouldExecute(ForceLgkmCounter)) {
670 ForceEmitWaitcnt[DS_CNT] = true;
671 ForceEmitWaitcnt[KM_CNT] = true;
672 } else {
673 ForceEmitWaitcnt[DS_CNT] = false;
674 ForceEmitWaitcnt[KM_CNT] = false;
675 }
676
677 if (DebugCounter::isCounterSet(ForceVMCounter) &&
678 DebugCounter::shouldExecute(ForceVMCounter)) {
679 ForceEmitWaitcnt[LOAD_CNT] = true;
680 ForceEmitWaitcnt[SAMPLE_CNT] = true;
681 ForceEmitWaitcnt[BVH_CNT] = true;
682 } else {
683 ForceEmitWaitcnt[LOAD_CNT] = false;
684 ForceEmitWaitcnt[SAMPLE_CNT] = false;
685 ForceEmitWaitcnt[BVH_CNT] = false;
686 }
687 #endif // NDEBUG
688 }
689
690 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
691 // FLAT instruction.
692 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
693 // Maps VMEM access types to their corresponding WaitEventType.
694 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
695 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
696
697 assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
698 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
699 // these should use VM_CNT.
700 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
701 return VMEM_ACCESS;
702 if (Inst.mayStore() &&
703 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
704 // FLAT and SCRATCH instructions may access scratch. Other VMEM
705 // instructions do not.
706 if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
707 return SCRATCH_WRITE_ACCESS;
708 return VMEM_WRITE_ACCESS;
709 }
710 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
711 return VMEM_READ_ACCESS;
712 return VmemReadMapping[getVmemType(Inst)];
713 }
714
715 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
716 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
717 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
718 bool generateWaitcntInstBefore(MachineInstr &MI,
719 WaitcntBrackets &ScoreBrackets,
720 MachineInstr *OldWaitcntInstr,
721 bool FlushVmCnt);
722 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
723 MachineBasicBlock::instr_iterator It,
724 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
725 MachineInstr *OldWaitcntInstr);
726 void updateEventWaitcntAfter(MachineInstr &Inst,
727 WaitcntBrackets *ScoreBrackets);
728 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
729 WaitcntBrackets &ScoreBrackets);
730 };
731
732 } // end anonymous namespace
733
734 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
735 const MachineRegisterInfo *MRI,
736 const SIRegisterInfo *TRI,
737 unsigned OpNo) const {
738 const MachineOperand &Op = MI->getOperand(OpNo);
739 if (!TRI->isInAllocatableClass(Op.getReg()))
740 return {-1, -1};
741
742 // A use via a PW operand does not need a waitcnt.
743 // A partial write is not a WAW.
744 assert(!Op.getSubReg() || !Op.isUndef());
745
746 RegInterval Result;
747
748 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
749 AMDGPU::HWEncoding::REG_IDX_MASK;
750
751 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
752 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
753 Result.first = Reg - Encoding.VGPR0;
754 if (TRI->isAGPR(*MRI, Op.getReg()))
755 Result.first += AGPR_OFFSET;
756 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
757 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
758 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
759 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
760 assert(Result.first >= NUM_ALL_VGPRS &&
761 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
762 }
763 // TODO: Handle TTMP
764 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
765 else
766 return {-1, -1};
767
768 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
769 unsigned Size = TRI->getRegSizeInBits(*RC);
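// The interval covers one slot per 32-bit register of the operand; a 16-bit
// register still occupies a full slot.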
770 Result.second = Result.first + ((Size + 16) / 32);
771
772 return Result;
773 }
774
775 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
776 const SIInstrInfo *TII,
777 const SIRegisterInfo *TRI,
778 const MachineRegisterInfo *MRI, unsigned OpNo,
779 unsigned Val) {
780 RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
781 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
782 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
783 setRegScore(RegNo, EXP_CNT, Val);
784 }
785 }
786
787 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
788 const SIRegisterInfo *TRI,
789 const MachineRegisterInfo *MRI,
790 WaitEventType E, MachineInstr &Inst) {
791 InstCounterType T = eventCounter(WaitEventMaskForInst, E);
792
793 unsigned UB = getScoreUB(T);
794 unsigned CurrScore = UB + 1;
795 if (CurrScore == 0)
796 report_fatal_error("InsertWaitcnt score wraparound");
797 // PendingEvents and ScoreUB need to be updated regardless of whether this
798 // event changes the score of a register or not.
799 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
800 PendingEvents |= 1 << E;
801 setScoreUB(T, CurrScore);
802
803 if (T == EXP_CNT) {
804 // Put score on the source vgprs. If this is a store, just use those
805 // specific register(s).
806 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
807 int AddrOpIdx =
808 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
809 // All GDS operations must protect their address register (same as
810 // export.)
811 if (AddrOpIdx != -1) {
812 setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
813 }
814
815 if (Inst.mayStore()) {
816 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
817 setExpScore(
818 &Inst, TII, TRI, MRI,
819 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
820 CurrScore);
821 }
822 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
823 setExpScore(&Inst, TII, TRI, MRI,
824 AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
825 AMDGPU::OpName::data1),
826 CurrScore);
827 }
828 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
829 Inst.getOpcode() != AMDGPU::DS_APPEND &&
830 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
831 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
832 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
833 const MachineOperand &Op = Inst.getOperand(I);
834 if (Op.isReg() && !Op.isDef() &&
835 TRI->isVectorRegister(*MRI, Op.getReg())) {
836 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
837 }
838 }
839 }
840 } else if (TII->isFLAT(Inst)) {
841 if (Inst.mayStore()) {
842 setExpScore(
843 &Inst, TII, TRI, MRI,
844 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
845 CurrScore);
846 } else if (SIInstrInfo::isAtomicRet(Inst)) {
847 setExpScore(
848 &Inst, TII, TRI, MRI,
849 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
850 CurrScore);
851 }
852 } else if (TII->isMIMG(Inst)) {
853 if (Inst.mayStore()) {
854 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
855 } else if (SIInstrInfo::isAtomicRet(Inst)) {
856 setExpScore(
857 &Inst, TII, TRI, MRI,
858 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
859 CurrScore);
860 }
861 } else if (TII->isMTBUF(Inst)) {
862 if (Inst.mayStore()) {
863 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
864 }
865 } else if (TII->isMUBUF(Inst)) {
866 if (Inst.mayStore()) {
867 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
868 } else if (SIInstrInfo::isAtomicRet(Inst)) {
869 setExpScore(
870 &Inst, TII, TRI, MRI,
871 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
872 CurrScore);
873 }
874 } else if (TII->isLDSDIR(Inst)) {
875 // LDSDIR instructions attach the score to the destination.
876 setExpScore(
877 &Inst, TII, TRI, MRI,
878 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
879 CurrScore);
880 } else {
881 if (TII->isEXP(Inst)) {
882 // For export the destination registers are really temps that
883 // can be used as the actual source after export patching, so
884 // we need to treat them like sources and set the EXP_CNT
885 // score.
886 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
887 MachineOperand &DefMO = Inst.getOperand(I);
888 if (DefMO.isReg() && DefMO.isDef() &&
889 TRI->isVGPR(*MRI, DefMO.getReg())) {
890 setRegScore(
891 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
892 EXP_CNT, CurrScore);
893 }
894 }
895 }
896 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
897 MachineOperand &MO = Inst.getOperand(I);
898 if (MO.isReg() && !MO.isDef() &&
899 TRI->isVectorRegister(*MRI, MO.getReg())) {
900 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
901 }
902 }
903 }
904 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
905 // Match the score to the destination registers.
906 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
907 auto &Op = Inst.getOperand(I);
908 if (!Op.isReg() || !Op.isDef())
909 continue;
910 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
911 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
912 if (Interval.first >= NUM_ALL_VGPRS)
913 continue;
914 if (updateVMCntOnly(Inst)) {
915 // updateVMCntOnly should only leave us with VGPRs:
916 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
917 // defs. That's required for a sane index into `VgprVmemTypes` below.
918 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
919 VmemType V = getVmemType(Inst);
920 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
921 VgprVmemTypes[RegNo] |= 1 << V;
922 }
923 }
924 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
925 setRegScore(RegNo, T, CurrScore);
926 }
927 }
928 if (Inst.mayStore() &&
929 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
930 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS
931 // they write can be accessed. A load from LDS to VMEM does not need a wait.
932 unsigned Slot = 0;
933 for (const auto *MemOp : Inst.memoperands()) {
934 if (!MemOp->isStore() ||
935 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
936 continue;
937 // Comparing just AA info does not guarantee memoperands are equal
938 // in general, but this is so for LDS DMA in practice.
939 auto AAI = MemOp->getAAInfo();
940 // Alias scope information gives a way to definitely identify the
941 // original memory object and is in practice produced by the module LDS
942 // lowering pass. If there is no scope available we will not be able
943 // to disambiguate LDS aliasing as after the module lowering all LDS
944 // is squashed into a single big object. Do not attempt to use one of
945 // the limited LDSDMAStores for something we will not be able to use
946 // anyway.
947 if (!AAI || !AAI.Scope)
948 break;
949 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
950 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
951 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
952 Slot = I + 1;
953 break;
954 }
955 }
956 }
957 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
958 break;
959 LDSDMAStores.push_back(&Inst);
960 Slot = LDSDMAStores.size();
961 break;
962 }
963 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
964 if (Slot)
965 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
966 }
967 }
968 }
969
970 void WaitcntBrackets::print(raw_ostream &OS) {
971 OS << '\n';
972 for (auto T : inst_counter_types(MaxCounter)) {
973 unsigned SR = getScoreRange(T);
974
975 switch (T) {
976 case LOAD_CNT:
977 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
978 << SR << "): ";
979 break;
980 case DS_CNT:
981 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
982 << SR << "): ";
983 break;
984 case EXP_CNT:
985 OS << " EXP_CNT(" << SR << "): ";
986 break;
987 case STORE_CNT:
988 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
989 << SR << "): ";
990 break;
991 case SAMPLE_CNT:
992 OS << " SAMPLE_CNT(" << SR << "): ";
993 break;
994 case BVH_CNT:
995 OS << " BVH_CNT(" << SR << "): ";
996 break;
997 case KM_CNT:
998 OS << " KM_CNT(" << SR << "): ";
999 break;
1000 default:
1001 OS << " UNKNOWN(" << SR << "): ";
1002 break;
1003 }
1004
1005 if (SR != 0) {
1006 // Print vgpr scores.
1007 unsigned LB = getScoreLB(T);
1008
1009 for (int J = 0; J <= VgprUB; J++) {
1010 unsigned RegScore = getRegScore(J, T);
1011 if (RegScore <= LB)
1012 continue;
1013 unsigned RelScore = RegScore - LB - 1;
1014 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
1015 OS << RelScore << ":v" << J << " ";
1016 } else {
1017 OS << RelScore << ":ds ";
1018 }
1019 }
1020 // Also need to print sgpr scores for lgkm_cnt.
1021 if (T == SmemAccessCounter) {
1022 for (int J = 0; J <= SgprUB; J++) {
1023 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
1024 if (RegScore <= LB)
1025 continue;
1026 unsigned RelScore = RegScore - LB - 1;
1027 OS << RelScore << ":s" << J << " ";
1028 }
1029 }
1030 }
1031 OS << '\n';
1032 }
1033 OS << '\n';
1034 }
1035
1036 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
1037 /// whether a waitcnt instruction is needed at all.
1038 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1039 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1040 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1041 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1042 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1043 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1044 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1045 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1046 }
1047
1048 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1049 unsigned &Count) const {
1050 // The number of outstanding events for this type, T, can be calculated
1051 // as (UB - LB). If the current Count is greater than or equal to the number
1052 // of outstanding events, then the wait for this counter is redundant.
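// For example, with LB = 10 and UB = 13 there are three outstanding events,
// so a requested wait of 3 or more is already satisfied and is dropped
// (reset to ~0u).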
1053 if (Count >= getScoreRange(T))
1054 Count = ~0u;
1055 }
1056
1057 void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
1058 AMDGPU::Waitcnt &Wait) const {
1059 unsigned ScoreToWait = getRegScore(RegNo, T);
1060
1061 // If the score of src_operand falls within the bracket, we need an
1062 // s_waitcnt instruction.
1063 const unsigned LB = getScoreLB(T);
1064 const unsigned UB = getScoreUB(T);
1065 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1066 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1067 !ST->hasFlatLgkmVMemCountInOrder()) {
1068 // If there is a pending FLAT operation, and this is a VMem or LGKM
1069 // waitcnt and the target can report early completion, then we need
1070 // to force a waitcnt 0.
1071 addWait(Wait, T, 0);
1072 } else if (counterOutOfOrder(T)) {
1073 // The counter can get decremented out of order when there are
1074 // multiple event types in the bracket. Also emit an s_wait with a
1075 // conservative value of 0 for this counter.
1076 addWait(Wait, T, 0);
1077 } else {
1078 // If a counter has been maxed out, avoid overflow by waiting for
1079 // MAX(CounterType) - 1 instead.
1080 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
1081 addWait(Wait, T, NeededWait);
1082 }
1083 }
1084 }
1085
1086 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1087 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1088 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1089 applyWaitcnt(DS_CNT, Wait.DsCnt);
1090 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1091 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1092 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1093 applyWaitcnt(KM_CNT, Wait.KmCnt);
1094 }
1095
1096 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1097 const unsigned UB = getScoreUB(T);
1098 if (Count >= UB)
1099 return;
1100 if (Count != 0) {
1101 if (counterOutOfOrder(T))
1102 return;
1103 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1104 } else {
1105 setScoreLB(T, UB);
1106 PendingEvents &= ~WaitEventMaskForInst[T];
1107 }
1108 }
1109
1110 // Where there are multiple types of event in the bracket of a counter,
1111 // the decrement may go out of order.
1112 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1113 // Scalar memory reads can always go out of order.
1114 if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
1115 return true;
1116 return hasMixedPendingEvents(T);
1117 }
1118
1119 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1120 false)
1121 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
1122 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1123 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1124 false)
1125
1126 char SIInsertWaitcnts::ID = 0;
1127
1128 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
1129
1130 FunctionPass *llvm::createSIInsertWaitcntsPass() {
1131 return new SIInsertWaitcnts();
1132 }
1133
1134 static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
1135 unsigned NewEnc) {
1136 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1137 assert(OpIdx >= 0);
1138
1139 MachineOperand &MO = MI.getOperand(OpIdx);
1140
1141 if (NewEnc == MO.getImm())
1142 return false;
1143
1144 MO.setImm(NewEnc);
1145 return true;
1146 }
1147
1148 /// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1149 /// and if so, which counter it is waiting on.
1150 static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1151 switch (Opcode) {
1152 case AMDGPU::S_WAIT_LOADCNT:
1153 return LOAD_CNT;
1154 case AMDGPU::S_WAIT_EXPCNT:
1155 return EXP_CNT;
1156 case AMDGPU::S_WAIT_STORECNT:
1157 return STORE_CNT;
1158 case AMDGPU::S_WAIT_SAMPLECNT:
1159 return SAMPLE_CNT;
1160 case AMDGPU::S_WAIT_BVHCNT:
1161 return BVH_CNT;
1162 case AMDGPU::S_WAIT_DSCNT:
1163 return DS_CNT;
1164 case AMDGPU::S_WAIT_KMCNT:
1165 return KM_CNT;
1166 default:
1167 return {};
1168 }
1169 }
1170
1171 bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1172 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1173 if (Opcode == Waitcnt->getOpcode())
1174 return false;
1175
1176 Waitcnt->setDesc(TII->get(Opcode));
1177 return true;
1178 }
1179
1180 /// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1181 /// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1182 /// from \p Wait that were added by previous passes. Currently this pass
1183 /// conservatively assumes that these preexisting waits are required for
1184 /// correctness.
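/// Illustrative example: a soft s_waitcnt lgkmcnt(0) left by an earlier pass
/// may be merged with a newly required vmcnt(2) into a single
/// s_waitcnt vmcnt(2) lgkmcnt(0), or erased entirely if the bracket state
/// shows it to be redundant.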
1185 bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1186 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1187 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1188 assert(ST);
1189 assert(isNormalMode(MaxCounter));
1190
1191 bool Modified = false;
1192 MachineInstr *WaitcntInstr = nullptr;
1193 MachineInstr *WaitcntVsCntInstr = nullptr;
1194
1195 for (auto &II :
1196 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1197 if (II.isMetaInstruction())
1198 continue;
1199
1200 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1201 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1202
1203 // Update required wait count. If this is a soft waitcnt (= it was added
1204 // by an earlier pass), it may be entirely removed.
1205 if (Opcode == AMDGPU::S_WAITCNT) {
1206 unsigned IEnc = II.getOperand(0).getImm();
1207 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1208 if (TrySimplify)
1209 ScoreBrackets.simplifyWaitcnt(OldWait);
1210 Wait = Wait.combined(OldWait);
1211
1212 // Merge consecutive waitcnt of the same type by erasing multiples.
1213 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1214 II.eraseFromParent();
1215 Modified = true;
1216 } else
1217 WaitcntInstr = &II;
1218 } else {
1219 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1220 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1221
1222 unsigned OldVSCnt =
1223 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1224 if (TrySimplify)
1225 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1226 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1227
1228 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1229 II.eraseFromParent();
1230 Modified = true;
1231 } else
1232 WaitcntVsCntInstr = &II;
1233 }
1234 }
1235
1236 if (WaitcntInstr) {
1237 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1238 AMDGPU::encodeWaitcnt(IV, Wait));
1239 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1240
1241 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1242 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1243 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1244 Wait.LoadCnt = ~0u;
1245 Wait.ExpCnt = ~0u;
1246 Wait.DsCnt = ~0u;
1247
1248 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1249 ? dbgs()
1250 << "applyPreexistingWaitcnt\n"
1251 << "New Instr at block end: " << *WaitcntInstr << '\n'
1252 : dbgs() << "applyPreexistingWaitcnt\n"
1253 << "Old Instr: " << *It
1254 << "New Instr: " << *WaitcntInstr << '\n');
1255 }
1256
1257 if (WaitcntVsCntInstr) {
1258 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1259 AMDGPU::OpName::simm16, Wait.StoreCnt);
1260 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1261
1262 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1263 Wait.StoreCnt = ~0u;
1264
1265 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1266 ? dbgs() << "applyPreexistingWaitcnt\n"
1267 << "New Instr at block end: " << *WaitcntVsCntInstr
1268 << '\n'
1269 : dbgs() << "applyPreexistingWaitcnt\n"
1270 << "Old Instr: " << *It
1271 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1272 }
1273
1274 return Modified;
1275 }
1276
1277 /// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1278 /// required counters in \p Wait
1279 bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1280 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1281 AMDGPU::Waitcnt Wait) {
1282 assert(ST);
1283 assert(isNormalMode(MaxCounter));
1284
1285 bool Modified = false;
1286 const DebugLoc &DL = Block.findDebugLoc(It);
1287
1288 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1289 // single instruction while VScnt has its own instruction.
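// For example, required vmcnt(0), lgkmcnt(0) and expcnt(0) waits fold into
// one s_waitcnt immediate, whereas a storecnt wait is emitted separately as
// s_waitcnt_vscnt null, 0.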
1290 if (Wait.hasWaitExceptStoreCnt()) {
1291 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1292 [[maybe_unused]] auto SWaitInst =
1293 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1294 Modified = true;
1295
1296 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1297 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1298 dbgs() << "New Instr: " << *SWaitInst << '\n');
1299 }
1300
1301 if (Wait.hasWaitStoreCnt()) {
1302 assert(ST->hasVscnt());
1303
1304 [[maybe_unused]] auto SWaitInst =
1305 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1306 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1307 .addImm(Wait.StoreCnt);
1308 Modified = true;
1309
1310 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1311 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1312 dbgs() << "New Instr: " << *SWaitInst << '\n');
1313 }
1314
1315 return Modified;
1316 }
1317
1318 AMDGPU::Waitcnt
1319 WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1320 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1321 }
1322
1323 AMDGPU::Waitcnt
1324 WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1325 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
1326 }
1327
1328 /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1329 /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1330 /// were added by previous passes. Currently this pass conservatively
1331 /// assumes that these preexisting waits are required for correctness.
1332 bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1333 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1334 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1335 assert(ST);
1336 assert(!isNormalMode(MaxCounter));
1337
1338 bool Modified = false;
1339 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1340 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1341 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1342
1343 for (auto &II :
1344 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1345 if (II.isMetaInstruction())
1346 continue;
1347
1348 MachineInstr **UpdatableInstr;
1349
1350 // Update required wait count. If this is a soft waitcnt (= it was added
1351 // by an earlier pass), it may be entirely removed.
1352
1353 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1354 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1355
1356 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1357 // attempt to do more than that either.
1358 if (Opcode == AMDGPU::S_WAITCNT)
1359 continue;
1360
1361 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1362 unsigned OldEnc =
1363 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1364 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1365 if (TrySimplify)
1366 ScoreBrackets.simplifyWaitcnt(OldWait);
1367 Wait = Wait.combined(OldWait);
1368 UpdatableInstr = &CombinedLoadDsCntInstr;
1369 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1370 unsigned OldEnc =
1371 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1372 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1373 if (TrySimplify)
1374 ScoreBrackets.simplifyWaitcnt(OldWait);
1375 Wait = Wait.combined(OldWait);
1376 UpdatableInstr = &CombinedStoreDsCntInstr;
1377 } else {
1378 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1379 assert(CT.has_value());
1380 unsigned OldCnt =
1381 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1382 if (TrySimplify)
1383 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1384 addWait(Wait, CT.value(), OldCnt);
1385 UpdatableInstr = &WaitInstrs[CT.value()];
1386 }
1387
1388 // Merge consecutive waitcnt of the same type by erasing multiples.
1389 if (!*UpdatableInstr) {
1390 *UpdatableInstr = &II;
1391 } else {
1392 II.eraseFromParent();
1393 Modified = true;
1394 }
1395 }
1396
1397 if (CombinedLoadDsCntInstr) {
1398 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1399 // to be waited for. Otherwise, let the instruction be deleted so
1400 // the appropriate single counter wait instruction can be inserted
1401 // instead, when new S_WAIT_*CNT instructions are inserted by
1402 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1403 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1404 // the loop below that deals with single counter instructions.
1405 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1406 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1407 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1408 AMDGPU::OpName::simm16, NewEnc);
1409 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1410 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1411 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1412 Wait.LoadCnt = ~0u;
1413 Wait.DsCnt = ~0u;
1414
1415 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1416 ? dbgs() << "applyPreexistingWaitcnt\n"
1417 << "New Instr at block end: "
1418 << *CombinedLoadDsCntInstr << '\n'
1419 : dbgs() << "applyPreexistingWaitcnt\n"
1420 << "Old Instr: " << *It << "New Instr: "
1421 << *CombinedLoadDsCntInstr << '\n');
1422 } else {
1423 CombinedLoadDsCntInstr->eraseFromParent();
1424 Modified = true;
1425 }
1426 }
1427
1428 if (CombinedStoreDsCntInstr) {
1429 // Similarly for S_WAIT_STORECNT_DSCNT.
1430 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1431 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1432 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1433 AMDGPU::OpName::simm16, NewEnc);
1434 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1435 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1436 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1437 Wait.StoreCnt = ~0u;
1438 Wait.DsCnt = ~0u;
1439
1440 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1441 ? dbgs() << "applyPreexistingWaitcnt\n"
1442 << "New Instr at block end: "
1443 << *CombinedStoreDsCntInstr << '\n'
1444 : dbgs() << "applyPreexistingWaitcnt\n"
1445 << "Old Instr: " << *It << "New Instr: "
1446 << *CombinedStoreDsCntInstr << '\n');
1447 } else {
1448 CombinedStoreDsCntInstr->eraseFromParent();
1449 Modified = true;
1450 }
1451 }
1452
1453 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1454 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1455 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1456 // instructions so that createNewWaitcnt() will create new combined
1457 // instructions to replace them.
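// For example, an existing s_wait_loadcnt 0 followed by s_wait_dscnt 0 is
// erased here so that createNewWaitcnt() can emit a single
// s_wait_loadcnt_dscnt covering both counters.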
1458
1459 if (Wait.DsCnt != ~0u) {
1460 // This is a vector of addresses in WaitInstrs pointing to instructions
1461 // that should be removed if they are present.
1462 SmallVector<MachineInstr **, 2> WaitsToErase;
1463
1464 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1465 // both) need to be waited for, ensure that there are no existing
1466 // individual wait count instructions for these.
1467
1468 if (Wait.LoadCnt != ~0u) {
1469 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1470 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1471 } else if (Wait.StoreCnt != ~0u) {
1472 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1473 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1474 }
1475
1476 for (MachineInstr **WI : WaitsToErase) {
1477 if (!*WI)
1478 continue;
1479
1480 (*WI)->eraseFromParent();
1481 *WI = nullptr;
1482 Modified = true;
1483 }
1484 }
1485
1486 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1487 if (!WaitInstrs[CT])
1488 continue;
1489
1490 unsigned NewCnt = getWait(Wait, CT);
1491 if (NewCnt != ~0u) {
1492 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1493 AMDGPU::OpName::simm16, NewCnt);
1494 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1495
1496 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1497 setNoWait(Wait, CT);
1498
1499 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1500 ? dbgs() << "applyPreexistingWaitcnt\n"
1501 << "New Instr at block end: " << *WaitInstrs[CT]
1502 << '\n'
1503 : dbgs() << "applyPreexistingWaitcnt\n"
1504 << "Old Instr: " << *It
1505 << "New Instr: " << *WaitInstrs[CT] << '\n');
1506 } else {
1507 WaitInstrs[CT]->eraseFromParent();
1508 Modified = true;
1509 }
1510 }
1511
1512 return Modified;
1513 }
1514
1515 /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1516 bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1517 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1518 AMDGPU::Waitcnt Wait) {
1519 assert(ST);
1520 assert(!isNormalMode(MaxCounter));
1521
1522 bool Modified = false;
1523 const DebugLoc &DL = Block.findDebugLoc(It);
1524
1525 // Check for opportunities to use combined wait instructions.
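// As an illustrative sketch: for Wait = {LoadCnt=0, DsCnt=0, KmCnt=0} this
// emits S_WAIT_LOADCNT_DSCNT 0 (encoded via encodeLoadcntDscnt) and leaves
// KmCnt to the per-counter loop below, which then emits S_WAIT_KMCNT 0.
// The combined forms only ever pair DSCNT with either LOADCNT or STORECNT.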
1526 if (Wait.DsCnt != ~0u) {
1527 MachineInstr *SWaitInst = nullptr;
1528
1529 if (Wait.LoadCnt != ~0u) {
1530 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1531
1532 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1533 .addImm(Enc);
1534
1535 Wait.LoadCnt = ~0u;
1536 Wait.DsCnt = ~0u;
1537 } else if (Wait.StoreCnt != ~0u) {
1538 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1539
1540 SWaitInst =
1541 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1542 .addImm(Enc);
1543
1544 Wait.StoreCnt = ~0u;
1545 Wait.DsCnt = ~0u;
1546 }
1547
1548 if (SWaitInst) {
1549 Modified = true;
1550
1551 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1552 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1553 dbgs() << "New Instr: " << *SWaitInst << '\n');
1554 }
1555 }
1556
1557 // Generate an instruction for any remaining counter that needs
1558 // waiting for.
1559
1560 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1561 unsigned Count = getWait(Wait, CT);
1562 if (Count == ~0u)
1563 continue;
1564
1565 [[maybe_unused]] auto SWaitInst =
1566 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1567 .addImm(Count);
1568
1569 Modified = true;
1570
1571 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1572 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1573 dbgs() << "New Instr: " << *SWaitInst << '\n');
1574 }
1575
1576 return Modified;
1577 }
1578
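/// \returns true if \p MI is a conditional branch that reads the vccz bit and
/// that operand is not already known to be undef.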
1579 static bool readsVCCZ(const MachineInstr &MI) {
1580 unsigned Opc = MI.getOpcode();
1581 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1582 !MI.getOperand(1).isUndef();
1583 }
1584
1585 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1586 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1587 // Currently all conventions wait, but this may not always be the case.
1588 //
1589 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1590 // sense to omit the wait and do it in the caller.
1591 return true;
1592 }
1593
1594 /// \returns true if the callee is expected to wait for all outstanding memory
1595 /// operations before returning.
1596 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
1597 return true;
1598 }
1599
1600 /// Generate an s_waitcnt instruction to be placed before cur_Inst.
1601 /// Instructions of a given type are returned in order,
1602 /// but instructions of different types can complete out of order.
1603 /// We rely on this in-order completion
1604 /// and simply assign a score to the memory access instructions.
1605 /// We keep track of the active "score bracket" to determine
1606 /// if a memory access requires an s_waitcnt
1607 /// and if so what the value of each counter is.
1608 /// The "score bracket" is bound by the lower bound and upper bound
1609 /// scores (*_score_LB and *_score_ub respectively).
1610 /// If FlushVmCnt is true, we want to generate an s_waitcnt to
1611 /// flush the vmcnt counter here.
1612 bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1613 WaitcntBrackets &ScoreBrackets,
1614 MachineInstr *OldWaitcntInstr,
1615 bool FlushVmCnt) {
1616 setForceEmitWaitcnt();
1617
1618 if (MI.isMetaInstruction())
1619 return false;
1620
1621 AMDGPU::Waitcnt Wait;
1622
1623 // FIXME: This should have already been handled by the memory legalizer.
1624 // Removing this currently doesn't affect any lit tests, but we need to
1625 // verify that nothing was relying on this. The number of buffer invalidates
1626 // being handled here should not be expanded.
1627 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1628 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1629 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1630 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1631 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1632 Wait.LoadCnt = 0;
1633 }
1634
1635 // All waits must be resolved at call return.
1636 // NOTE: this could be improved with knowledge of all call sites or
1637 // with knowledge of the called routines.
1638 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1639 MI.getOpcode() == AMDGPU::SI_RETURN ||
1640 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1641 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1642 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1643 }
1644 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1645 // stores. In this case it can be useful to send a message to explicitly
1646 // release all VGPRs before the stores have completed, but it is only safe to
1647 // do this if:
1648 // * there are no outstanding scratch stores
1649 // * we are not in Dynamic VGPR mode
1650 else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1651 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1652 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
1653 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1654 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
1655 ReleaseVGPRInsts.insert(&MI);
1656 }
1657 // Resolve vm waits before gs-done.
1658 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1659 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1660 ST->hasLegacyGeometry() &&
1661 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1662 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1663 Wait.LoadCnt = 0;
1664 }
1665
1666 // Export & GDS instructions do not read the EXEC mask until after the export
1667 // is granted (which can occur well after the instruction is issued).
1668 // The shader program must flush all EXP operations on the export-count
1669 // before overwriting the EXEC mask.
1670 else {
1671 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1672 // Export and GDS are tracked individually, either may trigger a waitcnt
1673 // for EXEC.
1674 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1675 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1676 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1677 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1678 Wait.ExpCnt = 0;
1679 }
1680 }
1681
1682 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1683 // The function is going to insert a wait on everything in its prolog.
1684 // We still need to be careful if the call target is a load (e.g. a GOT
1685 // load). We also need to check for WAW dependencies with the saved PC.
1686 Wait = AMDGPU::Waitcnt();
1687
1688 int CallAddrOpIdx =
1689 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1690
1691 if (MI.getOperand(CallAddrOpIdx).isReg()) {
1692 RegInterval CallAddrOpInterval =
1693 ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);
1694
1695 for (int RegNo = CallAddrOpInterval.first;
1696 RegNo < CallAddrOpInterval.second; ++RegNo)
1697 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1698
1699 int RtnAddrOpIdx =
1700 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1701 if (RtnAddrOpIdx != -1) {
1702 RegInterval RtnAddrOpInterval =
1703 ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);
1704
1705 for (int RegNo = RtnAddrOpInterval.first;
1706 RegNo < RtnAddrOpInterval.second; ++RegNo)
1707 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1708 }
1709 }
1710 } else {
1711 // FIXME: Should not be relying on memoperands.
1712 // Look at the source operands of every instruction to see if
1713 // any of them results from a previous memory operation that affects
1714 // its current usage. If so, an s_waitcnt instruction needs to be
1715 // emitted.
1716 // If the source operand was defined by a load, add the s_waitcnt
1717 // instruction.
1718 //
1719 // Two cases are handled for destination operands:
1720 // 1) If the destination operand was defined by a load, add the s_waitcnt
1721 // instruction to guarantee the right WAW order.
1722 // 2) If a destination operand was used by a recent export/store instruction,
1723 // add an s_waitcnt on exp_cnt to guarantee the WAR order.
1724
1725 for (const MachineMemOperand *Memop : MI.memoperands()) {
1726 const Value *Ptr = Memop->getValue();
1727 if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1728 addWait(Wait, SmemAccessCounter, 0);
1729 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1730 SLoadAddresses.erase(Ptr);
1731 }
1732 unsigned AS = Memop->getAddrSpace();
1733 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1734 continue;
1735 // No need to wait before load from VMEM to LDS.
1736 if (TII->mayWriteLDSThroughDMA(MI))
1737 continue;
1738
1739 // LOAD_CNT is only relevant to vgpr or LDS.
1740 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1741 bool FoundAliasingStore = false;
1742 // Only objects with alias scope info were added to the LDSDMAScopes array.
1743 // In the absence of the scope info we will not be able to disambiguate
1744 // aliasing here. There is no need to try searching for a corresponding
1745 // store slot. This is conservatively correct because in that case we
1746 // will produce a wait using the first (general) LDS DMA wait slot which
1747 // will wait on all of them anyway.
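// Note on the slot layout used here: RegNo (SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS)
// is the first, general LDS DMA wait slot; stores recorded with alias scope
// info occupy the slots after it, hence the RegNo + I + 1 lookup below.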
1748 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1749 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1750 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1751 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
1752 FoundAliasingStore = true;
1753 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1754 }
1755 }
1756 }
1757 if (!FoundAliasingStore)
1758 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1759 if (Memop->isStore()) {
1760 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1761 }
1762 }
1763
1764 // Loop over use and def operands.
1765 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1766 MachineOperand &Op = MI.getOperand(I);
1767 if (!Op.isReg())
1768 continue;
1769
1770 // If the instruction does not read tied source, skip the operand.
1771 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1772 continue;
1773
1774 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);
1775
1776 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1777 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1778 if (IsVGPR) {
1779 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1780 // previous write and this write are the same type of VMEM
1781 // instruction, in which case they are (in some architectures)
1782 // guaranteed to write their results in order anyway.
1783 if (Op.isUse() || !updateVMCntOnly(MI) ||
1784 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1785 getVmemType(MI)) ||
1786 !ST->hasVmemWriteVgprInOrder()) {
1787 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1788 ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
1789 ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
1790 ScoreBrackets.clearVgprVmemTypes(RegNo);
1791 }
1792 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1793 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1794 }
1795 ScoreBrackets.determineWait(DS_CNT, RegNo, Wait);
1796 } else {
1797 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1798 }
1799 }
1800 }
1801 }
1802 }
1803
1804 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1805 // not, we need to ensure the subtarget is capable of backing off barrier
1806 // instructions in case there are any outstanding memory operations that may
1807 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1808 if (TII->isBarrierStart(MI.getOpcode()) &&
1809 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1810 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
1811 }
1812
1813 // TODO: Remove this work-around, enable the assert for Bug 457939
1814 // after fixing the scheduler. Also, the Shader Compiler code is
1815 // independent of target.
1816 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1817 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1818 Wait.DsCnt = 0;
1819 }
1820 }
1821
1822 // Verify that the wait is actually needed.
1823 ScoreBrackets.simplifyWaitcnt(Wait);
1824
1825 if (ForceEmitZeroWaitcnts)
1826 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
1827
1828 if (ForceEmitWaitcnt[LOAD_CNT])
1829 Wait.LoadCnt = 0;
1830 if (ForceEmitWaitcnt[EXP_CNT])
1831 Wait.ExpCnt = 0;
1832 if (ForceEmitWaitcnt[DS_CNT])
1833 Wait.DsCnt = 0;
1834 if (ForceEmitWaitcnt[SAMPLE_CNT])
1835 Wait.SampleCnt = 0;
1836 if (ForceEmitWaitcnt[BVH_CNT])
1837 Wait.BvhCnt = 0;
1838 if (ForceEmitWaitcnt[KM_CNT])
1839 Wait.KmCnt = 0;
1840
1841 if (FlushVmCnt) {
1842 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
1843 Wait.LoadCnt = 0;
1844 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
1845 Wait.SampleCnt = 0;
1846 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
1847 Wait.BvhCnt = 0;
1848 }
1849
1850 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1851 OldWaitcntInstr);
1852 }
1853
1854 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1855 MachineBasicBlock::instr_iterator It,
1856 MachineBasicBlock &Block,
1857 WaitcntBrackets &ScoreBrackets,
1858 MachineInstr *OldWaitcntInstr) {
1859 bool Modified = false;
1860
1861 if (OldWaitcntInstr)
1862 // Try to merge the required wait with preexisting waitcnt instructions.
1863 // Also erase redundant waitcnt.
1864 Modified =
1865 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1866
1867 // Any counts that could have been applied to any existing waitcnt
1868 // instructions will have been done so, now deal with any remaining.
1869 ScoreBrackets.applyWaitcnt(Wait);
1870
1871 // ExpCnt can be merged into VINTERP.
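// For example, if Wait.ExpCnt == 0 and the next instruction is a VINTERP with
// a larger waitexp operand, the operand is tightened to 0 and no separate
// wait instruction is needed for EXP_CNT.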
1872 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1873 SIInstrInfo::isVINTERP(*It)) {
1874 MachineOperand *WaitExp =
1875 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1876 if (Wait.ExpCnt < WaitExp->getImm()) {
1877 WaitExp->setImm(Wait.ExpCnt);
1878 Modified = true;
1879 }
1880 Wait.ExpCnt = ~0u;
1881
1882 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
1883 << "Update Instr: " << *It);
1884 }
1885
1886 if (WCG->createNewWaitcnt(Block, It, Wait))
1887 Modified = true;
1888
1889 return Modified;
1890 }
1891
1892 // This is a flat memory operation. Check to see if it has memory tokens other
1893 // than LDS. Other address spaces supported by flat memory operations involve
1894 // global memory.
1895 bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1896 assert(TII->isFLAT(MI));
1897
1898 // All flat instructions use the VMEM counter.
1899 assert(TII->usesVM_CNT(MI));
1900
1901 // If there are no memory operands then conservatively assume the flat
1902 // operation may access VMEM.
1903 if (MI.memoperands_empty())
1904 return true;
1905
1906 // See if any memory operand specifies an address space that involves VMEM.
1907 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
1908 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1909 // (GDS) address space is not supported by flat operations. Therefore, simply
1910 // return true unless only the LDS address space is found.
1911 for (const MachineMemOperand *Memop : MI.memoperands()) {
1912 unsigned AS = Memop->getAddrSpace();
1913 assert(AS != AMDGPUAS::REGION_ADDRESS);
1914 if (AS != AMDGPUAS::LOCAL_ADDRESS)
1915 return true;
1916 }
1917
1918 return false;
1919 }
1920
1921 // This is a flat memory operation. Check to see if it has memory tokens for
1922 // either LDS or FLAT.
1923 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1924 assert(TII->isFLAT(MI));
1925
1926 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
1927 if (!TII->usesLGKM_CNT(MI))
1928 return false;
1929
1930 // If in tgsplit mode then there can be no use of LDS.
1931 if (ST->isTgSplitEnabled())
1932 return false;
1933
1934 // If there are no memory operands then conservatively assume the flat
1935 // operation may access LDS.
1936 if (MI.memoperands_empty())
1937 return true;
1938
1939 // See if any memory operand specifies an address space that involves LDS.
1940 for (const MachineMemOperand *Memop : MI.memoperands()) {
1941 unsigned AS = Memop->getAddrSpace();
1942 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1943 return true;
1944 }
1945
1946 return false;
1947 }
1948
1949 // This is a flat memory operation. Check to see if it has memory tokens for
1950 // either scratch or FLAT.
1951 bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
1952 const MachineInstr &MI) const {
1953 assert(TII->isFLAT(MI));
1954
1955 // SCRATCH instructions always access scratch.
1956 if (TII->isFLATScratch(MI))
1957 return true;
1958
1959 // GLOBAL instructions never access scratch.
1960 if (TII->isFLATGlobal(MI))
1961 return false;
1962
1963 // If there are no memory operands then conservatively assume the flat
1964 // operation may access scratch.
1965 if (MI.memoperands_empty())
1966 return true;
1967
1968 // See if any memory operand specifies an address space that involves scratch.
1969 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
1970 unsigned AS = Memop->getAddrSpace();
1971 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
1972 });
1973 }
1974
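/// \returns true if \p Inst is a global cache invalidate or writeback
/// instruction (GLOBAL_INV, GLOBAL_WB or GLOBAL_WBINV).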
1975 static bool isCacheInvOrWBInst(MachineInstr &Inst) {
1976 auto Opc = Inst.getOpcode();
1977 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
1978 Opc == AMDGPU::GLOBAL_WBINV;
1979 }
1980
1981 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1982 WaitcntBrackets *ScoreBrackets) {
1983 // Now look at the instruction opcode. If it is a memory access
1984 // instruction, update the upper-bound of the appropriate counter's
1985 // bracket and the destination operand scores.
1986 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
1987
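// For example: a DS_READ updates the DS_CNT bracket via LDS_ACCESS, a
// BUFFER_LOAD updates LOAD_CNT via the event chosen by getVmemWaitEventType(),
// and an EXP instruction updates EXP_CNT via one of the EXP_* events depending
// on its target.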
1988 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1989 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1990 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1991 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1992 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1993 } else {
1994 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1995 }
1996 } else if (TII->isFLAT(Inst)) {
1997 // TODO: Track this properly.
1998 if (isCacheInvOrWBInst(Inst))
1999 return;
2000
2001 assert(Inst.mayLoadOrStore());
2002
2003 int FlatASCount = 0;
2004
2005 if (mayAccessVMEMThroughFlat(Inst)) {
2006 ++FlatASCount;
2007 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2008 Inst);
2009 }
2010
2011 if (mayAccessLDSThroughFlat(Inst)) {
2012 ++FlatASCount;
2013 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2014 }
2015
2016 // A Flat memory operation must access at least one address space.
2017 assert(FlatASCount);
2018
2019 // This is a flat memory operation that accesses both VMEM and LDS, so note it
2020 // - it will require that both the VM and LGKM be flushed to zero if it is
2021 // pending when a VM or LGKM dependency occurs.
2022 if (FlatASCount > 1)
2023 ScoreBrackets->setPendingFlat();
2024 } else if (SIInstrInfo::isVMEM(Inst) &&
2025 !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2026 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2027 Inst);
2028
2029 if (ST->vmemWriteNeedsExpWaitcnt() &&
2030 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2031 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
2032 }
2033 } else if (TII->isSMRD(Inst)) {
2034 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2035 } else if (Inst.isCall()) {
2036 if (callWaitsOnFunctionReturn(Inst)) {
2037 // Act as a wait on everything
2038 ScoreBrackets->applyWaitcnt(
2039 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2040 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2041 } else {
2042 // May need to wait for anything.
2043 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2044 }
2045 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2046 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
2047 } else if (TII->isVINTERP(Inst)) {
2048 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2049 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2050 } else if (SIInstrInfo::isEXP(Inst)) {
2051 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2052 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2053 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
2054 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2055 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
2056 else
2057 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
2058 } else {
2059 switch (Inst.getOpcode()) {
2060 case AMDGPU::S_SENDMSG:
2061 case AMDGPU::S_SENDMSG_RTN_B32:
2062 case AMDGPU::S_SENDMSG_RTN_B64:
2063 case AMDGPU::S_SENDMSGHALT:
2064 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
2065 break;
2066 case AMDGPU::S_MEMTIME:
2067 case AMDGPU::S_MEMREALTIME:
2068 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2069 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2070 case AMDGPU::S_BARRIER_LEAVE:
2071 case AMDGPU::S_GET_BARRIER_STATE_M0:
2072 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2073 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2074 break;
2075 }
2076 }
2077 }
2078
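// Rebase a score from this bracket's timeline and from Other's timeline onto
// the merged timeline computed in merge() below. A worked example with
// illustrative numbers: if for some counter this bracket spans [LB=2, UB=5]
// (3 pending events) and Other spans [LB=0, UB=4] (4 pending), merge() picks
// NewUB = 2 + max(3, 4) = 6, so MyShift = 1 and OtherShift = 2. A score of 5
// here and a score of 4 in Other both map to 6, i.e. the most recent event on
// each side lines up at the new upper bound, while scores at or below their
// old lower bound (already waited on) collapse to 0.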
2079 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2080 unsigned OtherScore) {
2081 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2082 unsigned OtherShifted =
2083 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2084 Score = std::max(MyShifted, OtherShifted);
2085 return OtherShifted > MyShifted;
2086 }
2087
2088 /// Merge the pending events and associated score brackets of \p Other into
2089 /// this bracket's status.
2090 ///
2091 /// Returns whether the merge resulted in a change that requires tighter waits
2092 /// (i.e. the merged brackets strictly dominate the original brackets).
2093 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2094 bool StrictDom = false;
2095
2096 VgprUB = std::max(VgprUB, Other.VgprUB);
2097 SgprUB = std::max(SgprUB, Other.SgprUB);
2098
2099 for (auto T : inst_counter_types(MaxCounter)) {
2100 // Merge event flags for this counter
2101 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2102 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2103 if (OtherEvents & ~OldEvents)
2104 StrictDom = true;
2105 PendingEvents |= OtherEvents;
2106
2107 // Merge scores for this counter
2108 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2109 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2110 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2111 if (NewUB < ScoreLBs[T])
2112 report_fatal_error("waitcnt score overflow");
2113
2114 MergeInfo M;
2115 M.OldLB = ScoreLBs[T];
2116 M.OtherLB = Other.ScoreLBs[T];
2117 M.MyShift = NewUB - ScoreUBs[T];
2118 M.OtherShift = NewUB - Other.ScoreUBs[T];
2119
2120 ScoreUBs[T] = NewUB;
2121
2122 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2123
2124 for (int J = 0; J <= VgprUB; J++)
2125 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
2126
2127 if (T == SmemAccessCounter) {
2128 for (int J = 0; J <= SgprUB; J++)
2129 StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
2130 }
2131 }
2132
2133 for (int J = 0; J <= VgprUB; J++) {
2134 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2135 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2136 VgprVmemTypes[J] = NewVmemTypes;
2137 }
2138
2139 return StrictDom;
2140 }
2141
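// Returns true if Inst is one of the wait instructions managed by this pass:
// a legacy S_WAITCNT, an S_WAITCNT_VSCNT on the null register, one of the
// combined gfx12 waits, or a single-counter S_WAIT_*CNT. Soft variants are
// first mapped to their non-soft opcodes.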
2142 static bool isWaitInstr(MachineInstr &Inst) {
2143 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2144 return Opcode == AMDGPU::S_WAITCNT ||
2145 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2146 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2147 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2148 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2149 counterTypeForInstr(Opcode).has_value();
2150 }
2151
2152 // Generate s_waitcnt instructions where needed.
2153 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2154 MachineBasicBlock &Block,
2155 WaitcntBrackets &ScoreBrackets) {
2156 bool Modified = false;
2157
2158 LLVM_DEBUG({
2159 dbgs() << "*** Block" << Block.getNumber() << " ***";
2160 ScoreBrackets.dump();
2161 });
2162
2163 // Track the correctness of vccz through this basic block. There are two
2164 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2165 // ST->partialVCCWritesUpdateVCCZ().
2166 bool VCCZCorrect = true;
2167 if (ST->hasReadVCCZBug()) {
2168 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2169 // to vcc and then issued an smem load.
2170 VCCZCorrect = false;
2171 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2172 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2173 // to vcc_lo or vcc_hi.
2174 VCCZCorrect = false;
2175 }
2176
2177 // Walk over the instructions.
2178 MachineInstr *OldWaitcntInstr = nullptr;
2179
2180 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2181 E = Block.instr_end();
2182 Iter != E;) {
2183 MachineInstr &Inst = *Iter;
2184
2185 // Track pre-existing waitcnts that were added in earlier iterations or by
2186 // the memory legalizer.
2187 if (isWaitInstr(Inst)) {
2188 if (!OldWaitcntInstr)
2189 OldWaitcntInstr = &Inst;
2190 ++Iter;
2191 continue;
2192 }
2193
2194 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2195 isPreheaderToFlush(Block, ScoreBrackets);
2196
2197 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2198 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2199 FlushVmCnt);
2200 OldWaitcntInstr = nullptr;
2201
2202 // Restore vccz if it's not known to be correct already.
2203 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
2204
2205 // Don't examine operands unless we need to track vccz correctness.
2206 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2207 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2208 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2209 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2210 if (!ST->partialVCCWritesUpdateVCCZ())
2211 VCCZCorrect = false;
2212 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2213 // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
2214 // the vccz bit, so when we detect that an instruction may read from a
2215 // corrupt vccz bit, we need to:
2216 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2217 // operations to complete.
2218 // 2. Restore the correct value of vccz by writing the current value
2219 // of vcc back to vcc.
2220 if (ST->hasReadVCCZBug() &&
2221 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2222 // Writes to vcc while there's an outstanding smem read may get
2223 // clobbered as soon as any read completes.
2224 VCCZCorrect = false;
2225 } else {
2226 // Writes to vcc will fix any incorrect value in vccz.
2227 VCCZCorrect = true;
2228 }
2229 }
2230 }
2231
2232 if (TII->isSMRD(Inst)) {
2233 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2234 // No need to handle invariant loads when avoiding WAR conflicts, as
2235 // there cannot be a vector store to the same memory location.
2236 if (!Memop->isInvariant()) {
2237 const Value *Ptr = Memop->getValue();
2238 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2239 }
2240 }
2241 if (ST->hasReadVCCZBug()) {
2242 // This smem read could complete and clobber vccz at any time.
2243 VCCZCorrect = false;
2244 }
2245 }
2246
2247 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2248
2249 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
2250 AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
2251 Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
2252 ScoreBrackets.simplifyWaitcnt(Wait);
2253 Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
2254 ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
2255 }
2256
2257 LLVM_DEBUG({
2258 Inst.print(dbgs());
2259 ScoreBrackets.dump();
2260 });
2261
2262 // TODO: Remove this work-around after fixing the scheduler and enable the
2263 // assert above.
2264 if (RestoreVCCZ) {
2265 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2266 // bit is updated, so we can restore the bit by reading the value of
2267 // vcc and then writing it back to the register.
2268 BuildMI(Block, Inst, Inst.getDebugLoc(),
2269 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2270 TRI->getVCC())
2271 .addReg(TRI->getVCC());
2272 VCCZCorrect = true;
2273 Modified = true;
2274 }
2275
2276 ++Iter;
2277 }
2278
2279 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2280 // needed.
2281 AMDGPU::Waitcnt Wait;
2282 if (Block.getFirstTerminator() == Block.end() &&
2283 isPreheaderToFlush(Block, ScoreBrackets)) {
2284 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2285 Wait.LoadCnt = 0;
2286 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2287 Wait.SampleCnt = 0;
2288 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2289 Wait.BvhCnt = 0;
2290 }
2291
2292 // Combine or remove any redundant waitcnts at the end of the block.
2293 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2294 OldWaitcntInstr);
2295
2296 return Modified;
2297 }
2298
2299 // Return true if the given machine basic block is a preheader of a loop in
2300 // which we want to flush the vmcnt counter, and false otherwise.
2301 bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
2302 WaitcntBrackets &ScoreBrackets) {
2303 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2304 if (!IsInserted)
2305 return Iterator->second;
2306
2307 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2308 if (!Succ)
2309 return false;
2310
2311 MachineLoop *Loop = MLI->getLoopFor(Succ);
2312 if (!Loop)
2313 return false;
2314
2315 if (Loop->getLoopPreheader() == &MBB &&
2316 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2317 Iterator->second = true;
2318 return true;
2319 }
2320
2321 return false;
2322 }
2323
2324 bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2325 return SIInstrInfo::isVMEM(MI) ||
2326 (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
2327 }
2328
2329 // Return true if it is better to flush the vmcnt counter in the preheader of
2330 // the given loop. We currently decide to flush in two situations:
2331 // 1. The loop contains vmem store(s), no vmem load and at least one use of a
2332 // vgpr containing a value that is loaded outside of the loop. (Only on
2333 // targets with no vscnt counter).
2334 // 2. The loop contains vmem load(s), but the loaded values are not used in the
2335 // loop, and at least one use of a vgpr containing a value that is loaded
2336 // outside of the loop.
2337 bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2338 WaitcntBrackets &Brackets) {
2339 bool HasVMemLoad = false;
2340 bool HasVMemStore = false;
2341 bool UsesVgprLoadedOutside = false;
2342 DenseSet<Register> VgprUse;
2343 DenseSet<Register> VgprDef;
2344
2345 for (MachineBasicBlock *MBB : ML->blocks()) {
2346 for (MachineInstr &MI : *MBB) {
2347 if (isVMEMOrFlatVMEM(MI)) {
2348 if (MI.mayLoad())
2349 HasVMemLoad = true;
2350 if (MI.mayStore())
2351 HasVMemStore = true;
2352 }
2353 for (unsigned I = 0; I < MI.getNumOperands(); I++) {
2354 MachineOperand &Op = MI.getOperand(I);
2355 if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
2356 continue;
2357 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
2358 // Vgpr use
2359 if (Op.isUse()) {
2360 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2361 // If we find a register that is loaded inside the loop, 1. and 2.
2362 // are invalidated and we can exit.
2363 if (VgprDef.contains(RegNo))
2364 return false;
2365 VgprUse.insert(RegNo);
2366 // If at least one of Op's registers is in the score brackets, the
2367 // value is likely loaded outside of the loop.
2368 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2369 Brackets.getScoreLB(LOAD_CNT) ||
2370 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2371 Brackets.getScoreLB(SAMPLE_CNT) ||
2372 Brackets.getRegScore(RegNo, BVH_CNT) >
2373 Brackets.getScoreLB(BVH_CNT)) {
2374 UsesVgprLoadedOutside = true;
2375 break;
2376 }
2377 }
2378 }
2379 // VMem load vgpr def
2380 else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
2381 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2382 // If we find a register that is loaded inside the loop, 1. and 2.
2383 // are invalidated and we can exit.
2384 if (VgprUse.contains(RegNo))
2385 return false;
2386 VgprDef.insert(RegNo);
2387 }
2388 }
2389 }
2390 }
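// The first return below corresponds to situation 1. above (stores only, on
// targets without a separate vscnt counter); the second to situation 2.
// (loads whose results are not consumed inside the loop, on targets where
// VMEM writes to VGPRs complete in order).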
2391 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2392 return true;
2393 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2394 }
2395
2396 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
2397 ST = &MF.getSubtarget<GCNSubtarget>();
2398 TII = ST->getInstrInfo();
2399 TRI = &TII->getRegisterInfo();
2400 MRI = &MF.getRegInfo();
2401 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2402 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2403 PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2404 if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2405 AA = &AAR->getAAResults();
2406
2407 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
2408
2409 if (ST->hasExtendedWaitCounts()) {
2410 MaxCounter = NUM_EXTENDED_INST_CNTS;
2411 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2412 WCG = &WCGGFX12Plus;
2413 } else {
2414 MaxCounter = NUM_NORMAL_INST_CNTS;
2415 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
2416 WCG = &WCGPreGFX12;
2417 }
2418
2419 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
2420 for (auto T : inst_counter_types())
2421 ForceEmitWaitcnt[T] = false;
2422
2423 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2424
2425 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2426
2427 HardwareLimits Limits = {};
2428 if (ST->hasExtendedWaitCounts()) {
2429 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2430 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2431 } else {
2432 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2433 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2434 }
2435 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2436 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2437 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2438 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2439 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2440
2441 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
2442 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2443 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2444 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2445
2446 RegisterEncoding Encoding = {};
2447 Encoding.VGPR0 =
2448 TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2449 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
2450 Encoding.SGPR0 =
2451 TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2452 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
2453
2454 BlockInfos.clear();
2455 bool Modified = false;
2456
2457 MachineBasicBlock &EntryBB = MF.front();
2458 MachineBasicBlock::iterator I = EntryBB.begin();
2459
2460 if (!MFI->isEntryFunction()) {
2461 // Wait for any outstanding memory operations that the input registers may
2462 // depend on. We can't track them and it's better to do the wait after the
2463 // costly call sequence.
2464
2465 // TODO: Could insert earlier and schedule more liberally with operations
2466 // that only use caller preserved registers.
2467 for (MachineBasicBlock::iterator E = EntryBB.end();
2468 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2469 ;
2470
2471 if (ST->hasExtendedWaitCounts()) {
2472 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2473 .addImm(0);
2474 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2475 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
2476 continue;
2477
2478 BuildMI(EntryBB, I, DebugLoc(),
2479 TII->get(instrsForExtendedCounterTypes[CT]))
2480 .addImm(0);
2481 }
2482 } else {
2483 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2484 }
2485
2486 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2487 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2488 SmemAccessCounter);
2489 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2490 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2491
2492 Modified = true;
2493 }
2494
2495 // Keep iterating over the blocks in reverse post order, inserting and
2496 // updating s_waitcnt where needed, until a fix point is reached.
2497 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2498 BlockInfos.insert({MBB, BlockInfo()});
2499
2500 std::unique_ptr<WaitcntBrackets> Brackets;
2501 bool Repeat;
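// Repeat is set when the incoming state of a successor that appears at or
// before the current block in the map (i.e. across a back edge) is first
// recorded or strictly changes; that block must be revisited, so another
// sweep over the blocks is needed.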
2502 do {
2503 Repeat = false;
2504
2505 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2506 ++BII) {
2507 MachineBasicBlock *MBB = BII->first;
2508 BlockInfo &BI = BII->second;
2509 if (!BI.Dirty)
2510 continue;
2511
2512 if (BI.Incoming) {
2513 if (!Brackets)
2514 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2515 else
2516 *Brackets = *BI.Incoming;
2517 } else {
2518 if (!Brackets)
2519 Brackets = std::make_unique<WaitcntBrackets>(
2520 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2521 SmemAccessCounter);
2522 else
2523 *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
2524 WaitEventMaskForInst, SmemAccessCounter);
2525 }
2526
2527 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2528 BI.Dirty = false;
2529
2530 if (Brackets->hasPendingEvent()) {
2531 BlockInfo *MoveBracketsToSucc = nullptr;
2532 for (MachineBasicBlock *Succ : MBB->successors()) {
2533 auto SuccBII = BlockInfos.find(Succ);
2534 BlockInfo &SuccBI = SuccBII->second;
2535 if (!SuccBI.Incoming) {
2536 SuccBI.Dirty = true;
2537 if (SuccBII <= BII)
2538 Repeat = true;
2539 if (!MoveBracketsToSucc) {
2540 MoveBracketsToSucc = &SuccBI;
2541 } else {
2542 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2543 }
2544 } else if (SuccBI.Incoming->merge(*Brackets)) {
2545 SuccBI.Dirty = true;
2546 if (SuccBII <= BII)
2547 Repeat = true;
2548 }
2549 }
2550 if (MoveBracketsToSucc)
2551 MoveBracketsToSucc->Incoming = std::move(Brackets);
2552 }
2553 }
2554 } while (Repeat);
2555
2556 if (ST->hasScalarStores()) {
2557 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2558 bool HaveScalarStores = false;
2559
2560 for (MachineBasicBlock &MBB : MF) {
2561 for (MachineInstr &MI : MBB) {
2562 if (!HaveScalarStores && TII->isScalarStore(MI))
2563 HaveScalarStores = true;
2564
2565 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2566 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2567 EndPgmBlocks.push_back(&MBB);
2568 }
2569 }
2570
2571 if (HaveScalarStores) {
2572 // If scalar writes are used, the cache must be flushed or else the next
2573 // wave to reuse the same scratch memory can be clobbered.
2574 //
2575 // Insert s_dcache_wb at wave termination points if there were any scalar
2576 // stores, and only if the cache hasn't already been flushed. This could
2577 // be improved by looking across blocks for flushes in postdominating
2578 // blocks from the stores but an explicitly requested flush is probably
2579 // very rare.
2580 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2581 bool SeenDCacheWB = false;
2582
2583 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2584 I != E; ++I) {
2585 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2586 SeenDCacheWB = true;
2587 else if (TII->isScalarStore(*I))
2588 SeenDCacheWB = false;
2589
2590 // FIXME: It would be better to insert this before a waitcnt if any.
2591 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2592 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2593 !SeenDCacheWB) {
2594 Modified = true;
2595 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2596 }
2597 }
2598 }
2599 }
2600 }
2601
2602 // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2603 // instructions.
2604 for (MachineInstr *MI : ReleaseVGPRInsts) {
2605 if (ST->requiresNopBeforeDeallocVGPRs()) {
2606 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP))
2607 .addImm(0);
2608 }
2609 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2610 TII->get(AMDGPU::S_SENDMSG))
2611 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2612 Modified = true;
2613 }
2614 ReleaseVGPRInsts.clear();
2615
2616 return Modified;
2617 }
2618