10b57cec5SDimitry Andric //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric /// \file
100b57cec5SDimitry Andric /// Insert wait instructions for memory reads and writes.
110b57cec5SDimitry Andric ///
120b57cec5SDimitry Andric /// Memory reads and writes are issued asynchronously, so we need to insert
130b57cec5SDimitry Andric /// S_WAITCNT instructions when we want to access any of their results or
140b57cec5SDimitry Andric /// overwrite any register that's used asynchronously.
150b57cec5SDimitry Andric ///
160b57cec5SDimitry Andric /// TODO: This pass currently keeps one timeline per hardware counter. A more
170b57cec5SDimitry Andric /// finely-grained approach that keeps one timeline per event type could
180b57cec5SDimitry Andric /// sometimes get away with generating weaker s_waitcnt instructions. For
190b57cec5SDimitry Andric /// example, when both SMEM and LDS are in flight and we need to wait for
200b57cec5SDimitry Andric /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
210b57cec5SDimitry Andric /// but the pass will currently generate a conservative lgkmcnt(0) because
220b57cec5SDimitry Andric /// multiple event types are in flight.
230b57cec5SDimitry Andric //
240b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
250b57cec5SDimitry Andric
260b57cec5SDimitry Andric #include "AMDGPU.h"
27e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
28e8d8bef9SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
290b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
30fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
315ffd83dbSDimitry Andric #include "llvm/ADT/MapVector.h"
320b57cec5SDimitry Andric #include "llvm/ADT/PostOrderIterator.h"
330eae32dcSDimitry Andric #include "llvm/ADT/Sequence.h"
347a6dacacSDimitry Andric #include "llvm/Analysis/AliasAnalysis.h"
3581ad6265SDimitry Andric #include "llvm/CodeGen/MachineLoopInfo.h"
36480093f4SDimitry Andric #include "llvm/CodeGen/MachinePostDominators.h"
37480093f4SDimitry Andric #include "llvm/InitializePasses.h"
380b57cec5SDimitry Andric #include "llvm/Support/DebugCounter.h"
3906c3fb27SDimitry Andric #include "llvm/TargetParser/TargetParser.h"
400b57cec5SDimitry Andric using namespace llvm;
410b57cec5SDimitry Andric
420b57cec5SDimitry Andric #define DEBUG_TYPE "si-insert-waitcnts"
430b57cec5SDimitry Andric
// Debug counters and a hidden flag that force emission of zero-waits for the
// individual hardware counters; used to bisect/triage waitcnt-related bugs.
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<bool> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(false), cl::Hidden);
550b57cec5SDimitry Andric
560b57cec5SDimitry Andric namespace {
// Class of object that encapsulates the latest instruction counter score
// associated with the operand. Used for determining whether an
// s_waitcnt instruction needs to be emitted.
600b57cec5SDimitry Andric
// The hardware counters tracked by this pass. The first four exist on all
// targets; SAMPLE_CNT, BVH_CNT and KM_CNT belong to gfx12's extended
// wait-count scheme.
enum InstCounterType {
  LOAD_CNT = 0, // VMcnt prior to gfx12.
  DS_CNT,       // LGKMcnt prior to gfx12.
  EXP_CNT,      //
  STORE_CNT,    // VScnt in gfx10/gfx11.
  NUM_NORMAL_INST_CNTS,
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
  BVH_CNT,                           // gfx12+ only.
  KM_CNT,                            // gfx12+ only.
  NUM_EXTENDED_INST_CNTS,
  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};
730eae32dcSDimitry Andric } // namespace
740b57cec5SDimitry Andric
namespace llvm {
// Opt InstCounterType into llvm::enum_seq so ranges of counters can be
// iterated (see inst_counter_types() below).
template <> struct enum_iteration_traits<InstCounterType> {
  static constexpr bool is_iterable = true;
};
} // namespace llvm
800eae32dcSDimitry Andric
810eae32dcSDimitry Andric namespace {
827a6dacacSDimitry Andric // Return an iterator over all counters between LOAD_CNT (the first counter)
837a6dacacSDimitry Andric // and \c MaxCounter (exclusive, default value yields an enumeration over
847a6dacacSDimitry Andric // all counters).
inst_counter_types(InstCounterType MaxCounter=NUM_INST_CNTS)857a6dacacSDimitry Andric auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
867a6dacacSDimitry Andric return enum_seq(LOAD_CNT, MaxCounter);
877a6dacacSDimitry Andric }
880b57cec5SDimitry Andric
895ffd83dbSDimitry Andric using RegInterval = std::pair<int, int>;
900b57cec5SDimitry Andric
// Maximum encodable value for each wait counter on the current subtarget.
struct HardwareLimits {
  unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
  unsigned ExpcntMax;
  unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
  unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
  unsigned SamplecntMax; // gfx12+ only.
  unsigned BvhcntMax;    // gfx12+ only.
  unsigned KmcntMax;     // gfx12+ only.
};
1000b57cec5SDimitry Andric
// Hardware register encodings used to map machine registers onto scoring
// table slots. The naming suggests VGPR0/SGPR0 are the first and VGPRL/SGPRL
// the last encodings of each file — TODO confirm at the initialization site.
struct RegisterEncoding {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
};
1070b57cec5SDimitry Andric
// Events that can put an operation in flight on one of the hardware counters.
// Each event maps to exactly one InstCounterType via the wait-event masks
// (see eventCounter()).
enum WaitEventType {
  VMEM_ACCESS,              // vector-memory read & write
  VMEM_READ_ACCESS,         // vector-memory read
  VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
  VMEM_BVH_READ_ACCESS,     // vector-memory BVH read (gfx12+ only)
  VMEM_WRITE_ACCESS,        // vector-memory write that is not scratch
  SCRATCH_WRITE_ACCESS,     // vector-memory write that may be scratch
  LDS_ACCESS,               // lds read & write
  GDS_ACCESS,               // gds read & write
  SQ_MESSAGE,               // send message
  SMEM_ACCESS,              // scalar-memory read & write
  EXP_GPR_LOCK,             // export holding on its data src
  GDS_GPR_LOCK,             // GDS holding on its data and addr src
  EXP_POS_ACCESS,           // write to export position
  EXP_PARAM_ACCESS,         // write to export parameter
  VMW_GPR_LOCK,             // vector-memory write holding on its data src
  EXP_LDS_ACCESS,           // read by ldsdir counting as export
  NUM_WAIT_EVENTS,
};
1270b57cec5SDimitry Andric
// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
  AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 9,    // Reserved slots for DS.
  // Artificial register slots to track LDS writes into specific LDS locations
  // if a location is known. When slots are exhausted or location is
  // unknown use the first slot. The first slot is also always updated in
  // addition to known location's slot to properly generate waits if dependent
  // instruction's location is unknown.
  EXTRA_VGPR_LDS = 0, // Offset (from SQ_MAX_PGM_VGPRS) of the first LDS slot.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
1470b57cec5SDimitry Andric
// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  // Number of VmemTypes (used to size per-vgpr type bitmasks).
  NUM_VMEM_TYPES
};
1625ffd83dbSDimitry Andric
// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true.
// NOTE: indexed positionally by InstCounterType — the element order must
// match the enum (LOAD_CNT .. KM_CNT).
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
    AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
    AMDGPU::S_WAIT_KMCNT};
1707a6dacacSDimitry Andric
updateVMCntOnly(const MachineInstr & Inst)171bdd1243dSDimitry Andric static bool updateVMCntOnly(const MachineInstr &Inst) {
172bdd1243dSDimitry Andric return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
173bdd1243dSDimitry Andric SIInstrInfo::isFLATScratch(Inst);
174bdd1243dSDimitry Andric }
175bdd1243dSDimitry Andric
#ifndef NDEBUG
// Debug-build-only helper: true when only the pre-gfx12 ("normal") counters
// are in use, i.e. the extended gfx12 counters are not being tracked.
static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;
}
#endif // NDEBUG
1817a6dacacSDimitry Andric
getVmemType(const MachineInstr & Inst)1825ffd83dbSDimitry Andric VmemType getVmemType(const MachineInstr &Inst) {
183bdd1243dSDimitry Andric assert(updateVMCntOnly(Inst));
1847a6dacacSDimitry Andric if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
1857a6dacacSDimitry Andric !SIInstrInfo::isVSAMPLE(Inst))
1865ffd83dbSDimitry Andric return VMEM_NOSAMPLER;
1875ffd83dbSDimitry Andric const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
1884824e7fdSDimitry Andric const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1894824e7fdSDimitry Andric AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
1900fca6ea1SDimitry Andric // We have to make an additional check for isVSAMPLE here since some
1910fca6ea1SDimitry Andric // instructions don't have a sampler, but are still classified as sampler
1920fca6ea1SDimitry Andric // instructions for the purposes of e.g. waitcnt.
1934824e7fdSDimitry Andric return BaseInfo->BVH ? VMEM_BVH
1940fca6ea1SDimitry Andric : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER
1950fca6ea1SDimitry Andric : VMEM_NOSAMPLER;
1965ffd83dbSDimitry Andric }
1975ffd83dbSDimitry Andric
// Return a mutable reference to the field of \p Wait that holds the count
// for counter \p T. Aborts on counter kinds with no Waitcnt field.
unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  switch (T) {
  case LOAD_CNT:
    return Wait.LoadCnt;
  case EXP_CNT:
    return Wait.ExpCnt;
  case DS_CNT:
    return Wait.DsCnt;
  case STORE_CNT:
    return Wait.StoreCnt;
  case SAMPLE_CNT:
    return Wait.SampleCnt;
  case BVH_CNT:
    return Wait.BvhCnt;
  case KM_CNT:
    return Wait.KmCnt;
  default:
    llvm_unreachable("bad InstCounterType");
  }
}
2180b57cec5SDimitry Andric
addWait(AMDGPU::Waitcnt & Wait,InstCounterType T,unsigned Count)2197a6dacacSDimitry Andric void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
2207a6dacacSDimitry Andric unsigned &WC = getCounterRef(Wait, T);
2217a6dacacSDimitry Andric WC = std::min(WC, Count);
2227a6dacacSDimitry Andric }
2237a6dacacSDimitry Andric
// Mark counter \p T as requiring no wait (all-ones sentinel value).
void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  getCounterRef(Wait, T) = ~0u;
}
2277a6dacacSDimitry Andric
// Read the current wait value recorded for counter \p T in \p Wait.
unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  return getCounterRef(Wait, T);
}
2317a6dacacSDimitry Andric
2327a6dacacSDimitry Andric // Mapping from event to counter according to the table masks.
eventCounter(const unsigned * masks,WaitEventType E)2337a6dacacSDimitry Andric InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
2347a6dacacSDimitry Andric for (auto T : inst_counter_types()) {
2357a6dacacSDimitry Andric if (masks[T] & (1 << E))
2367a6dacacSDimitry Andric return T;
2377a6dacacSDimitry Andric }
2387a6dacacSDimitry Andric llvm_unreachable("event type has no associated counter");
2397a6dacacSDimitry Andric }
2407a6dacacSDimitry Andric
// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change the
// waitcnt, in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen within the brackets, the
// wait count may get decreased out of order, therefore we need to put in an
// "s_waitcnt 0" before use.
class WaitcntBrackets {
public:
  WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
                  HardwareLimits Limits, RegisterEncoding Encoding,
                  const unsigned *WaitEventMaskForInst,
                  InstCounterType SmemAccessCounter)
      : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
        Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
        SmemAccessCounter(SmemAccessCounter) {}

  // Return the architectural maximum value of counter \p T from the
  // HardwareLimits, or 0 for counter kinds with no limit entry.
  unsigned getWaitCountMax(InstCounterType T) const {
    switch (T) {
    case LOAD_CNT:
      return Limits.LoadcntMax;
    case DS_CNT:
      return Limits.DscntMax;
    case EXP_CNT:
      return Limits.ExpcntMax;
    case STORE_CNT:
      return Limits.StorecntMax;
    case SAMPLE_CNT:
      return Limits.SamplecntMax;
    case BVH_CNT:
      return Limits.BvhcntMax;
    case KM_CNT:
      return Limits.KmcntMax;
    default:
      break;
    }
    return 0;
  }

  // Lower bound of the score bracket for counter \p T.
  unsigned getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreLBs[T];
  }

  // Upper bound of the score bracket for counter \p T.
  unsigned getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreUBs[T];
  }

  // Width of the bracket for counter \p T; zero means nothing in flight.
  unsigned getScoreRange(InstCounterType T) const {
    return getScoreUB(T) - getScoreLB(T);
  }

  // Score recorded for register slot \p GprNo on counter \p T. SGPR slots
  // (GprNo >= NUM_ALL_VGPRS) are only tracked for SmemAccessCounter.
  unsigned getRegScore(int GprNo, InstCounterType T) const {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    assert(T == SmemAccessCounter);
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  bool merge(const WaitcntBrackets &Other);

  RegInterval getRegInterval(const MachineInstr *MI,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo) const;

  bool counterOutOfOrder(InstCounterType T) const;
  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  // Nonzero if any event at all is outstanding.
  unsigned hasPendingEvent() const { return PendingEvents; }
  // Nonzero if event \p E specifically is outstanding.
  unsigned hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }
  // Nonzero if any event charged to counter \p T is outstanding. The
  // pending-event bits and the score bracket must agree on emptiness.
  unsigned hasPendingEvent(InstCounterType T) const {
    unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
    assert((HasPending != 0) == (getScoreRange(T) != 0));
    return HasPending;
  }

  bool hasMixedPendingEvents(InstCounterType T) const {
    unsigned Events = hasPendingEvent(T);
    // Return true if more than one bit is set in Events.
    return Events & (Events - 1);
  }

  // True if the last recorded flat memory operation is still inside either
  // the DS_CNT or the LOAD_CNT bracket (i.e. may not have completed).
  bool hasPendingFlat() const {
    return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
             LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
            (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
             LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
  }

  // Record that a flat memory operation was just issued, on both counters
  // a flat access can increment.
  void setPendingFlat() {
    LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
    LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
  }

  // Return true if there might be pending writes to the specified vgpr by VMEM
  // instructions with types different from V.
  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
    assert(GprNo < NUM_ALL_VGPRS);
    return VgprVmemTypes[GprNo] & ~(1 << V);
  }

  // Forget all pending VMEM types for vgpr slot \p GprNo.
  void clearVgprVmemTypes(int GprNo) {
    assert(GprNo < NUM_ALL_VGPRS);
    VgprVmemTypes[GprNo] = 0;
  }

  // Conservatively model function entry/return: widen the STORE_CNT bracket
  // by its full hardware maximum and mark all store events pending.
  void setStateOnFunctionEntryOrReturn() {
    setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
    PendingEvents |= WaitEventMaskForInst[STORE_CNT];
  }

  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
    return LDSDMAStores;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  // Precomputed shifts/bases used by mergeScore() when joining brackets.
  struct MergeInfo {
    unsigned OldLB;
    unsigned OtherLB;
    unsigned MyShift;
    unsigned OtherShift;
  };
  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);

  void setScoreLB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreUBs[T] = Val;

    if (T != EXP_CNT)
      return;

    // Keep the EXP_CNT bracket no wider than the maximum encodable wait by
    // advancing the lower bound along with the upper bound.
    if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
  }

  // Record score \p Val for slot \p GprNo on counter \p T, maintaining the
  // VgprUB/SgprUB high-water marks used to bound merge() work.
  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      VgprUB = std::max(VgprUB, GprNo);
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == SmemAccessCounter);
      SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, unsigned Val);

  const GCNSubtarget *ST = nullptr;
  InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
  HardwareLimits Limits = {};
  RegisterEncoding Encoding = {};
  const unsigned *WaitEventMaskForInst;
  InstCounterType SmemAccessCounter;
  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  unsigned PendingEvents = 0;
  // Remember the last flat memory operation.
  unsigned LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int VgprUB = -1;
  int SgprUB = -1;
  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
  // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
  // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
  // write to each vgpr.
  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
  // Store representative LDS DMA operations. The only useful info here is
  // alias info. One store is kept per unique AAInfo.
  SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
};
4387a6dacacSDimitry Andric
4397a6dacacSDimitry Andric // This abstracts the logic for generating and updating S_WAIT* instructions
4407a6dacacSDimitry Andric // away from the analysis that determines where they are needed. This was
4417a6dacacSDimitry Andric // done because the set of counters and instructions for waiting on them
4427a6dacacSDimitry Andric // underwent a major shift with gfx12, sufficiently so that having this
4437a6dacacSDimitry Andric // abstraction allows the main analysis logic to be simpler than it would
4447a6dacacSDimitry Andric // otherwise have had to become.
class WaitcntGenerator {
protected:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  AMDGPU::IsaVersion IV;
  InstCounterType MaxCounter;
  bool OptNone;

public:
  WaitcntGenerator() = default;
  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
      : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
        IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
        OptNone(MF.getFunction().hasOptNone() ||
                MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}

  // Return true if the current function should be compiled with no
  // optimization.
  bool isOptNone() const { return OptNone; }

  // Edits an existing sequence of wait count instructions according
  // to an incoming Waitcnt value, which is itself updated to reflect
  // any new wait count instructions which may need to be generated by
  // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
  // were made.
  //
  // This editing will usually merely update operands, but it may also
  // delete instructions if the incoming Wait value indicates they are not
  // needed. It may also remove existing instructions for which a wait
  // is needed if it can be determined that it is better to generate new
  // instructions later, as can happen on gfx12.
  virtual bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const = 0;

  // Transform a soft waitcnt into a normal one.
  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;

  // Generates new wait count instructions according to the value of
  // Wait, returning true if any new instructions were created.
  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                MachineBasicBlock::instr_iterator It,
                                AMDGPU::Waitcnt Wait) = 0;

  // Returns an array of bit masks which can be used to map values in
  // WaitEventType to corresponding counter values in InstCounterType.
  virtual const unsigned *getWaitEventMask() const = 0;

  // Returns a new waitcnt with all counters except VScnt set to 0. If
  // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;

  virtual ~WaitcntGenerator() = default;

  // Create a mask value from the initializer list of wait event types.
  static constexpr unsigned
  eventMask(std::initializer_list<WaitEventType> Events) {
    unsigned Mask = 0;
    for (auto &E : Events)
      Mask |= 1 << E;

    return Mask;
  }
};
5107a6dacacSDimitry Andric
// Waitcnt generator for subtargets before GFX12, which use the legacy
// VM/LGKM/EXP (and optionally VS) counter encoding.
class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
public:
  WaitcntGeneratorPreGFX12() = default;
  WaitcntGeneratorPreGFX12(const MachineFunction &MF)
      : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}

  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait) override;

  const unsigned *getWaitEventMask() const override {
    assert(ST);

    // Indexed by InstCounterType. Before GFX12, sampler and BVH reads share a
    // counter with other VMEM reads, and SMEM shares a counter with
    // LDS/GDS/messages; the three trailing entries correspond to counters
    // that only exist with extended wait counts and are therefore zero here.
    static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
                   VMEM_BVH_READ_ACCESS}),
        eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
        0,
        0,
        0};

    return WaitEventMaskForInstPreGFX12;
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
5457a6dacacSDimitry Andric
// Waitcnt generator for GFX12 and later, which split the legacy counters into
// separate LOAD/DS/EXP/STORE/SAMPLE/BVH/KM counters.
class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
public:
  WaitcntGeneratorGFX12Plus() = default;
  WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
                            InstCounterType MaxCounter)
      : WaitcntGenerator(MF, MaxCounter) {}

  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait) override;

  const unsigned *getWaitEventMask() const override {
    assert(ST);

    // Indexed by InstCounterType. Unlike pre-GFX12, sampler reads, BVH reads
    // and SMEM accesses each map to their own dedicated counter.
    static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
        eventMask({LDS_ACCESS, GDS_ACCESS}),
        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
        eventMask({VMEM_SAMPLER_READ_ACCESS}),
        eventMask({VMEM_BVH_READ_ACCESS}),
        eventMask({SMEM_ACCESS, SQ_MESSAGE})};

    return WaitEventMaskForInstGFX12Plus;
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
5800b57cec5SDimitry Andric
// Machine function pass that inserts the s_waitcnt instructions described in
// the file header comment.
class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;

  // Maps pointer Values to the block containing an s_load from them —
  // NOTE(review): exact use is established by code outside this view; confirm
  // against the pass body.
  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
  // Caches, per loop preheader, whether vmcnt should be flushed there (see
  // isPreheaderToFlush below).
  DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
  MachineLoopInfo *MLI;
  MachinePostDominatorTree *PDT;
  // Optional alias analysis; may remain null (addUsedIfAvailable below).
  AliasAnalysis *AA = nullptr;

  // Per-block dataflow state: brackets on entry to the block, and whether the
  // block still needs (re)processing.
  struct BlockInfo {
    std::unique_ptr<WaitcntBrackets> Incoming;
    bool Dirty = true;
  };

  // The counter on which SMEM accesses are tracked; differs between pre-GFX12
  // and GFX12+ (see the two getWaitEventMask implementations above).
  InstCounterType SmemAccessCounter;

  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;

  // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
  // because of amdgpu-waitcnt-forcezero flag
  bool ForceEmitZeroWaitcnts;
  // Per-counter override used by the debug counters in setForceEmitWaitcnt().
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

  // In any given run of this pass, WCG will point to one of these two
  // generator objects, which must have been re-initialised before use
  // from a value made using a subtarget constructor.
  WaitcntGeneratorPreGFX12 WCGPreGFX12;
  WaitcntGeneratorGFX12Plus WCGGFX12Plus;

  WaitcntGenerator *WCG = nullptr;

  // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
  // message.
  DenseSet<MachineInstr *> ReleaseVGPRInsts;

  // Highest counter in use for the current subtarget; extended on GFX12+.
  InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    // Reference the debug counters so release builds (where
    // setForceEmitWaitcnt() compiles them out) emit no unused warnings.
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
  bool isPreheaderToFlush(MachineBasicBlock &MBB,
                          WaitcntBrackets &ScoreBrackets);
  bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // This pass only inserts instructions; the CFG is untouched.
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    // AA is optional: used when available, preserved either way.
    AU.addUsedIfAvailable<AAResultsWrapperPass>();
    AU.addPreserved<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // True if any counter is being forced by a debug counter.
  bool isForceEmitWaitcnt() const {
    for (auto T : inst_counter_types())
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
    // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
    // For debug builds, get the debug counter info and adjust if need be
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    // The legacy LGKM counter covers both DS and KM on extended targets.
    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[DS_CNT] = true;
      ForceEmitWaitcnt[KM_CNT] = true;
    } else {
      ForceEmitWaitcnt[DS_CNT] = false;
      ForceEmitWaitcnt[KM_CNT] = false;
    }

    // The legacy VM counter covers LOAD, SAMPLE and BVH on extended targets.
    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[LOAD_CNT] = true;
      ForceEmitWaitcnt[SAMPLE_CNT] = true;
      ForceEmitWaitcnt[BVH_CNT] = true;
    } else {
      ForceEmitWaitcnt[LOAD_CNT] = false;
      ForceEmitWaitcnt[SAMPLE_CNT] = false;
      ForceEmitWaitcnt[BVH_CNT] = false;
    }
#endif // NDEBUG
  }

  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
  // FLAT instruction.
  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
    // Maps VMEM access types to their corresponding WaitEventType.
    static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
        VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};

    assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
    // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
    // these should use VM_CNT.
    if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
      return VMEM_ACCESS;
    // Stores (including non-returning atomics) count on the store counter.
    if (Inst.mayStore() &&
        (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
      // FLAT and SCRATCH instructions may access scratch. Other VMEM
      // instructions do not.
      if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
        return SCRATCH_WRITE_ACCESS;
      return VMEM_WRITE_ACCESS;
    }
    // Without extended counters (or for FLAT) all reads share one event type.
    if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
      return VMEM_READ_ACCESS;
    return VmemReadMapping[getVmemType(Inst)];
  }

  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
  bool generateWaitcntInstBefore(MachineInstr &MI,
                                 WaitcntBrackets &ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr,
                                 bool FlushVmCnt);
  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
                       MachineBasicBlock::instr_iterator It,
                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
                       MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               WaitcntBrackets *ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
};
7310b57cec5SDimitry Andric
7320b57cec5SDimitry Andric } // end anonymous namespace
7330b57cec5SDimitry Andric
getRegInterval(const MachineInstr * MI,const MachineRegisterInfo * MRI,const SIRegisterInfo * TRI,unsigned OpNo) const7340b57cec5SDimitry Andric RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
7350b57cec5SDimitry Andric const MachineRegisterInfo *MRI,
7360b57cec5SDimitry Andric const SIRegisterInfo *TRI,
7375ffd83dbSDimitry Andric unsigned OpNo) const {
7380b57cec5SDimitry Andric const MachineOperand &Op = MI->getOperand(OpNo);
739fe6060f1SDimitry Andric if (!TRI->isInAllocatableClass(Op.getReg()))
7400b57cec5SDimitry Andric return {-1, -1};
7410b57cec5SDimitry Andric
7420b57cec5SDimitry Andric // A use via a PW operand does not need a waitcnt.
7430b57cec5SDimitry Andric // A partial write is not a WAW.
7440b57cec5SDimitry Andric assert(!Op.getSubReg() || !Op.isUndef());
7450b57cec5SDimitry Andric
7460b57cec5SDimitry Andric RegInterval Result;
7470b57cec5SDimitry Andric
7485f757f3fSDimitry Andric unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
7495f757f3fSDimitry Andric AMDGPU::HWEncoding::REG_IDX_MASK;
7500b57cec5SDimitry Andric
751fe6060f1SDimitry Andric if (TRI->isVectorRegister(*MRI, Op.getReg())) {
7520eae32dcSDimitry Andric assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
7530eae32dcSDimitry Andric Result.first = Reg - Encoding.VGPR0;
754fe6060f1SDimitry Andric if (TRI->isAGPR(*MRI, Op.getReg()))
755fe6060f1SDimitry Andric Result.first += AGPR_OFFSET;
7560b57cec5SDimitry Andric assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
7575ffd83dbSDimitry Andric } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
7580eae32dcSDimitry Andric assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
7590eae32dcSDimitry Andric Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
7600b57cec5SDimitry Andric assert(Result.first >= NUM_ALL_VGPRS &&
7610b57cec5SDimitry Andric Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
7620b57cec5SDimitry Andric }
7630b57cec5SDimitry Andric // TODO: Handle TTMP
7645ffd83dbSDimitry Andric // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
7650b57cec5SDimitry Andric else
7660b57cec5SDimitry Andric return {-1, -1};
7670b57cec5SDimitry Andric
768cb14a3feSDimitry Andric const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
7690b57cec5SDimitry Andric unsigned Size = TRI->getRegSizeInBits(*RC);
7705ffd83dbSDimitry Andric Result.second = Result.first + ((Size + 16) / 32);
7710b57cec5SDimitry Andric
7720b57cec5SDimitry Andric return Result;
7730b57cec5SDimitry Andric }
7740b57cec5SDimitry Andric
setExpScore(const MachineInstr * MI,const SIInstrInfo * TII,const SIRegisterInfo * TRI,const MachineRegisterInfo * MRI,unsigned OpNo,unsigned Val)7750b57cec5SDimitry Andric void WaitcntBrackets::setExpScore(const MachineInstr *MI,
7760b57cec5SDimitry Andric const SIInstrInfo *TII,
7770b57cec5SDimitry Andric const SIRegisterInfo *TRI,
7780b57cec5SDimitry Andric const MachineRegisterInfo *MRI, unsigned OpNo,
7795ffd83dbSDimitry Andric unsigned Val) {
780cb14a3feSDimitry Andric RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
781fe6060f1SDimitry Andric assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
7825ffd83dbSDimitry Andric for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
7830b57cec5SDimitry Andric setRegScore(RegNo, EXP_CNT, Val);
7840b57cec5SDimitry Andric }
7850b57cec5SDimitry Andric }
7860b57cec5SDimitry Andric
// Record the occurrence of wait event E caused by Inst: advance the upper
// bound of the corresponding counter and attach the new score to the
// registers the event protects.
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                    const SIRegisterInfo *TRI,
                                    const MachineRegisterInfo *MRI,
                                    WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = eventCounter(WaitEventMaskForInst, E);

  // The new score is one past the current upper bound of counter T.
  unsigned UB = getScoreUB(T);
  unsigned CurrScore = UB + 1;
  if (CurrScore == 0)
    report_fatal_error("InsertWaitcnt score wraparound");
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not.
  // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
  PendingEvents |= 1 << E;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      int AddrOpIdx =
          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
      // All GDS operations must protect their address register (same as
      // export.)
      if (AddrOpIdx != -1) {
        setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
      }

      if (Inst.mayStore()) {
        // Stores protect their data operand(s).
        if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
          setExpScore(
              &Inst, TII, TRI, MRI,
              AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
              CurrScore);
        }
        if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        // Other returning DS atomics protect every vector register they use.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() &&
              TRI->isVectorRegister(*MRI, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isLDSDIR(Inst)) {
      // LDSDIR instructions attach the score to the destination.
      setExpScore(
          &Inst, TII, TRI, MRI,
          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
          CurrScore);
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(*MRI, DefMO.getReg())) {
            setRegScore(
                TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
                EXP_CNT, CurrScore);
          }
        }
      }
      // All other EXP-counted instructions protect their vector register uses.
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() &&
            TRI->isVectorRegister(*MRI, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
  } else /* LOAD_CNT || DS_CNT || STORE_CNT || SAMPLE_CNT || BVH_CNT ||
            KM_CNT */ {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      auto &Op = Inst.getOperand(I);
      if (!Op.isReg() || !Op.isDef())
        continue;
      RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
      if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
        // Only VGPR destinations are tracked for these counters.
        if (Interval.first >= NUM_ALL_VGPRS)
          continue;
        if (updateVMCntOnly(Inst)) {
          // updateVMCntOnly should only leave us with VGPRs
          // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
          // defs. That's required for a sane index into `VgprMemTypes` below
          assert(TRI->isVectorRegister(*MRI, Op.getReg()));
          // Remember which kind(s) of VMEM read last wrote each VGPR slot.
          VmemType V = getVmemType(Inst);
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
            VgprVmemTypes[RegNo] |= 1 << V;
        }
      }
      for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (Inst.mayStore() &&
        (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
      // written can be accessed. A load from LDS to VMEM does not need a wait.
      unsigned Slot = 0;
      for (const auto *MemOp : Inst.memoperands()) {
        if (!MemOp->isStore() ||
            MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        // Comparing just AA info does not guarantee memoperands are equal
        // in general, but this is so for LDS DMA in practice.
        auto AAI = MemOp->getAAInfo();
        // Alias scope information gives a way to definitely identify an
        // original memory object and practically produced in the module LDS
        // lowering pass. If there is no scope available we will not be able
        // to disambiguate LDS aliasing as after the module lowering all LDS
        // is squashed into a single big object. Do not attempt to use one of
        // the limited LDSDMAStores for something we will not be able to use
        // anyway.
        if (!AAI || !AAI.Scope)
          break;
        // Reuse the slot of a previously recorded store with matching AA
        // info; otherwise allocate a new slot if any remain.
        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
              Slot = I + 1;
              break;
            }
          }
        }
        if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
          break;
        LDSDMAStores.push_back(&Inst);
        Slot = LDSDMAStores.size();
        break;
      }
      // Slot 0 is the shared catch-all; a per-store slot also bumps it so a
      // generic LDS wait still covers this store.
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
      if (Slot)
        setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}
9690b57cec5SDimitry Andric
print(raw_ostream & OS)9700b57cec5SDimitry Andric void WaitcntBrackets::print(raw_ostream &OS) {
9710b57cec5SDimitry Andric OS << '\n';
9727a6dacacSDimitry Andric for (auto T : inst_counter_types(MaxCounter)) {
973bdd1243dSDimitry Andric unsigned SR = getScoreRange(T);
9740b57cec5SDimitry Andric
9750b57cec5SDimitry Andric switch (T) {
9767a6dacacSDimitry Andric case LOAD_CNT:
9777a6dacacSDimitry Andric OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
9787a6dacacSDimitry Andric << SR << "): ";
9790b57cec5SDimitry Andric break;
9807a6dacacSDimitry Andric case DS_CNT:
9817a6dacacSDimitry Andric OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
9827a6dacacSDimitry Andric << SR << "): ";
9830b57cec5SDimitry Andric break;
9840b57cec5SDimitry Andric case EXP_CNT:
985bdd1243dSDimitry Andric OS << " EXP_CNT(" << SR << "): ";
9860b57cec5SDimitry Andric break;
9877a6dacacSDimitry Andric case STORE_CNT:
9887a6dacacSDimitry Andric OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
9897a6dacacSDimitry Andric << SR << "): ";
9907a6dacacSDimitry Andric break;
9917a6dacacSDimitry Andric case SAMPLE_CNT:
9927a6dacacSDimitry Andric OS << " SAMPLE_CNT(" << SR << "): ";
9937a6dacacSDimitry Andric break;
9947a6dacacSDimitry Andric case BVH_CNT:
9957a6dacacSDimitry Andric OS << " BVH_CNT(" << SR << "): ";
9967a6dacacSDimitry Andric break;
9977a6dacacSDimitry Andric case KM_CNT:
9987a6dacacSDimitry Andric OS << " KM_CNT(" << SR << "): ";
9990b57cec5SDimitry Andric break;
10000b57cec5SDimitry Andric default:
1001bdd1243dSDimitry Andric OS << " UNKNOWN(" << SR << "): ";
10020b57cec5SDimitry Andric break;
10030b57cec5SDimitry Andric }
10040b57cec5SDimitry Andric
1005bdd1243dSDimitry Andric if (SR != 0) {
10060b57cec5SDimitry Andric // Print vgpr scores.
1007bdd1243dSDimitry Andric unsigned LB = getScoreLB(T);
1008bdd1243dSDimitry Andric
10095ffd83dbSDimitry Andric for (int J = 0; J <= VgprUB; J++) {
10105ffd83dbSDimitry Andric unsigned RegScore = getRegScore(J, T);
10110b57cec5SDimitry Andric if (RegScore <= LB)
10120b57cec5SDimitry Andric continue;
10135ffd83dbSDimitry Andric unsigned RelScore = RegScore - LB - 1;
10140b57cec5SDimitry Andric if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
10150b57cec5SDimitry Andric OS << RelScore << ":v" << J << " ";
10160b57cec5SDimitry Andric } else {
10170b57cec5SDimitry Andric OS << RelScore << ":ds ";
10180b57cec5SDimitry Andric }
10190b57cec5SDimitry Andric }
10200b57cec5SDimitry Andric // Also need to print sgpr scores for lgkm_cnt.
10217a6dacacSDimitry Andric if (T == SmemAccessCounter) {
10225ffd83dbSDimitry Andric for (int J = 0; J <= SgprUB; J++) {
10237a6dacacSDimitry Andric unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
10240b57cec5SDimitry Andric if (RegScore <= LB)
10250b57cec5SDimitry Andric continue;
10265ffd83dbSDimitry Andric unsigned RelScore = RegScore - LB - 1;
10270b57cec5SDimitry Andric OS << RelScore << ":s" << J << " ";
10280b57cec5SDimitry Andric }
10290b57cec5SDimitry Andric }
10300b57cec5SDimitry Andric }
10310b57cec5SDimitry Andric OS << '\n';
10320b57cec5SDimitry Andric }
10330b57cec5SDimitry Andric OS << '\n';
10340b57cec5SDimitry Andric }
10350b57cec5SDimitry Andric
/// Simplify the waitcnt, in the sense of removing redundant counts: any
/// counter field of \p Wait that waits for at least as many events as can
/// still be outstanding is reset to ~0u ("no wait"). Callers inspect the
/// remaining counts to decide whether a waitcnt instruction is needed at
/// all.
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  // Each counter field is simplified independently against its own score
  // bracket; see simplifyWaitcnt(InstCounterType, unsigned &).
  simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
  simplifyWaitcnt(DS_CNT, Wait.DsCnt);
  simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
  simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
  simplifyWaitcnt(KM_CNT, Wait.KmCnt);
}
10470b57cec5SDimitry Andric
simplifyWaitcnt(InstCounterType T,unsigned & Count) const1048fe6060f1SDimitry Andric void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
10490b57cec5SDimitry Andric unsigned &Count) const {
1050fe6060f1SDimitry Andric // The number of outstanding events for this type, T, can be calculated
1051fe6060f1SDimitry Andric // as (UB - LB). If the current Count is greater than or equal to the number
1052fe6060f1SDimitry Andric // of outstanding events, then the wait for this counter is redundant.
1053bdd1243dSDimitry Andric if (Count >= getScoreRange(T))
10540b57cec5SDimitry Andric Count = ~0u;
10550b57cec5SDimitry Andric }
10560b57cec5SDimitry Andric
determineWait(InstCounterType T,int RegNo,AMDGPU::Waitcnt & Wait) const1057bdd1243dSDimitry Andric void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
10580b57cec5SDimitry Andric AMDGPU::Waitcnt &Wait) const {
1059bdd1243dSDimitry Andric unsigned ScoreToWait = getRegScore(RegNo, T);
1060bdd1243dSDimitry Andric
10610b57cec5SDimitry Andric // If the score of src_operand falls within the bracket, we need an
10620b57cec5SDimitry Andric // s_waitcnt instruction.
10635ffd83dbSDimitry Andric const unsigned LB = getScoreLB(T);
10645ffd83dbSDimitry Andric const unsigned UB = getScoreUB(T);
10650b57cec5SDimitry Andric if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
10667a6dacacSDimitry Andric if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
10670b57cec5SDimitry Andric !ST->hasFlatLgkmVMemCountInOrder()) {
10680b57cec5SDimitry Andric // If there is a pending FLAT operation, and this is a VMem or LGKM
10690b57cec5SDimitry Andric // waitcnt and the target can report early completion, then we need
10700b57cec5SDimitry Andric // to force a waitcnt 0.
10710b57cec5SDimitry Andric addWait(Wait, T, 0);
10720b57cec5SDimitry Andric } else if (counterOutOfOrder(T)) {
10730b57cec5SDimitry Andric // Counter can get decremented out-of-order when there
10740b57cec5SDimitry Andric // are multiple types event in the bracket. Also emit an s_wait counter
10750b57cec5SDimitry Andric // with a conservative value of 0 for the counter.
10760b57cec5SDimitry Andric addWait(Wait, T, 0);
10770b57cec5SDimitry Andric } else {
1078480093f4SDimitry Andric // If a counter has been maxed out avoid overflow by waiting for
1079480093f4SDimitry Andric // MAX(CounterType) - 1 instead.
10805ffd83dbSDimitry Andric unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
1081480093f4SDimitry Andric addWait(Wait, T, NeededWait);
10820b57cec5SDimitry Andric }
10830b57cec5SDimitry Andric }
10840b57cec5SDimitry Andric }
10850b57cec5SDimitry Andric
/// Update the score brackets as if the waits in \p Wait had executed, one
/// counter at a time; see applyWaitcnt(InstCounterType, unsigned).
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(DS_CNT, Wait.DsCnt);
  applyWaitcnt(STORE_CNT, Wait.StoreCnt);
  applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  applyWaitcnt(BVH_CNT, Wait.BvhCnt);
  applyWaitcnt(KM_CNT, Wait.KmCnt);
}
10950b57cec5SDimitry Andric
applyWaitcnt(InstCounterType T,unsigned Count)10960b57cec5SDimitry Andric void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
10975ffd83dbSDimitry Andric const unsigned UB = getScoreUB(T);
10980b57cec5SDimitry Andric if (Count >= UB)
10990b57cec5SDimitry Andric return;
11000b57cec5SDimitry Andric if (Count != 0) {
11010b57cec5SDimitry Andric if (counterOutOfOrder(T))
11020b57cec5SDimitry Andric return;
11030b57cec5SDimitry Andric setScoreLB(T, std::max(getScoreLB(T), UB - Count));
11040b57cec5SDimitry Andric } else {
11050b57cec5SDimitry Andric setScoreLB(T, UB);
11060b57cec5SDimitry Andric PendingEvents &= ~WaitEventMaskForInst[T];
11070b57cec5SDimitry Andric }
11080b57cec5SDimitry Andric }
11090b57cec5SDimitry Andric
11100b57cec5SDimitry Andric // Where there are multiple types of event in the bracket of a counter,
11110b57cec5SDimitry Andric // the decrement may go out of order.
counterOutOfOrder(InstCounterType T) const11120b57cec5SDimitry Andric bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
11130b57cec5SDimitry Andric // Scalar memory read always can go out of order.
11147a6dacacSDimitry Andric if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
11150b57cec5SDimitry Andric return true;
11165ffd83dbSDimitry Andric return hasMixedPendingEvents(T);
11170b57cec5SDimitry Andric }
11180b57cec5SDimitry Andric
// Legacy pass-manager registration, including the analyses this pass
// depends on.
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

// Opaque identifier other code uses to refer to this pass.
char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

/// Factory used by the legacy pass manager to create this pass.
FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}
11330b57cec5SDimitry Andric
updateOperandIfDifferent(MachineInstr & MI,uint16_t OpName,unsigned NewEnc)1134bdd1243dSDimitry Andric static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
1135bdd1243dSDimitry Andric unsigned NewEnc) {
1136bdd1243dSDimitry Andric int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1137bdd1243dSDimitry Andric assert(OpIdx >= 0);
1138bdd1243dSDimitry Andric
1139bdd1243dSDimitry Andric MachineOperand &MO = MI.getOperand(OpIdx);
1140bdd1243dSDimitry Andric
1141bdd1243dSDimitry Andric if (NewEnc == MO.getImm())
1142bdd1243dSDimitry Andric return false;
1143bdd1243dSDimitry Andric
1144bdd1243dSDimitry Andric MO.setImm(NewEnc);
1145bdd1243dSDimitry Andric return true;
1146bdd1243dSDimitry Andric }
1147bdd1243dSDimitry Andric
11487a6dacacSDimitry Andric /// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
11497a6dacacSDimitry Andric /// and if so, which counter it is waiting on.
counterTypeForInstr(unsigned Opcode)11507a6dacacSDimitry Andric static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
11517a6dacacSDimitry Andric switch (Opcode) {
11527a6dacacSDimitry Andric case AMDGPU::S_WAIT_LOADCNT:
11537a6dacacSDimitry Andric return LOAD_CNT;
11547a6dacacSDimitry Andric case AMDGPU::S_WAIT_EXPCNT:
11557a6dacacSDimitry Andric return EXP_CNT;
11567a6dacacSDimitry Andric case AMDGPU::S_WAIT_STORECNT:
11577a6dacacSDimitry Andric return STORE_CNT;
11587a6dacacSDimitry Andric case AMDGPU::S_WAIT_SAMPLECNT:
11597a6dacacSDimitry Andric return SAMPLE_CNT;
11607a6dacacSDimitry Andric case AMDGPU::S_WAIT_BVHCNT:
11617a6dacacSDimitry Andric return BVH_CNT;
11627a6dacacSDimitry Andric case AMDGPU::S_WAIT_DSCNT:
11637a6dacacSDimitry Andric return DS_CNT;
11647a6dacacSDimitry Andric case AMDGPU::S_WAIT_KMCNT:
11657a6dacacSDimitry Andric return KM_CNT;
11667a6dacacSDimitry Andric default:
11677a6dacacSDimitry Andric return {};
11687a6dacacSDimitry Andric }
11697a6dacacSDimitry Andric }
11707a6dacacSDimitry Andric
promoteSoftWaitCnt(MachineInstr * Waitcnt) const11717a6dacacSDimitry Andric bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
11727a6dacacSDimitry Andric unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
11737a6dacacSDimitry Andric if (Opcode == Waitcnt->getOpcode())
11745f757f3fSDimitry Andric return false;
11755f757f3fSDimitry Andric
11767a6dacacSDimitry Andric Waitcnt->setDesc(TII->get(Opcode));
11775f757f3fSDimitry Andric return true;
11785f757f3fSDimitry Andric }
11795f757f3fSDimitry Andric
/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
/// from \p Wait that were added by previous passes. Currently this pass
/// conservatively assumes that these preexisting waits are required for
/// correctness.
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(ST);
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  // At most one S_WAITCNT and one S_WAITCNT_VSCNT survive the scan below;
  // duplicates are folded into these and erased.
  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;

  // Walk the preexisting wait instructions in [OldWaitcntInstr, It);
  // early_inc allows erasing the current instruction during iteration.
  for (auto &II :
       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
    if (II.isMetaInstruction())
      continue;

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.
    if (Opcode == AMDGPU::S_WAITCNT) {
      unsigned IEnc = II.getOperand(0).getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);

      // Merge consecutive waitcnt of the same type by erasing multiples.
      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntInstr = &II;
    } else {
      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

      unsigned OldVSCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
      Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);

      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntVsCntInstr = &II;
    }
  }

  // Rewrite the surviving S_WAITCNT with the merged counts, apply those
  // counts to the score brackets, and clear them from Wait so the caller
  // does not emit them again.
  if (WaitcntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
                                         AMDGPU::encodeWaitcnt(IV, Wait));
    Modified |= promoteSoftWaitCnt(WaitcntInstr);

    ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
    ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
    ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
    Wait.LoadCnt = ~0u;
    Wait.ExpCnt = ~0u;
    Wait.DsCnt = ~0u;

    LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
                   ? dbgs()
                         << "applyPreexistingWaitcnt\n"
                         << "New Instr at block end: " << *WaitcntInstr << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntInstr << '\n');
  }

  // Likewise for the surviving S_WAITCNT_VSCNT and the store counter.
  if (WaitcntVsCntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
    Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);

    ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
    Wait.StoreCnt = ~0u;

    LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
                   ? dbgs() << "applyPreexistingWaitcnt\n"
                            << "New Instr at block end: " << *WaitcntVsCntInstr
                            << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
  }

  return Modified;
}
1276fe6060f1SDimitry Andric
/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
/// required counters in \p Wait, inserting them before \p It.
/// \returns true if any instruction was emitted.
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) {
  assert(ST);
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
  // single instruction while VScnt has its own instruction.
  if (Wait.hasWaitExceptStoreCnt()) {
    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  if (Wait.hasWaitStoreCnt()) {
    // A store-counter wait should only be requested on targets with VScnt.
    assert(ST->hasVscnt());

    // S_WAITCNT_VSCNT takes a null SGPR destination plus the count.
    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
            .addImm(Wait.StoreCnt);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
13177a6dacacSDimitry Andric
13180fca6ea1SDimitry Andric AMDGPU::Waitcnt
getAllZeroWaitcnt(bool IncludeVSCnt) const13190fca6ea1SDimitry Andric WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
13200fca6ea1SDimitry Andric return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
13210fca6ea1SDimitry Andric }
13220fca6ea1SDimitry Andric
13230fca6ea1SDimitry Andric AMDGPU::Waitcnt
getAllZeroWaitcnt(bool IncludeVSCnt) const13240fca6ea1SDimitry Andric WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
13250fca6ea1SDimitry Andric return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
13260fca6ea1SDimitry Andric }
13270fca6ea1SDimitry Andric
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
/// were added by previous passes. Currently this pass conservatively
/// assumes that these preexisting waits are required for correctness.
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(ST);
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  // Surviving wait instructions: at most one of each combined form and one
  // single-counter instruction per extended counter type.
  MachineInstr *CombinedLoadDsCntInstr = nullptr;
  MachineInstr *CombinedStoreDsCntInstr = nullptr;
  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};

  // Scan [OldWaitcntInstr, It); early_inc allows erasing during iteration.
  for (auto &II :
       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
    if (II.isMetaInstruction())
      continue;

    MachineInstr **UpdatableInstr;

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
    // attempt to do more than that either.
    if (Opcode == AMDGPU::S_WAITCNT)
      continue;

    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);
      UpdatableInstr = &CombinedLoadDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);
      UpdatableInstr = &CombinedStoreDsCntInstr;
    } else {
      // A gfx12+ single-counter wait.
      std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
      assert(CT.has_value());
      unsigned OldCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
      addWait(Wait, CT.value(), OldCnt);
      UpdatableInstr = &WaitInstrs[CT.value()];
    }

    // Merge consecutive waitcnt of the same type by erasing multiples.
    if (!*UpdatableInstr) {
      *UpdatableInstr = &II;
    } else {
      II.eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedLoadDsCntInstr) {
    // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
    // to be waited for. Otherwise, let the instruction be deleted so
    // the appropriate single counter wait instruction can be inserted
    // instead, when new S_WAIT_*CNT instructions are inserted by
    // createNewWaitcnt(). As a side effect, resetting the wait counts will
    // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
    // the loop below that deals with single counter instructions.
    if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
      ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedLoadDsCntInstr << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedLoadDsCntInstr << '\n');
    } else {
      CombinedLoadDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedStoreDsCntInstr) {
    // Similarly for S_WAIT_STORECNT_DSCNT.
    if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
      ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedStoreDsCntInstr << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedStoreDsCntInstr << '\n');
    } else {
      CombinedStoreDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  // Look for an opportunity to convert existing S_WAIT_LOADCNT,
  // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
  // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
  // instructions so that createNewWaitcnt() will create new combined
  // instructions to replace them.

  if (Wait.DsCnt != ~0u) {
    // This is a vector of addresses in WaitInstrs pointing to instructions
    // that should be removed if they are present.
    SmallVector<MachineInstr **, 2> WaitsToErase;

    // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
    // both) need to be waited for, ensure that there are no existing
    // individual wait count instructions for these.

    if (Wait.LoadCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
    } else if (Wait.StoreCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
    }

    for (MachineInstr **WI : WaitsToErase) {
      if (!*WI)
        continue;

      (*WI)->eraseFromParent();
      *WI = nullptr;
      Modified = true;
    }
  }

  // Update or erase whatever single-counter wait instructions remain.
  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    if (!WaitInstrs[CT])
      continue;

    unsigned NewCnt = getWait(Wait, CT);
    if (NewCnt != ~0u) {
      Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
                                           AMDGPU::OpName::simm16, NewCnt);
      Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);

      ScoreBrackets.applyWaitcnt(CT, NewCnt);
      setNoWait(Wait, CT);

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: " << *WaitInstrs[CT]
                              << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitInstrs[CT] << '\n');
    } else {
      WaitInstrs[CT]->eraseFromParent();
      Modified = true;
    }
  }

  return Modified;
}
15147a6dacacSDimitry Andric
/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait,
/// inserting them before \p It. \returns true if any instruction was
/// emitted.
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) {
  assert(ST);
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  // Check for opportunities to use combined wait instructions. DScnt can be
  // folded together with either LOADcnt or STOREcnt; the LOADcnt pairing is
  // tried first.
  if (Wait.DsCnt != ~0u) {
    MachineInstr *SWaitInst = nullptr;

    if (Wait.LoadCnt != ~0u) {
      unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);

      SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
                      .addImm(Enc);

      // Both counters are now covered by the combined instruction.
      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;
    } else if (Wait.StoreCnt != ~0u) {
      unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);

      SWaitInst =
          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
              .addImm(Enc);

      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;
    }

    if (SWaitInst) {
      Modified = true;

      LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  // Generate an instruction for any remaining counter that needs
  // waiting for.

  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    unsigned Count = getWait(Wait, CT);
    if (Count == ~0u)
      continue;

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(Count);

    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
15787a6dacacSDimitry Andric
readsVCCZ(const MachineInstr & MI)15790b57cec5SDimitry Andric static bool readsVCCZ(const MachineInstr &MI) {
15800b57cec5SDimitry Andric unsigned Opc = MI.getOpcode();
15810b57cec5SDimitry Andric return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
15820b57cec5SDimitry Andric !MI.getOperand(1).isUndef();
15830b57cec5SDimitry Andric }
15840b57cec5SDimitry Andric
15850b57cec5SDimitry Andric /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
callWaitsOnFunctionEntry(const MachineInstr & MI)15860b57cec5SDimitry Andric static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
15870b57cec5SDimitry Andric // Currently all conventions wait, but this may not always be the case.
15880b57cec5SDimitry Andric //
15890b57cec5SDimitry Andric // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
15900b57cec5SDimitry Andric // senses to omit the wait and do it in the caller.
15910b57cec5SDimitry Andric return true;
15920b57cec5SDimitry Andric }
15930b57cec5SDimitry Andric
15940b57cec5SDimitry Andric /// \returns true if the callee is expected to wait for any outstanding waits
15950b57cec5SDimitry Andric /// before returning.
callWaitsOnFunctionReturn(const MachineInstr & MI)15960b57cec5SDimitry Andric static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
15970b57cec5SDimitry Andric return true;
15980b57cec5SDimitry Andric }
15990b57cec5SDimitry Andric
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
/// flush the vmcnt counter here.
///
/// \param MI              instruction before which waits may be required.
/// \param ScoreBrackets   pending-event/score state for each hardware counter.
/// \param OldWaitcntInstr preexisting wait instruction(s) that may be merged.
/// \param FlushVmCnt      additionally force pending VMEM counters to zero.
/// \returns true if any instruction was inserted, updated or erased.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                                 WaitcntBrackets &ScoreBrackets,
                                                 MachineInstr *OldWaitcntInstr,
                                                 bool FlushVmCnt) {
  // Refresh the ForceEmitWaitcnt flags that are consulted near the end of
  // this function.
  setForceEmitWaitcnt();

  // Meta instructions never need waits.
  if (MI.isMetaInstruction())
    return false;

  // Accumulated wait requirement; a counter value of ~0u means "no wait".
  AMDGPU::Waitcnt Wait;

  // FIXME: This should have already been handled by the memory legalizer.
  // Removing this currently doesn't affect any lit tests, but we need to
  // verify that nothing was relying on this. The number of buffer invalidates
  // being handled here should not be expanded.
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
    Wait.LoadCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //   with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::SI_RETURN ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
      (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
    Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
  }
  // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
  // stores. In this case it can be useful to send a message to explicitly
  // release all VGPRs before the stores have completed, but it is only safe to
  // do this if:
  // * there are no outstanding scratch stores
  // * we are not in Dynamic VGPR mode
  else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
           MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
    // Only on GFX11+, and skipped at -O0 (WCG->isOptNone()); requires some
    // store actually pending on STORE_CNT and no pending scratch write.
    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
        ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
        !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
      ReleaseVGPRInsts.insert(&MI);
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ST->hasLegacyGeometry() &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
            AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
    Wait.LoadCnt = 0;
  }

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

    if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
      // The function is going to insert a wait on everything in its prolog.
      // This still needs to be careful if the call target is a load (e.g. a GOT
      // load). We also need to check WAW dependency with saved PC.
      Wait = AMDGPU::Waitcnt();

      // Wait on any outstanding SMEM results feeding the call target address.
      int CallAddrOpIdx =
          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

      if (MI.getOperand(CallAddrOpIdx).isReg()) {
        RegInterval CallAddrOpInterval =
            ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);

        for (int RegNo = CallAddrOpInterval.first;
             RegNo < CallAddrOpInterval.second; ++RegNo)
          ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);

        // Likewise for the registers that will hold the saved return address
        // (WAW with an in-flight SMEM load into them).
        int RtnAddrOpIdx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
        if (RtnAddrOpIdx != -1) {
          RegInterval RtnAddrOpInterval =
              ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);

          for (int RegNo = RtnAddrOpInterval.first;
               RegNo < RtnAddrOpInterval.second; ++RegNo)
            ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
        }
      }
    } else {
      // FIXME: Should not be relying on memoperands.
      // Look at the source operands of every instruction to see if
      // any of them results from a previous memory operation that affects
      // its current usage. If so, an s_waitcnt instruction needs to be
      // emitted.
      // If the source operand was defined by a load, add the s_waitcnt
      // instruction.
      //
      // Two cases are handled for destination operands:
      // 1) If the destination operand was defined by a load, add the s_waitcnt
      // instruction to guarantee the right WAW order.
      // 2) If a destination operand that was used by a recent export/store ins,
      // add s_waitcnt on exp_cnt to guarantee the WAR order.

      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        // A store to an address that an earlier SMEM load read from must wait
        // for that load; the tracking entry can be dropped once this point
        // post-dominates the load.
        if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
          addWait(Wait, SmemAccessCounter, 0);
          if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
            SLoadAddresses.erase(Ptr);
        }
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
          continue;
        // No need to wait before load from VMEM to LDS.
        if (TII->mayWriteLDSThroughDMA(MI))
          continue;

        // LOAD_CNT is only relevant to vgpr or LDS.
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        bool FoundAliasingStore = false;
        // Only objects with alias scope info were added to LDSDMAScopes array.
        // In the absence of the scope info we will not be able to disambiguate
        // aliasing here. There is no need to try searching for a corresponding
        // store slot. This is conservatively correct because in that case we
        // will produce a wait using the first (general) LDS DMA wait slot which
        // will wait on all of them anyway.
        if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
          const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
          for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
            if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
              FoundAliasingStore = true;
              // Slot 0 is the general LDS DMA slot; store I uses slot I + 1.
              ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
            }
          }
        }
        if (!FoundAliasingStore)
          ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
        if (Memop->isStore()) {
          ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
        }
      }

      // Loop over use and def operands.
      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg())
          continue;

        // If the instruction does not read tied source, skip the operand.
        if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
          continue;

        RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);

        const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
          if (IsVGPR) {
            // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
            // previous write and this write are the same type of VMEM
            // instruction, in which case they are (in some architectures)
            // guaranteed to write their results in order anyway.
            if (Op.isUse() || !updateVMCntOnly(MI) ||
                ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
                                                       getVmemType(MI)) ||
                !ST->hasVmemWriteVgprInOrder()) {
              ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
              ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
              ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
              // Once waited on, the VMEM-type markers for this vgpr are stale.
              ScoreBrackets.clearVgprVmemTypes(RegNo);
            }
            if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
              ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
            }
            ScoreBrackets.determineWait(DS_CNT, RegNo, Wait);
          } else {
            // Non-vector registers are only written by SMEM-class events.
            ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
          }
        }
      }
    }
  }

  // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
  // not, we need to ensure the subtarget is capable of backing off barrier
  // instructions in case there are any outstanding memory operations that may
  // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
  if (TII->isBarrierStart(MI.getOpcode()) &&
      !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
    Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
    // On affected subtargets, drain pending SMEM (tracked on DsCnt) before a
    // branch that reads vccz.
    if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
      Wait.DsCnt = 0;
    }
  }

  // Verify that the wait is actually needed.
  ScoreBrackets.simplifyWaitcnt(Wait);

  // Debug/testing overrides: force specific counters to zero if requested.
  if (ForceEmitZeroWaitcnts)
    Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);

  if (ForceEmitWaitcnt[LOAD_CNT])
    Wait.LoadCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[DS_CNT])
    Wait.DsCnt = 0;
  if (ForceEmitWaitcnt[SAMPLE_CNT])
    Wait.SampleCnt = 0;
  if (ForceEmitWaitcnt[BVH_CNT])
    Wait.BvhCnt = 0;
  if (ForceEmitWaitcnt[KM_CNT])
    Wait.KmCnt = 0;

  // Caller asked to flush all pending VMEM counters at this point.
  if (FlushVmCnt) {
    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
      Wait.LoadCnt = 0;
    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
      Wait.SampleCnt = 0;
    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
      Wait.BvhCnt = 0;
  }

  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
                         OldWaitcntInstr);
}
185381ad6265SDimitry Andric
/// Materialize the wait requirement \p Wait immediately before \p It in
/// \p Block: first fold it into any preexisting wait instruction(s), then
/// into a VINTERP waitexp operand if possible, and finally emit new wait
/// instruction(s) for whatever remains.
/// \returns true if any instruction was created, updated or erased.
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock::instr_iterator It,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  bool Modified = false;

  if (OldWaitcntInstr)
    // Try to merge the required wait with preexisting waitcnt instructions.
    // Also erase redundant waitcnt.
    Modified =
        WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);

  // Any counts that could have been applied to any existing waitcnt
  // instructions will have been done so, now deal with any remaining.
  ScoreBrackets.applyWaitcnt(Wait);

  // ExpCnt can be merged into VINTERP.
  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
      SIInstrInfo::isVINTERP(*It)) {
    MachineOperand *WaitExp =
        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
    // Only tighten the operand; never relax an already-stricter waitexp.
    if (Wait.ExpCnt < WaitExp->getImm()) {
      WaitExp->setImm(Wait.ExpCnt);
      Modified = true;
    }
    // The EXP requirement is now carried by the VINTERP instruction itself.
    Wait.ExpCnt = ~0u;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                      << "Update Instr: " << *It);
  }

  // Emit explicit wait instruction(s) for any counters still outstanding.
  if (WCG->createNewWaitcnt(Block, It, Wait))
    Modified = true;

  return Modified;
}
18910b57cec5SDimitry Andric
1892e8d8bef9SDimitry Andric // This is a flat memory operation. Check to see if it has memory tokens other
1893e8d8bef9SDimitry Andric // than LDS. Other address spaces supported by flat memory operations involve
1894e8d8bef9SDimitry Andric // global memory.
mayAccessVMEMThroughFlat(const MachineInstr & MI) const1895e8d8bef9SDimitry Andric bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1896e8d8bef9SDimitry Andric assert(TII->isFLAT(MI));
1897e8d8bef9SDimitry Andric
1898e8d8bef9SDimitry Andric // All flat instructions use the VMEM counter.
1899e8d8bef9SDimitry Andric assert(TII->usesVM_CNT(MI));
1900e8d8bef9SDimitry Andric
1901e8d8bef9SDimitry Andric // If there are no memory operands then conservatively assume the flat
1902e8d8bef9SDimitry Andric // operation may access VMEM.
19030b57cec5SDimitry Andric if (MI.memoperands_empty())
19040b57cec5SDimitry Andric return true;
19050b57cec5SDimitry Andric
1906e8d8bef9SDimitry Andric // See if any memory operand specifies an address space that involves VMEM.
1907e8d8bef9SDimitry Andric // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
1908e8d8bef9SDimitry Andric // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1909e8d8bef9SDimitry Andric // (GDS) address space is not supported by flat operations. Therefore, simply
1910e8d8bef9SDimitry Andric // return true unless only the LDS address space is found.
1911e8d8bef9SDimitry Andric for (const MachineMemOperand *Memop : MI.memoperands()) {
1912e8d8bef9SDimitry Andric unsigned AS = Memop->getAddrSpace();
1913e8d8bef9SDimitry Andric assert(AS != AMDGPUAS::REGION_ADDRESS);
1914e8d8bef9SDimitry Andric if (AS != AMDGPUAS::LOCAL_ADDRESS)
1915e8d8bef9SDimitry Andric return true;
1916e8d8bef9SDimitry Andric }
1917e8d8bef9SDimitry Andric
1918e8d8bef9SDimitry Andric return false;
1919e8d8bef9SDimitry Andric }
1920e8d8bef9SDimitry Andric
1921e8d8bef9SDimitry Andric // This is a flat memory operation. Check to see if it has memory tokens for
1922e8d8bef9SDimitry Andric // either LDS or FLAT.
mayAccessLDSThroughFlat(const MachineInstr & MI) const1923e8d8bef9SDimitry Andric bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1924e8d8bef9SDimitry Andric assert(TII->isFLAT(MI));
1925e8d8bef9SDimitry Andric
1926e8d8bef9SDimitry Andric // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
1927e8d8bef9SDimitry Andric if (!TII->usesLGKM_CNT(MI))
1928e8d8bef9SDimitry Andric return false;
1929e8d8bef9SDimitry Andric
1930fe6060f1SDimitry Andric // If in tgsplit mode then there can be no use of LDS.
1931fe6060f1SDimitry Andric if (ST->isTgSplitEnabled())
1932fe6060f1SDimitry Andric return false;
1933fe6060f1SDimitry Andric
1934e8d8bef9SDimitry Andric // If there are no memory operands then conservatively assume the flat
1935e8d8bef9SDimitry Andric // operation may access LDS.
1936e8d8bef9SDimitry Andric if (MI.memoperands_empty())
1937e8d8bef9SDimitry Andric return true;
1938e8d8bef9SDimitry Andric
1939e8d8bef9SDimitry Andric // See if any memory operand specifies an address space that involves LDS.
19400b57cec5SDimitry Andric for (const MachineMemOperand *Memop : MI.memoperands()) {
19410b57cec5SDimitry Andric unsigned AS = Memop->getAddrSpace();
19420b57cec5SDimitry Andric if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
19430b57cec5SDimitry Andric return true;
19440b57cec5SDimitry Andric }
19450b57cec5SDimitry Andric
19460b57cec5SDimitry Andric return false;
19470b57cec5SDimitry Andric }
19480b57cec5SDimitry Andric
194906c3fb27SDimitry Andric // This is a flat memory operation. Check to see if it has memory tokens for
195006c3fb27SDimitry Andric // either scratch or FLAT.
mayAccessScratchThroughFlat(const MachineInstr & MI) const195106c3fb27SDimitry Andric bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
195206c3fb27SDimitry Andric const MachineInstr &MI) const {
195306c3fb27SDimitry Andric assert(TII->isFLAT(MI));
195406c3fb27SDimitry Andric
195506c3fb27SDimitry Andric // SCRATCH instructions always access scratch.
195606c3fb27SDimitry Andric if (TII->isFLATScratch(MI))
195706c3fb27SDimitry Andric return true;
195806c3fb27SDimitry Andric
195906c3fb27SDimitry Andric // GLOBAL instructions never access scratch.
196006c3fb27SDimitry Andric if (TII->isFLATGlobal(MI))
196106c3fb27SDimitry Andric return false;
196206c3fb27SDimitry Andric
196306c3fb27SDimitry Andric // If there are no memory operands then conservatively assume the flat
196406c3fb27SDimitry Andric // operation may access scratch.
196506c3fb27SDimitry Andric if (MI.memoperands_empty())
196606c3fb27SDimitry Andric return true;
196706c3fb27SDimitry Andric
196806c3fb27SDimitry Andric // See if any memory operand specifies an address space that involves scratch.
196906c3fb27SDimitry Andric return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
197006c3fb27SDimitry Andric unsigned AS = Memop->getAddrSpace();
197106c3fb27SDimitry Andric return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
197206c3fb27SDimitry Andric });
197306c3fb27SDimitry Andric }
197406c3fb27SDimitry Andric
isCacheInvOrWBInst(MachineInstr & Inst)19751db9f3b2SDimitry Andric static bool isCacheInvOrWBInst(MachineInstr &Inst) {
19761db9f3b2SDimitry Andric auto Opc = Inst.getOpcode();
19771db9f3b2SDimitry Andric return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
19781db9f3b2SDimitry Andric Opc == AMDGPU::GLOBAL_WBINV;
19791db9f3b2SDimitry Andric }
19801db9f3b2SDimitry Andric
/// Record the counter events produced by \p Inst into \p ScoreBrackets after
/// it has been processed: classify the instruction (DS/GDS, FLAT, VMEM, SMEM,
/// call, export, message, ...) and bump the matching counter bracket(s).
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.

  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    // GDS operations additionally lock GPRs until the operation is granted.
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    // TODO: Track this properly.
    if (isCacheInvOrWBInst(Inst))
      return;

    assert(Inst.mayLoadOrStore());

    // Count how many address spaces (VMEM-side, LDS-side) this flat operation
    // may touch.
    int FlatASCount = 0;

    if (mayAccessVMEMThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                   Inst);
    }

    if (mayAccessLDSThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }

    // A Flat memory operation must access at least one address space.
    assert(FlatASCount);

    // This is a flat memory operation that accesses both VMEM and LDS, so note
    // it - it will require that both the VM and LGKM be flushed to zero if it
    // is pending when a VM or LGKM dependency occurs.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (SIInstrInfo::isVMEM(Inst) &&
             !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                 Inst);

    // On subtargets where VMEM writes also hold GPRs via the export counter,
    // record the lock for stores and returning atomics.
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    if (callWaitsOnFunctionReturn(Inst)) {
      // Act as a wait on everything
      ScoreBrackets->applyWaitcnt(
          WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
      ScoreBrackets->setStateOnFunctionEntryOrReturn();
    } else {
      // May need to wait for anything.
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
    }
  } else if (SIInstrInfo::isLDSDIR(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
  } else if (TII->isVINTERP(Inst)) {
    // VINTERP carries its own waitexp operand; apply it as an EXP_CNT wait.
    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
  } else if (SIInstrInfo::isEXP(Inst)) {
    // Classify the export by its target: parameter, position, or other
    // (which locks GPRs).
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
    case AMDGPU::S_BARRIER_LEAVE:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      // These scalar opcodes complete through the SMEM counter.
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    }
  }
}
20780b57cec5SDimitry Andric
mergeScore(const MergeInfo & M,unsigned & Score,unsigned OtherScore)20795ffd83dbSDimitry Andric bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
20805ffd83dbSDimitry Andric unsigned OtherScore) {
20815ffd83dbSDimitry Andric unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
20825ffd83dbSDimitry Andric unsigned OtherShifted =
20830b57cec5SDimitry Andric OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
20840b57cec5SDimitry Andric Score = std::max(MyShifted, OtherShifted);
20850b57cec5SDimitry Andric return OtherShifted > MyShifted;
20860b57cec5SDimitry Andric }
20870b57cec5SDimitry Andric
/// Merge the pending events and associated score brackets of \p Other into
/// this brackets status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  // Track registers up to the larger of the two upper bounds.
  VgprUB = std::max(VgprUB, Other.VgprUB);
  SgprUB = std::max(SgprUB, Other.SgprUB);

  for (auto T : inst_counter_types(MaxCounter)) {
    // Merge event flags for this counter.
    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
    // An event pending in Other but not here makes the merged state strictly
    // stronger than our original state.
    if (OtherEvents & ~OldEvents)
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter: rebase both sides so their pending
    // ranges end at a common new upper bound NewUB (keeping our lower bound).
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error("waitcnt score overflow");

    MergeInfo M;
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

    // Merge the per-VGPR scores for this counter.
    for (int J = 0; J <= VgprUB; J++)
      StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);

    // SGPR scores are only tracked for the counter that covers SMEM accesses.
    if (T == SmemAccessCounter) {
      for (int J = 0; J <= SgprUB; J++)
        StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
    }
  }

  // Union the per-VGPR VMEM access-type masks; any newly seen type also
  // strictly strengthens the merged state.
  for (int J = 0; J <= VgprUB; J++) {
    unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
    StrictDom |= NewVmemTypes != VgprVmemTypes[J];
    VgprVmemTypes[J] = NewVmemTypes;
  }

  return StrictDom;
}
21410b57cec5SDimitry Andric
isWaitInstr(MachineInstr & Inst)2142bdd1243dSDimitry Andric static bool isWaitInstr(MachineInstr &Inst) {
21437a6dacacSDimitry Andric unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
21447a6dacacSDimitry Andric return Opcode == AMDGPU::S_WAITCNT ||
21457a6dacacSDimitry Andric (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
21467a6dacacSDimitry Andric Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
21477a6dacacSDimitry Andric Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
21487a6dacacSDimitry Andric Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
21497a6dacacSDimitry Andric counterTypeForInstr(Opcode).has_value();
2150bdd1243dSDimitry Andric }
2151bdd1243dSDimitry Andric
// Generate s_waitcnt instructions where needed.
//
// Walks \p Block once, inserting a wait before any instruction that needs its
// in-flight memory operations completed, updating \p ScoreBrackets as each
// instruction issues new events, and working around hardware vccz bugs.
// Returns true if any instruction was added or changed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets.dump();
  });

  // Track the correctness of vccz through this basic block. There are two
  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
  // ST->partialVCCWritesUpdateVCCZ().
  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc and then issued an smem load.
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc_lo or vcc_hi.
    VCCZCorrect = false;
  }

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Track pre-existing waitcnts that were added in earlier iterations or by
    // the memory legalizer. Only the first of a run is remembered; the
    // generator below folds/erases the whole run.
    if (isWaitInstr(Inst)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    // If this is the first terminator of a to-be-flushed loop preheader, the
    // wait we generate here must also flush the vm counters.
    bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
                      isPreheaderToFlush(Block, ScoreBrackets);

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushVmCnt);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it's not known to be correct already.
    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);

    // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
          Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
      } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
        // vccz bit, so when we detect that an instruction may read from a
        // corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
        //    operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          // Writes to vcc while there's an outstanding smem read may get
          // clobbered as soon as any read completes.
          VCCZCorrect = false;
        } else {
          // Writes to vcc will fix any incorrect value in vccz.
          VCCZCorrect = true;
        }
      }
    }

    if (TII->isSMRD(Inst)) {
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
        }
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    // Record the events Inst itself puts in flight.
    updateEventWaitcntAfter(Inst, &ScoreBrackets);

    // Precise-memory mode: force each memory access to complete before the
    // next instruction by emitting a wait immediately after it.
    if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
      AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
          Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
      ScoreBrackets.simplifyWaitcnt(Wait);
      Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
                                  ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
    }

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (RestoreVCCZ) {
      // Restore the vccz bit.  Any time a value is written to vcc, the vcc
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }

    ++Iter;
  }

  // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block
  // if needed (preheader flush for a block with no terminator; the
  // has-terminator case was handled inside the loop above).
  AMDGPU::Waitcnt Wait;
  if (Block.getFirstTerminator() == Block.end() &&
      isPreheaderToFlush(Block, ScoreBrackets)) {
    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
      Wait.LoadCnt = 0;
    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
      Wait.SampleCnt = 0;
    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
      Wait.BvhCnt = 0;
  }

  // Combine or remove any redundant waitcnts at the end of the block.
  Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                              OldWaitcntInstr);

  return Modified;
}
22980b57cec5SDimitry Andric
229981ad6265SDimitry Andric // Return true if the given machine basic block is a preheader of a loop in
230081ad6265SDimitry Andric // which we want to flush the vmcnt counter, and false otherwise.
isPreheaderToFlush(MachineBasicBlock & MBB,WaitcntBrackets & ScoreBrackets)230181ad6265SDimitry Andric bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
230281ad6265SDimitry Andric WaitcntBrackets &ScoreBrackets) {
23035f757f3fSDimitry Andric auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
23045f757f3fSDimitry Andric if (!IsInserted)
23055f757f3fSDimitry Andric return Iterator->second;
230681ad6265SDimitry Andric
230781ad6265SDimitry Andric MachineBasicBlock *Succ = MBB.getSingleSuccessor();
230881ad6265SDimitry Andric if (!Succ)
23095f757f3fSDimitry Andric return false;
231081ad6265SDimitry Andric
231181ad6265SDimitry Andric MachineLoop *Loop = MLI->getLoopFor(Succ);
231281ad6265SDimitry Andric if (!Loop)
23135f757f3fSDimitry Andric return false;
231481ad6265SDimitry Andric
23155f757f3fSDimitry Andric if (Loop->getLoopPreheader() == &MBB &&
23165f757f3fSDimitry Andric shouldFlushVmCnt(Loop, ScoreBrackets)) {
23175f757f3fSDimitry Andric Iterator->second = true;
23185f757f3fSDimitry Andric return true;
23195f757f3fSDimitry Andric }
232081ad6265SDimitry Andric
23215f757f3fSDimitry Andric return false;
232281ad6265SDimitry Andric }
232381ad6265SDimitry Andric
isVMEMOrFlatVMEM(const MachineInstr & MI) const232406c3fb27SDimitry Andric bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
232506c3fb27SDimitry Andric return SIInstrInfo::isVMEM(MI) ||
232606c3fb27SDimitry Andric (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
232706c3fb27SDimitry Andric }
232806c3fb27SDimitry Andric
// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
//    vgpr containing a value that is loaded outside of the loop. (Only on
//    targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
//    loop, and at least one use of a vgpr containing a value that is loaded
//    outside of the loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        WaitcntBrackets &Brackets) {
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprLoadedOutside = false;
  // VGPRs used (read) inside the loop, and VGPRs defined by VMEM loads inside
  // the loop, collected so far. Overlap between the two disqualifies the loop.
  DenseSet<Register> VgprUse;
  DenseSet<Register> VgprDef;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        if (MI.mayLoad())
          HasVMemLoad = true;
        if (MI.mayStore())
          HasVMemStore = true;
      }
      // Scan all vector-register operands of MI.
      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
          continue;
        RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
        // Vgpr use
        if (Op.isUse()) {
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprDef.contains(RegNo))
              return false;
            VgprUse.insert(RegNo);
            // If at least one of Op's registers is in the score brackets, the
            // value is likely loaded outside of the loop.
            if (Brackets.getRegScore(RegNo, LOAD_CNT) >
                    Brackets.getScoreLB(LOAD_CNT) ||
                Brackets.getRegScore(RegNo, SAMPLE_CNT) >
                    Brackets.getScoreLB(SAMPLE_CNT) ||
                Brackets.getRegScore(RegNo, BVH_CNT) >
                    Brackets.getScoreLB(BVH_CNT)) {
              UsesVgprLoadedOutside = true;
              break;
            }
          }
        }
        // VMem load vgpr def
        else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprUse.contains(RegNo))
              return false;
            VgprDef.insert(RegNo);
          }
      }
    }
  }
  // Situation 1: stores only, and no separate vscnt counter on this target.
  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  // Situation 2: only applies when VMEM writes its VGPR results in order.
  return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
}
239581ad6265SDimitry Andric
// Pass entry point: set up target/analysis state, then iterate over the
// function's blocks in reverse post order to a fixed point, inserting waits in
// each block. Afterwards, handle scalar-store cache flushes and VGPR
// deallocation messages.
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  // Alias analysis is optional; use it only when available.
  if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
    AA = &AAR->getAAResults();

  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());

  // Select the wait-count generator matching the target's counter scheme.
  if (ST->hasExtendedWaitCounts()) {
    MaxCounter = NUM_EXTENDED_INST_CNTS;
    WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
    WCG = &WCGGFX12Plus;
  } else {
    MaxCounter = NUM_NORMAL_INST_CNTS;
    WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
    WCG = &WCGPreGFX12;
  }

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();

  SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);

  // Per-counter maximum values, taken from the ISA's encoding bit masks.
  HardwareLimits Limits = {};
  if (ST->hasExtendedWaitCounts()) {
    Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
    Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
  } else {
    Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
    Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
  }
  Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
  Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
  Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
  Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);

  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  // Hardware-encoding index ranges of the VGPR and SGPR files.
  RegisterEncoding Encoding = {};
  Encoding.VGPR0 =
      TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
  Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
  Encoding.SGPR0 =
      TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
  Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;

  BlockInfos.clear();
  bool Modified = false;

  MachineBasicBlock &EntryBB = MF.front();
  MachineBasicBlock::iterator I = EntryBB.begin();

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.

    // Skip over PHIs and meta instructions so the waits land on the first
    // real instruction.
    for (MachineBasicBlock::iterator E = EntryBB.end();
         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
      ;

    if (ST->hasExtendedWaitCounts()) {
      // Extended counters: one combined LOADCNT/DSCNT wait, then one wait per
      // remaining counter type.
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
          .addImm(0);
      for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
          continue;

        BuildMI(EntryBB, I, DebugLoc(),
                TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(0);
      }
    } else {
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
    }

    // Seed the entry block with the conservative function-entry state.
    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
        ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
        SmemAccessCounter);
    NonKernelInitialState->setStateOnFunctionEntryOrReturn();
    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

    Modified = true;
  }

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fix point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.insert({MBB, BlockInfo()});

  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      MachineBasicBlock *MBB = BII->first;
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      // (Re)initialize the working brackets from the block's incoming state,
      // reusing the existing allocation when possible.
      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(
              ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
              SmemAccessCounter);
        else
          *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
                                      WaitEventMaskForInst, SmemAccessCounter);
      }

      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPendingEvent()) {
        // Propagate the outgoing state to each successor. A successor at or
        // before this block in the RPO map means a back edge, which forces
        // another fixed-point iteration.
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
            if (!MoveBracketsToSucc) {
              // Defer the move so Brackets stays usable for the remaining
              // successors; copy for any additional ones.
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else if (SuccBI.Incoming->merge(*Brackets)) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  if (ST->hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    // Find all wave-termination blocks and note whether any scalar store
    // exists anywhere in the function.
    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(&MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed or else the next
      // wave to reuse the same scratch memory can be clobbered.
      //
      // Insert s_dcache_wb at wave termination points if there were any scalar
      // stores, and only if the cache hasn't already been flushed. This could
      // be improved by looking across blocks for flushes in postdominating
      // blocks from the stores but an explicitly requested flush is probably
      // very rare.
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(*I))
            SeenDCacheWB = false;

          // FIXME: It would be better to insert this before a waitcnt if any.
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }

  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
  // instructions.
  for (MachineInstr *MI : ReleaseVGPRInsts) {
    if (ST->requiresNopBeforeDeallocVGPRs()) {
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP))
          .addImm(0);
    }
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII->get(AMDGPU::S_SENDMSG))
        .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
    Modified = true;
  }
  ReleaseVGPRInsts.clear();

  return Modified;
}
2618