1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
17 /// finely-grained approach that keeps one timeline per event type could
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
24 //===----------------------------------------------------------------------===//
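The trade-off described in the header comment above can be made concrete with a purely illustrative sequence (it is not taken from this file): LDS results return in order, so waiting for the i-th-last LDS access only needs lgkmcnt(i), yet the single per-counter timeline forces lgkmcnt(0) once SMEM and LDS events are mixed.

  // Illustrative only -- a hypothetical sequence, not code from this pass:
  //   ds_read_b32   v0, v10        ; oldest lgkm event
  //   s_load_dword  s0, s[2:3], 0  ; SMEM also counts against lgkmcnt
  //   ds_read_b32   v1, v11        ; youngest lgkm event
  //   <use of v0>
  // lgkmcnt(1) already guarantees v0 is ready: if the first ds_read were still
  // outstanding, the younger ds_read could not have completed either, so at
  // least two lgkm events would remain. Because the pass keeps one timeline
  // per counter and cannot separate the SMEM completion from the LDS ones, it
  // conservatively emits s_waitcnt lgkmcnt(0) instead.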
42 #define DEBUG_TYPE "si-insert-waitcnts"
44 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
46 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
48 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
52 "amdgpu-waitcnt-forcezero",
109 VMEM_ACCESS, // vector-memory read & write
110 VMEM_READ_ACCESS, // vector-memory read
111 VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
112 VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
113 VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
114 SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
118 SMEM_ACCESS, // scalar-memory read & write
123 VMW_GPR_LOCK, // vector-memory write holding on its data src
129 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
130 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
131 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
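A small worked illustration of this flat index space, consistent with the mapping visible in getRegInterval() further down (register numbers are invented for the example):

  // Illustrative mapping (register numbers invented):
  //   VGPR12          -> slot 12                    (real VGPR range)
  //   LDS-DMA slot #2 -> slot SQ_MAX_PGM_VGPRS + 2  (extra VGPR-like slots)
  //   SGPR5           -> slot NUM_ALL_VGPRS + 5     (SGPRs follow all VGPR-like slots)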
148 // Enumerate different types of result-returning VMEM operations. Although
151 // their results in order -- so there is no need to insert an s_waitcnt between
189 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); in getVmemType()
193 return BaseInfo->BVH ? VMEM_BVH in getVmemType()
194 : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER in getVmemType()
242 // a per-register scoreboard for each wait counter.
292 return getScoreUB(T) - getScoreLB(T); in getScoreRange()
300 return SgprScores[GprNo - NUM_ALL_VGPRS]; in getRegScore()
332 return Events & (Events - 1); in hasMixedPendingEvents()
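The expression above is the usual clear-lowest-set-bit trick: it is non-zero exactly when more than one event bit is set, which is what "mixed pending events" means here.

  // Standalone illustration, not from the pass:
  //   Events = 0b00100 -> Events & (Events - 1) == 0b00000  (single pending event type)
  //   Events = 0b00110 -> Events & (Events - 1) == 0b00100  (mixed pending event types)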
394 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); in setScoreUB()
403 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS); in setRegScore()
404 SgprScores[GprNo - NUM_ALL_VGPRS] = Val; in setRegScore()
425 int VgprUB = -1;
426 int SgprUB = -1;
429 // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
436 SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
456 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()), in WaitcntGenerator()
457 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter), in WaitcntGenerator()
604 // because of amdgpu-waitcnt-forcezero flag
609 // generator objects, which must have been re-initialised before use
658 // For non-debug builds, ForceEmitWaitcnt has been initialized to false; in setForceEmitWaitcnt()
700 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst)) in getVmemWaitEventType()
710 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst)) in getVmemWaitEventType()
738 const MachineOperand &Op = MI->getOperand(OpNo); in getRegInterval()
739 if (!TRI->isInAllocatableClass(Op.getReg())) in getRegInterval()
740 return {-1, -1}; in getRegInterval()
748 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) & in getRegInterval()
751 if (TRI->isVectorRegister(*MRI, Op.getReg())) { in getRegInterval()
753 Result.first = Reg - Encoding.VGPR0; in getRegInterval()
754 if (TRI->isAGPR(*MRI, Op.getReg())) in getRegInterval()
757 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { in getRegInterval()
759 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS; in getRegInterval()
764 // else if (TRI->isTTMP(*MRI, Op.getReg())) ... in getRegInterval()
766 return {-1, -1}; in getRegInterval()
768 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); in getRegInterval()
769 unsigned Size = TRI->getRegSizeInBits(*RC); in getRegInterval()
781 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg())); in setExpScore()
799 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message. in updateByEvent()
806 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { in updateByEvent()
811 if (AddrOpIdx != -1) { in updateByEvent()
835 TRI->isVectorRegister(*MRI, Op.getReg())) { in updateByEvent()
840 } else if (TII->isFLAT(Inst)) { in updateByEvent()
852 } else if (TII->isMIMG(Inst)) { in updateByEvent()
861 } else if (TII->isMTBUF(Inst)) { in updateByEvent()
865 } else if (TII->isMUBUF(Inst)) { in updateByEvent()
874 } else if (TII->isLDSDIR(Inst)) { in updateByEvent()
881 if (TII->isEXP(Inst)) { in updateByEvent()
889 TRI->isVGPR(*MRI, DefMO.getReg())) { in updateByEvent()
891 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)), in updateByEvent()
899 TRI->isVectorRegister(*MRI, MO.getReg())) { in updateByEvent()
918 assert(TRI->isVectorRegister(*MRI, Op.getReg())); in updateByEvent()
929 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { in updateByEvent()
934 if (!MemOp->isStore() || in updateByEvent()
935 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS) in updateByEvent()
939 auto AAI = MemOp->getAAInfo(); in updateByEvent()
950 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) { in updateByEvent()
951 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) { in updateByEvent()
957 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1) in updateByEvent()
977 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT(" in print()
981 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT(" in print()
988 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT(" in print()
1013 unsigned RelScore = RegScore - LB - 1; in print()
1026 unsigned RelScore = RegScore - LB - 1; in print()
1051 // as (UB - LB). If the current Count is greater than or equal to the number in simplifyWaitcnt()
1067 !ST->hasFlatLgkmVMemCountInOrder()) { in determineWait()
1073 // Counter can get decremented out-of-order when there in determineWait()
1079 // MAX(CounterType) - 1 instead. in determineWait()
1080 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); in determineWait()
1103 setScoreLB(T, std::max(getScoreLB(T), UB - Count)); in applyWaitcnt()
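A worked numeric example of the score-bracket arithmetic used by simplifyWaitcnt, determineWait and applyWaitcnt above (all values invented):

  // Worked example (all values invented):
  //   ScoreLB = 10, ScoreUB = 14      -> 4 events still pending (UB - LB), so
  //                                      simplifyWaitcnt drops any requested wait >= 4
  //   register score to wait on = 12  -> NeededWait = UB - ScoreToWait = 2, i.e.
  //                                      wait until only the 2 younger events remain
  //   applyWaitcnt(T, 2)              -> ScoreLB = max(10, 14 - 2) = 12, so later
  //                                      waits on scores <= 12 are already satisfied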
1148 /// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1172 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode()); in promoteSoftWaitCnt()
1173 if (Opcode == Waitcnt->getOpcode()) in promoteSoftWaitCnt()
1176 Waitcnt->setDesc(TII->get(Opcode)); in promoteSoftWaitCnt()
1181 /// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1182 /// from \p Wait that were added by previous passes. Currently this pass
1223 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); in applyPreexistingWaitcnt()
1248 LLVM_DEBUG(It == WaitcntInstr->getParent()->end() in applyPreexistingWaitcnt()
1265 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end() in applyPreexistingWaitcnt()
1278 /// required counters in \p Wait
1293 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); in createNewWaitcnt()
1302 assert(ST->hasVscnt()); in createNewWaitcnt()
1305 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) in createNewWaitcnt()
1320 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u); in getAllZeroWaitcnt()
1328 /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1329 /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1363 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); in applyPreexistingWaitcnt()
1371 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); in applyPreexistingWaitcnt()
1381 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); in applyPreexistingWaitcnt()
1415 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() in applyPreexistingWaitcnt()
1423 CombinedLoadDsCntInstr->eraseFromParent(); in applyPreexistingWaitcnt()
1440 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() in applyPreexistingWaitcnt()
1448 CombinedStoreDsCntInstr->eraseFromParent(); in applyPreexistingWaitcnt()
1480 (*WI)->eraseFromParent(); in applyPreexistingWaitcnt()
1499 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() in applyPreexistingWaitcnt()
1507 WaitInstrs[CT]->eraseFromParent(); in applyPreexistingWaitcnt()
1515 /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1532 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) in createNewWaitcnt()
1541 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT)) in createNewWaitcnt()
1566 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) in createNewWaitcnt()
1603 /// We rely on this in-order completion
1642 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); in generateWaitcntInstBefore()
1652 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() && in generateWaitcntInstBefore()
1657 // Resolve vm waits before gs-done. in generateWaitcntInstBefore()
1660 ST->hasLegacyGeometry() && in generateWaitcntInstBefore()
1668 // The shader program must flush all EXP operations on the export-count in generateWaitcntInstBefore()
1701 if (RtnAddrOpIdx != -1) { in generateWaitcntInstBefore()
1726 const Value *Ptr = Memop->getValue(); in generateWaitcntInstBefore()
1727 if (Memop->isStore() && SLoadAddresses.count(Ptr)) { in generateWaitcntInstBefore()
1729 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second)) in generateWaitcntInstBefore()
1732 unsigned AS = Memop->getAddrSpace(); in generateWaitcntInstBefore()
1736 if (TII->mayWriteLDSThroughDMA(MI)) in generateWaitcntInstBefore()
1748 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) { in generateWaitcntInstBefore()
1759 if (Memop->isStore()) { in generateWaitcntInstBefore()
1771 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) in generateWaitcntInstBefore()
1776 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); in generateWaitcntInstBefore()
1786 !ST->hasVmemWriteVgprInOrder()) { in generateWaitcntInstBefore()
1808 if (TII->isBarrierStart(MI.getOpcode()) && in generateWaitcntInstBefore()
1809 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { in generateWaitcntInstBefore()
1810 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); in generateWaitcntInstBefore()
1813 // TODO: Remove this work-around, enable the assert for Bug 457939 in generateWaitcntInstBefore()
1816 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { in generateWaitcntInstBefore()
1826 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false); in generateWaitcntInstBefore()
1865 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); in generateWaitcnt()
1875 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp); in generateWaitcnt()
1876 if (Wait.ExpCnt < WaitExp->getImm()) { in generateWaitcnt()
1877 WaitExp->setImm(Wait.ExpCnt); in generateWaitcnt()
1886 if (WCG->createNewWaitcnt(Block, It, Wait)) in generateWaitcnt()
1896 assert(TII->isFLAT(MI)); in mayAccessVMEMThroughFlat()
1899 assert(TII->usesVM_CNT(MI)); in mayAccessVMEMThroughFlat()
1912 unsigned AS = Memop->getAddrSpace(); in mayAccessVMEMThroughFlat()
1924 assert(TII->isFLAT(MI)); in mayAccessLDSThroughFlat()
1927 if (!TII->usesLGKM_CNT(MI)) in mayAccessLDSThroughFlat()
1931 if (ST->isTgSplitEnabled()) in mayAccessLDSThroughFlat()
1941 unsigned AS = Memop->getAddrSpace(); in mayAccessLDSThroughFlat()
1953 assert(TII->isFLAT(MI)); in mayAccessScratchThroughFlat()
1956 if (TII->isFLATScratch(MI)) in mayAccessScratchThroughFlat()
1960 if (TII->isFLATGlobal(MI)) in mayAccessScratchThroughFlat()
1970 unsigned AS = Memop->getAddrSpace(); in mayAccessScratchThroughFlat()
1984 // instruction, update the upper-bound of the appropriate counter's in updateEventWaitcntAfter()
1988 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { in updateEventWaitcntAfter()
1989 if (TII->isAlwaysGDS(Inst.getOpcode()) || in updateEventWaitcntAfter()
1990 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { in updateEventWaitcntAfter()
1991 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); in updateEventWaitcntAfter()
1992 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); in updateEventWaitcntAfter()
1994 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); in updateEventWaitcntAfter()
1996 } else if (TII->isFLAT(Inst)) { in updateEventWaitcntAfter()
2007 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), in updateEventWaitcntAfter()
2013 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); in updateEventWaitcntAfter()
2020 // - it will require that both the VM and LGKM be flushed to zero if it is in updateEventWaitcntAfter()
2023 ScoreBrackets->setPendingFlat(); in updateEventWaitcntAfter()
2026 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), in updateEventWaitcntAfter()
2029 if (ST->vmemWriteNeedsExpWaitcnt() && in updateEventWaitcntAfter()
2031 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); in updateEventWaitcntAfter()
2033 } else if (TII->isSMRD(Inst)) { in updateEventWaitcntAfter()
2034 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); in updateEventWaitcntAfter()
2038 ScoreBrackets->applyWaitcnt( in updateEventWaitcntAfter()
2039 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); in updateEventWaitcntAfter()
2040 ScoreBrackets->setStateOnFunctionEntryOrReturn(); in updateEventWaitcntAfter()
2043 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); in updateEventWaitcntAfter()
2046 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst); in updateEventWaitcntAfter()
2047 } else if (TII->isVINTERP(Inst)) { in updateEventWaitcntAfter()
2048 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm(); in updateEventWaitcntAfter()
2049 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm); in updateEventWaitcntAfter()
2051 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm(); in updateEventWaitcntAfter()
2053 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst); in updateEventWaitcntAfter()
2055 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); in updateEventWaitcntAfter()
2057 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); in updateEventWaitcntAfter()
2064 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst); in updateEventWaitcntAfter()
2073 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); in updateEventWaitcntAfter()
2088 /// Merge the pending events and associated score brackets of \p Other into
2108 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T]; in merge()
2109 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; in merge()
2117 M.MyShift = NewUB - ScoreUBs[T]; in merge()
2118 M.OtherShift = NewUB - Other.ScoreUBs[T]; in merge()
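A worked example of this relocation, assuming NewUB is this bracket's lower bound plus the larger of the two pending counts (which is how the surrounding, un-excerpted code appears to choose it); all numbers are invented:

  // Worked example (values invented; NewUB formula assumed from surrounding code):
  //   this:  ScoreLB = 20, ScoreUB = 25  -> MyPending    = 5
  //   other: ScoreLB = 4,  ScoreUB = 6   -> OtherPending = 2
  //   NewUB = 20 + max(5, 2) = 25
  //     MyShift    = 25 - 25 = 0   (this side's scores already sit on the common scale)
  //     OtherShift = 25 - 6  = 19  (the other side's scores 5 and 6 become 24 and 25)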
2164 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and in insertWaitcntInBlock()
2165 // ST->partialVCCWritesUpdateVCCZ(). in insertWaitcntInBlock()
2167 if (ST->hasReadVCCZBug()) { in insertWaitcntInBlock()
2169 // to vcc and then issued an smem load. in insertWaitcntInBlock()
2171 } else if (!ST->partialVCCWritesUpdateVCCZ()) { in insertWaitcntInBlock()
2185 // Track pre-existing waitcnts that were added in earlier iterations or by in insertWaitcntInBlock()
2206 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) { in insertWaitcntInBlock()
2210 if (!ST->partialVCCWritesUpdateVCCZ()) in insertWaitcntInBlock()
2212 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) { in insertWaitcntInBlock()
2219 // of vcc back to vcc. in insertWaitcntInBlock()
2220 if (ST->hasReadVCCZBug() && in insertWaitcntInBlock()
2222 // Writes to vcc while there's an outstanding smem read may get in insertWaitcntInBlock()
2226 // Writes to vcc will fix any incorrect value in vccz. in insertWaitcntInBlock()
2232 if (TII->isSMRD(Inst)) { in insertWaitcntInBlock()
2236 if (!Memop->isInvariant()) { in insertWaitcntInBlock()
2237 const Value *Ptr = Memop->getValue(); in insertWaitcntInBlock()
2241 if (ST->hasReadVCCZBug()) { in insertWaitcntInBlock()
2249 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { in insertWaitcntInBlock()
2250 AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt( in insertWaitcntInBlock()
2262 // TODO: Remove this work-around after fixing the scheduler and enable the in insertWaitcntInBlock()
2265 // Restore the vccz bit. Any time a value is written to vcc, the vcc in insertWaitcntInBlock()
2267 // vcc and then writing it back to the register. in insertWaitcntInBlock()
2269 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), in insertWaitcntInBlock()
2270 TRI->getVCC()) in insertWaitcntInBlock()
2271 .addReg(TRI->getVCC()); in insertWaitcntInBlock()
2305 return Iterator->second; in isPreheaderToFlush()
2311 MachineLoop *Loop = MLI->getLoopFor(Succ); in isPreheaderToFlush()
2315 if (Loop->getLoopPreheader() == &MBB && in isPreheaderToFlush()
2317 Iterator->second = true; in isPreheaderToFlush()
2345 for (MachineBasicBlock *MBB : ML->blocks()) { in shouldFlushVmCnt()
2355 if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg())) in shouldFlushVmCnt()
2391 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) in shouldFlushVmCnt()
2393 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder(); in shouldFlushVmCnt()
2398 TII = ST->getInstrInfo(); in runOnMachineFunction()
2399 TRI = &TII->getRegisterInfo(); in runOnMachineFunction()
2405 AA = &AAR->getAAResults(); in runOnMachineFunction()
2407 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); in runOnMachineFunction()
2409 if (ST->hasExtendedWaitCounts()) { in runOnMachineFunction()
2423 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); in runOnMachineFunction()
2428 if (ST->hasExtendedWaitCounts()) { in runOnMachineFunction()
2441 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(); in runOnMachineFunction()
2442 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); in runOnMachineFunction()
2448 TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK; in runOnMachineFunction()
2449 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1; in runOnMachineFunction()
2451 TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK; in runOnMachineFunction()
2452 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1; in runOnMachineFunction()
2460 if (!MFI->isEntryFunction()) { in runOnMachineFunction()
2468 I != E && (I->isPHI() || I->isMetaInstruction()); ++I) in runOnMachineFunction()
2471 if (ST->hasExtendedWaitCounts()) { in runOnMachineFunction()
2472 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) in runOnMachineFunction()
2479 TII->get(instrsForExtendedCounterTypes[CT])) in runOnMachineFunction()
2483 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); in runOnMachineFunction()
2489 NonKernelInitialState->setStateOnFunctionEntryOrReturn(); in runOnMachineFunction()
2507 MachineBasicBlock *MBB = BII->first; in runOnMachineFunction()
2508 BlockInfo &BI = BII->second; in runOnMachineFunction()
2530 if (Brackets->hasPendingEvent()) { in runOnMachineFunction()
2532 for (MachineBasicBlock *Succ : MBB->successors()) { in runOnMachineFunction()
2534 BlockInfo &SuccBI = SuccBII->second; in runOnMachineFunction()
2544 } else if (SuccBI.Incoming->merge(*Brackets)) { in runOnMachineFunction()
2551 MoveBracketsToSucc->Incoming = std::move(Brackets); in runOnMachineFunction()
2556 if (ST->hasScalarStores()) { in runOnMachineFunction()
2562 if (!HaveScalarStores && TII->isScalarStore(MI)) in runOnMachineFunction()
2583 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); in runOnMachineFunction()
2585 if (I->getOpcode() == AMDGPU::S_DCACHE_WB) in runOnMachineFunction()
2587 else if (TII->isScalarStore(*I)) in runOnMachineFunction()
2591 if ((I->getOpcode() == AMDGPU::S_ENDPGM || in runOnMachineFunction()
2592 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && in runOnMachineFunction()
2595 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); in runOnMachineFunction()
2605 if (ST->requiresNopBeforeDeallocVGPRs()) { in runOnMachineFunction()
2606 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP)) in runOnMachineFunction()
2609 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), in runOnMachineFunction()
2610 TII->get(AMDGPU::S_SENDMSG)) in runOnMachineFunction()