1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
17 /// finely-grained approach that keeps one timeline per event type could
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
24 //===----------------------------------------------------------------------===//
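The trade-off described in the header comment above can be made concrete with a purely illustrative sequence (it is not taken from this file): LDS results return in order, so waiting for the i-th-last LDS access only needs lgkmcnt(i), yet the single per-counter timeline forces lgkmcnt(0) once SMEM and LDS events are mixed.

  // Illustrative only -- a hypothetical sequence, not code from this pass:
  //   ds_read_b32   v0, v10        ; oldest lgkm event
  //   s_load_dword  s0, s[2:3], 0  ; SMEM also counts against lgkmcnt
  //   ds_read_b32   v1, v11        ; youngest lgkm event
  //   <use of v0>
  // lgkmcnt(1) already guarantees v0 is ready: if the first ds_read were still
  // outstanding, the younger ds_read could not have completed either, so at
  // least two lgkm events would remain. Because the pass keeps one timeline
  // per counter and cannot separate the SMEM completion from the LDS ones, it
  // conservatively emits s_waitcnt lgkmcnt(0) instead.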
42 #define DEBUG_TYPE "si-insert-waitcnts"
44 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
46 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
48 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
52 "amdgpu-waitcnt-forcezero",
109 VMEM_ACCESS, // vector-memory read & write
110 VMEM_READ_ACCESS, // vector-memory read
111 VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
112 VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
113 VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
114 SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
118 SMEM_ACCESS, // scalar-memory read & write
123 VMW_GPR_LOCK, // vector-memory write holding on its data src
129 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
130 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
131 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
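A small worked illustration of this flat index space, consistent with the mapping visible in getRegInterval() further down (register numbers are invented for the example):

  // Illustrative mapping (register numbers invented):
  //   VGPR12          -> slot 12                    (real VGPR range)
  //   LDS-DMA slot #2 -> slot SQ_MAX_PGM_VGPRS + 2  (extra VGPR-like slots)
  //   SGPR5           -> slot NUM_ALL_VGPRS + 5     (SGPRs follow all VGPR-like slots)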
148 // Enumerate different types of result-returning VMEM operations. Although
151 // their results in order -- so there is no need to insert an s_waitcnt between
189 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); in getVmemType()
193 return BaseInfo->BVH ? VMEM_BVH in getVmemType()
194 : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER in getVmemType()
242 // a per-register scoreboard for each wait counter.
292 return getScoreUB(T) - getScoreLB(T); in getScoreRange()
300 return SgprScores[GprNo - NUM_ALL_VGPRS]; in getRegScore()
332 return Events & (Events - 1); in hasMixedPendingEvents()
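The expression above is the usual clear-lowest-set-bit trick: it is non-zero exactly when more than one event bit is set, which is what "mixed pending events" means here.

  // Standalone illustration, not from the pass:
  //   Events = 0b00100 -> Events & (Events - 1) == 0b00000  (single pending event type)
  //   Events = 0b00110 -> Events & (Events - 1) == 0b00100  (mixed pending event types)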
394 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); in setScoreUB()
403 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS); in setRegScore()
404 SgprScores[GprNo - NUM_ALL_VGPRS] = Val; in setRegScore()
425 int VgprUB = -1;
426 int SgprUB = -1;
429 // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
436 SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
456 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()), in WaitcntGenerator()
457 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter), in WaitcntGenerator()
604 // because of amdgpu-waitcnt-forcezero flag
609 // generator objects, which must have been re-initialised before use
658 // For non-debug builds, ForceEmitWaitcnt has been initialized to false; in setForceEmitWaitcnt()
700 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst)) in getVmemWaitEventType()
710 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst)) in getVmemWaitEventType()
738 const MachineOperand &Op = MI->getOperand(OpNo); in getRegInterval()
739 if (!TRI->isInAllocatableClass(Op.getReg())) in getRegInterval()
740 return {-1, -1}; in getRegInterval()
748 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) & in getRegInterval()
751 if (TRI->isVectorRegister(*MRI, Op.getReg())) { in getRegInterval()
753 Result.first = Reg - Encoding.VGPR0; in getRegInterval()
754 if (TRI->isAGPR(*MRI, Op.getReg())) in getRegInterval()
757 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { in getRegInterval()
759 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS; in getRegInterval()
764 // else if (TRI->isTTMP(*MRI, Op.getReg())) ... in getRegInterval()
766 return {-1, -1}; in getRegInterval()
768 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); in getRegInterval()
769 unsigned Size = TRI->getRegSizeInBits(*RC); in getRegInterval()
781 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg())); in setExpScore()
799 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message. in updateByEvent()
806 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { in updateByEvent()
811 if (AddrOpIdx != -1) { in updateByEvent()
835 TRI->isVectorRegister(*MRI, Op.getReg())) { in updateByEvent()
840 } else if (TII->isFLAT(Inst)) { in updateByEvent()
852 } else if (TII->isMIMG(Inst)) { in updateByEvent()
861 } else if (TII->isMTBUF(Inst)) { in updateByEvent()
865 } else if (TII->isMUBUF(Inst)) { in updateByEvent()
874 } else if (TII->isLDSDIR(Inst)) { in updateByEvent()
881 if (TII->isEXP(Inst)) { in updateByEvent()
889 TRI->isVGPR(*MRI, DefMO.getReg())) { in updateByEvent()
891 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)), in updateByEvent()
899 TRI->isVectorRegister(*MRI, MO.getReg())) { in updateByEvent()
918 assert(TRI->isVectorRegister(*MRI, Op.getReg())); in updateByEvent()
929 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { in updateByEvent()
934 if (!MemOp->isStore() || in updateByEvent()
935 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS) in updateByEvent()
939 auto AAI = MemOp->getAAInfo(); in updateByEvent()
950 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) { in updateByEvent()
951 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) { in updateByEvent()
957 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1) in updateByEvent()
977 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT(" in print()
981 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT(" in print()
988 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT(" in print()
1013 unsigned RelScore = RegScore - LB - 1; in print()
1026 unsigned RelScore = RegScore - LB - 1; in print()
1051 // as (UB - LB). If the current Count is greater than or equal to the number in simplifyWaitcnt()
1067 !ST->hasFlatLgkmVMemCountInOrder()) { in determineWait()
1073 // Counter can get decremented out-of-order when there in determineWait()
1079 // MAX(CounterType) - 1 instead. in determineWait()
1080 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); in determineWait()
1103 setScoreLB(T, std::max(getScoreLB(T), UB - Count)); in applyWaitcnt()
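A worked numeric example of the score-bracket arithmetic used by simplifyWaitcnt, determineWait and applyWaitcnt above (all values invented):

  // Worked example (all values invented):
  //   ScoreLB = 10, ScoreUB = 14      -> 4 events still pending (UB - LB), so
  //                                      simplifyWaitcnt drops any requested wait >= 4
  //   register score to wait on = 12  -> NeededWait = UB - ScoreToWait = 2, i.e.
  //                                      wait until only the 2 younger events remain
  //   applyWaitcnt(T, 2)              -> ScoreLB = max(10, 14 - 2) = 12, so later
  //                                      waits on scores <= 12 are already satisfied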
1148 /// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1172 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode()); in promoteSoftWaitCnt()
1173 if (Opcode == Waitcnt->getOpcode()) in promoteSoftWaitCnt()
1176 Waitcnt->setDesc(TII->get(Opcode)); in promoteSoftWaitCnt()
1181 /// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1182 /// from \p Wait that were added by previous passes. Currently this pass
1223 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); in applyPreexistingWaitcnt()
1248 LLVM_DEBUG(It == WaitcntInstr->getParent()->end() in applyPreexistingWaitcnt()
1265 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end() in applyPreexistingWaitcnt()
1278 /// required counters in \p Wait
1293 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); in createNewWaitcnt()
1302 assert(ST->hasVscnt()); in createNewWaitcnt()
1305 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) in createNewWaitcnt()
1320 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u); in getAllZeroWaitcnt()
1328 /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1329 /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1363 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); in applyPreexistingWaitcnt()
1371 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); in applyPreexistingWaitcnt()
1381 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); in applyPreexistingWaitcnt()
1415 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() in applyPreexistingWaitcnt()
1423 CombinedLoadDsCntInstr->eraseFromParent(); in applyPreexistingWaitcnt()
1440 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() in applyPreexistingWaitcnt()
1448 CombinedStoreDsCntInstr->eraseFromParent(); in applyPreexistingWaitcnt()
1480 (*WI)->eraseFromParent(); in applyPreexistingWaitcnt()
1499 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() in applyPreexistingWaitcnt()
1507 WaitInstrs[CT]->eraseFromParent(); in applyPreexistingWaitcnt()
1515 /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1532 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) in createNewWaitcnt()
1541 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT)) in createNewWaitcnt()
1566 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) in createNewWaitcnt()
1603 /// We rely on this in-order completion
1642 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); in generateWaitcntInstBefore()
1652 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() && in generateWaitcntInstBefore()
1657 // Resolve vm waits before gs-done. in generateWaitcntInstBefore()
1660 ST->hasLegacyGeometry() && in generateWaitcntInstBefore()
1668 // The shader program must flush all EXP operations on the export-count in generateWaitcntInstBefore()
1701 if (RtnAddrOpIdx != -1) { in generateWaitcntInstBefore()
1726 const Value *Ptr = Memop->getValue(); in generateWaitcntInstBefore()
1727 if (Memop->isStore() && SLoadAddresses.count(Ptr)) { in generateWaitcntInstBefore()
1729 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second)) in generateWaitcntInstBefore()
1732 unsigned AS = Memop->getAddrSpace(); in generateWaitcntInstBefore()
1736 if (TII->mayWriteLDSThroughDMA(MI)) in generateWaitcntInstBefore()
1748 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) { in generateWaitcntInstBefore()
1759 if (Memop->isStore()) { in generateWaitcntInstBefore()
1771 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) in generateWaitcntInstBefore()
1776 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); in generateWaitcntInstBefore()
1786 !ST->hasVmemWriteVgprInOrder()) { in generateWaitcntInstBefore()
1808 if (TII->isBarrierStart(MI.getOpcode()) && in generateWaitcntInstBefore()
1809 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { in generateWaitcntInstBefore()
1810 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); in generateWaitcntInstBefore()
1813 // TODO: Remove this work-around, enable the assert for Bug 457939 in generateWaitcntInstBefore()
1816 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { in generateWaitcntInstBefore()
1826 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false); in generateWaitcntInstBefore()
1865 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); in generateWaitcnt()
1875 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp); in generateWaitcnt()
1876 if (Wait.ExpCnt < WaitExp->getImm()) { in generateWaitcnt()
1877 WaitExp->setImm(Wait.ExpCnt); in generateWaitcnt()
1886 if (WCG->createNewWaitcnt(Block, It, Wait)) in generateWaitcnt()
1896 assert(TII->isFLAT(MI)); in mayAccessVMEMThroughFlat()
1899 assert(TII->usesVM_CNT(MI)); in mayAccessVMEMThroughFlat()
1912 unsigned AS = Memop->getAddrSpace(); in mayAccessVMEMThroughFlat()
1924 assert(TII->isFLAT(MI)); in mayAccessLDSThroughFlat()
1927 if (!TII->usesLGKM_CNT(MI)) in mayAccessLDSThroughFlat()
1931 if (ST->isTgSplitEnabled()) in mayAccessLDSThroughFlat()
1941 unsigned AS = Memop->getAddrSpace(); in mayAccessLDSThroughFlat()
1953 assert(TII->isFLAT(MI)); in mayAccessScratchThroughFlat()
1956 if (TII->isFLATScratch(MI)) in mayAccessScratchThroughFlat()
1960 if (TII->isFLATGlobal(MI)) in mayAccessScratchThroughFlat()
1970 unsigned AS = Memop->getAddrSpace(); in mayAccessScratchThroughFlat()
1984 // instruction, update the upper-bound of the appropriate counter's in updateEventWaitcntAfter()
1988 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { in updateEventWaitcntAfter()
1989 if (TII->isAlwaysGDS(Inst.getOpcode()) || in updateEventWaitcntAfter()
1990 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { in updateEventWaitcntAfter()
1991 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); in updateEventWaitcntAfter()
1992 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); in updateEventWaitcntAfter()
1994 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); in updateEventWaitcntAfter()
1996 } else if (TII->isFLAT(Inst)) { in updateEventWaitcntAfter()
2007 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), in updateEventWaitcntAfter()
2013 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); in updateEventWaitcntAfter()
2020 // - it will require that both the VM and LGKM be flushed to zero if it is in updateEventWaitcntAfter()
2023 ScoreBrackets->setPendingFlat(); in updateEventWaitcntAfter()
2026 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), in updateEventWaitcntAfter()
2029 if (ST->vmemWriteNeedsExpWaitcnt() && in updateEventWaitcntAfter()
2031 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); in updateEventWaitcntAfter()
2033 } else if (TII->isSMRD(Inst)) { in updateEventWaitcntAfter()
2034 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); in updateEventWaitcntAfter()
2038 ScoreBrackets->applyWaitcnt( in updateEventWaitcntAfter()
2039 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); in updateEventWaitcntAfter()
2040 ScoreBrackets->setStateOnFunctionEntryOrReturn(); in updateEventWaitcntAfter()
2043 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); in updateEventWaitcntAfter()
2046 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst); in updateEventWaitcntAfter()
2047 } else if (TII->isVINTERP(Inst)) { in updateEventWaitcntAfter()
2048 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm(); in updateEventWaitcntAfter()
2049 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm); in updateEventWaitcntAfter()
2051 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm(); in updateEventWaitcntAfter()
2053 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst); in updateEventWaitcntAfter()
2055 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); in updateEventWaitcntAfter()
2057 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); in updateEventWaitcntAfter()
2064 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst); in updateEventWaitcntAfter()
2073 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); in updateEventWaitcntAfter()
2088 /// Merge the pending events and associated score brackets of \p Other into
2108 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T]; in merge()
2109 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; in merge()
2117 M.MyShift = NewUB - ScoreUBs[T]; in merge()
2118 M.OtherShift = NewUB - Other.ScoreUBs[T]; in merge()
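A worked example of this relocation, assuming NewUB is this bracket's lower bound plus the larger of the two pending counts (which is how the surrounding, un-excerpted code appears to choose it); all numbers are invented:

  // Worked example (values invented; NewUB formula assumed from surrounding code):
  //   this:  ScoreLB = 20, ScoreUB = 25  -> MyPending    = 5
  //   other: ScoreLB = 4,  ScoreUB = 6   -> OtherPending = 2
  //   NewUB = 20 + max(5, 2) = 25
  //     MyShift    = 25 - 25 = 0   (this side's scores already sit on the common scale)
  //     OtherShift = 25 - 6  = 19  (the other side's scores 5 and 6 become 24 and 25)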
2164 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and in insertWaitcntInBlock()
2165 // ST->partialVCCWritesUpdateVCCZ(). in insertWaitcntInBlock()
2167 if (ST->hasReadVCCZBug()) { in insertWaitcntInBlock()
2169 // to vcc and then issued an smem load. in insertWaitcntInBlock()
2171 } else if (!ST->partialVCCWritesUpdateVCCZ()) { in insertWaitcntInBlock()
2185 // Track pre-existing waitcnts that were added in earlier iterations or by in insertWaitcntInBlock()
2206 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) { in insertWaitcntInBlock()
2210 if (!ST->partialVCCWritesUpdateVCCZ()) in insertWaitcntInBlock()
2212 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) { in insertWaitcntInBlock()
2219 // of vcc back to vcc. in insertWaitcntInBlock()
2220 if (ST->hasReadVCCZBug() && in insertWaitcntInBlock()
2222 // Writes to vcc while there's an outstanding smem read may get in insertWaitcntInBlock()
2226 // Writes to vcc will fix any incorrect value in vccz. in insertWaitcntInBlock()
2232 if (TII->isSMRD(Inst)) { in insertWaitcntInBlock()
2236 if (!Memop->isInvariant()) { in insertWaitcntInBlock()
2237 const Value *Ptr = Memop->getValue(); in insertWaitcntInBlock()
2241 if (ST->hasReadVCCZBug()) { in insertWaitcntInBlock()
2249 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { in insertWaitcntInBlock()
2250 AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt( in insertWaitcntInBlock()
2262 // TODO: Remove this work-around after fixing the scheduler and enable the in insertWaitcntInBlock()
2265 // Restore the vccz bit. Any time a value is written to vcc, the vcc in insertWaitcntInBlock()
2267 // vcc and then writing it back to the register. in insertWaitcntInBlock()
2269 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), in insertWaitcntInBlock()
2270 TRI->getVCC()) in insertWaitcntInBlock()
2271 .addReg(TRI->getVCC()); in insertWaitcntInBlock()
2305 return Iterator->second; in isPreheaderToFlush()
2311 MachineLoop *Loop = MLI->getLoopFor(Succ); in isPreheaderToFlush()
2315 if (Loop->getLoopPreheader() == &MBB && in isPreheaderToFlush()
2317 Iterator->second = true; in isPreheaderToFlush()
2345 for (MachineBasicBlock *MBB : ML->blocks()) { in shouldFlushVmCnt()
2355 if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg())) in shouldFlushVmCnt()
2391 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) in shouldFlushVmCnt()
2393 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder(); in shouldFlushVmCnt()
2398 TII = ST->getInstrInfo(); in runOnMachineFunction()
2399 TRI = &TII->getRegisterInfo(); in runOnMachineFunction()
2405 AA = &AAR->getAAResults(); in runOnMachineFunction()
2407 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); in runOnMachineFunction()
2409 if (ST->hasExtendedWaitCounts()) { in runOnMachineFunction()
2423 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); in runOnMachineFunction()
2428 if (ST->hasExtendedWaitCounts()) { in runOnMachineFunction()
2441 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(); in runOnMachineFunction()
2442 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); in runOnMachineFunction()
2448 TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK; in runOnMachineFunction()
2449 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1; in runOnMachineFunction()
2451 TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK; in runOnMachineFunction()
2452 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1; in runOnMachineFunction()
2460 if (!MFI->isEntryFunction()) { in runOnMachineFunction()
2468 I != E && (I->isPHI() || I->isMetaInstruction()); ++I) in runOnMachineFunction()
2471 if (ST->hasExtendedWaitCounts()) { in runOnMachineFunction()
2472 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) in runOnMachineFunction()
2479 TII->get(instrsForExtendedCounterTypes[CT])) in runOnMachineFunction()
2483 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); in runOnMachineFunction()
2489 NonKernelInitialState->setStateOnFunctionEntryOrReturn(); in runOnMachineFunction()
2507 MachineBasicBlock *MBB = BII->first; in runOnMachineFunction()
2508 BlockInfo &BI = BII->second; in runOnMachineFunction()
2530 if (Brackets->hasPendingEvent()) { in runOnMachineFunction()
2532 for (MachineBasicBlock *Succ : MBB->successors()) { in runOnMachineFunction()
2534 BlockInfo &SuccBI = SuccBII->second; in runOnMachineFunction()
2544 } else if (SuccBI.Incoming->merge(*Brackets)) { in runOnMachineFunction()
2551 MoveBracketsToSucc->Incoming = std::move(Brackets); in runOnMachineFunction()
2556 if (ST->hasScalarStores()) { in runOnMachineFunction()
2562 if (!HaveScalarStores && TII->isScalarStore(MI)) in runOnMachineFunction()
2583 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); in runOnMachineFunction()
2585 if (I->getOpcode() == AMDGPU::S_DCACHE_WB) in runOnMachineFunction()
2587 else if (TII->isScalarStore(*I)) in runOnMachineFunction()
2591 if ((I->getOpcode() == AMDGPU::S_ENDPGM || in runOnMachineFunction()
2592 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && in runOnMachineFunction()
2595 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); in runOnMachineFunction()
2605 if (ST->requiresNopBeforeDeallocVGPRs()) { in runOnMachineFunction()
2606 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP)) in runOnMachineFunction()
2609 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), in runOnMachineFunction()
2610 TII->get(AMDGPU::S_SENDMSG)) in runOnMachineFunction()