Lines Matching full:wait
1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
10 /// Insert wait instructions for memory reads and writes.
19 /// example, when both SMEM and LDS are in flight and we need to wait for
198 unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { in getCounterRef() argument
201 return Wait.LoadCnt; in getCounterRef()
203 return Wait.ExpCnt; in getCounterRef()
205 return Wait.DsCnt; in getCounterRef()
207 return Wait.StoreCnt; in getCounterRef()
209 return Wait.SampleCnt; in getCounterRef()
211 return Wait.BvhCnt; in getCounterRef()
213 return Wait.KmCnt; in getCounterRef()
219 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { in addWait() argument
220 unsigned &WC = getCounterRef(Wait, T); in addWait()
224 void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { in setNoWait() argument
225 getCounterRef(Wait, T) = ~0u; in setNoWait()
228 unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { in getWait() argument
229 return getCounterRef(Wait, T); in getWait()
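The helper hits above all route per-counter reads and writes through one reference-returning accessor. The standalone sketch below illustrates that pattern with a simplified Waitcnt struct and counter enum of my own (not the actual AMDGPU definitions); as in the pass, ~0u stands for "no wait required".

#include <algorithm>
#include <cassert>
#include <cstdio>

enum InstCounterType { LOAD_CNT, EXP_CNT, DS_CNT, STORE_CNT };

struct Waitcnt {
  unsigned LoadCnt = ~0u;
  unsigned ExpCnt = ~0u;
  unsigned DsCnt = ~0u;
  unsigned StoreCnt = ~0u;
};

// Map a counter kind to the corresponding field so callers can read or
// write it generically.
unsigned &getCounterRef(Waitcnt &Wait, InstCounterType T) {
  switch (T) {
  case LOAD_CNT:  return Wait.LoadCnt;
  case EXP_CNT:   return Wait.ExpCnt;
  case DS_CNT:    return Wait.DsCnt;
  case STORE_CNT: return Wait.StoreCnt;
  }
  assert(false && "unhandled counter");
  return Wait.LoadCnt;
}

void addWait(Waitcnt &Wait, InstCounterType T, unsigned Count) {
  unsigned &WC = getCounterRef(Wait, T);
  WC = std::min(WC, Count); // smaller count = stricter wait
}

void setNoWait(Waitcnt &Wait, InstCounterType T) { getCounterRef(Wait, T) = ~0u; }

unsigned getWait(Waitcnt &Wait, InstCounterType T) { return getCounterRef(Wait, T); }

int main() {
  Waitcnt W;
  addWait(W, LOAD_CNT, 3);
  addWait(W, LOAD_CNT, 1); // stricter requirement wins
  std::printf("loadcnt=%u\n", getWait(W, LOAD_CNT));   // 1
  setNoWait(W, LOAD_CNT);
  std::printf("loadcnt=0x%x\n", getWait(W, LOAD_CNT)); // 0xffffffff (no wait)
}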
241 // This object maintains the current score brackets of each wait counter, and
242 // a per-register scoreboard for each wait counter.
247 // wait count may get decreased out of order, therefore we need to put in
310 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
312 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
313 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
428 // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
465 // Edits an existing sequence of wait count instructions according
467 // any new wait count instructions which may need to be generated by
472 // delete instructions if the incoming Wait value indicates they are not
473 // needed. It may also remove existing instructions for which a wait
478 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
484 // Generates new wait count instructions according to the value of
485 // Wait, returning true if any new instructions were created.
488 AMDGPU::Waitcnt Wait) = 0;
500 // Create a mask value from the initializer list of wait event types.
519 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
524 AMDGPU::Waitcnt Wait) override;
555 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
560 AMDGPU::Waitcnt Wait) override;
638 return "SI insert wait instructions"; in getPassName()
722 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
930 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the data in updateByEvent()
931 // written to LDS can be accessed. A load from LDS to VMEM does not need a wait. in updateByEvent()
1038 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { in simplifyWaitcnt()
1039 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt); in simplifyWaitcnt()
1040 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); in simplifyWaitcnt()
1041 simplifyWaitcnt(DS_CNT, Wait.DsCnt); in simplifyWaitcnt()
1042 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt); in simplifyWaitcnt()
1043 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); in simplifyWaitcnt()
1044 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt); in simplifyWaitcnt()
1045 simplifyWaitcnt(KM_CNT, Wait.KmCnt); in simplifyWaitcnt()
1052 // of outstanding events, then the wait for this counter is redundant. in simplifyWaitcnt()
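The comment above states the rule simplifyWaitcnt applies counter by counter: a wait that tolerates at least as many outstanding events as are actually pending does nothing. A minimal sketch of that rule, assuming a plain pending-event count in place of the pass's real score brackets:

#include <cstdio>

// Drop a per-counter wait if it cannot change anything: waiting until at
// most `Count` events remain is a no-op when no more than `Count` events
// are outstanding in the first place. ~0u means "no wait".
void simplifyCounter(unsigned PendingEvents, unsigned &Count) {
  if (Count != ~0u && Count >= PendingEvents)
    Count = ~0u;
}

int main() {
  unsigned LoadCnt = 2;
  simplifyCounter(/*PendingEvents=*/1, LoadCnt); // only one event in flight
  std::printf("loadcnt=0x%x\n", LoadCnt);        // 0xffffffff: wait removed
}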
1058 AMDGPU::Waitcnt &Wait) const { in determineWait()
1071 addWait(Wait, T, 0); in determineWait()
1076 addWait(Wait, T, 0); in determineWait()
1081 addWait(Wait, T, NeededWait); in determineWait()
1086 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { in applyWaitcnt() argument
1087 applyWaitcnt(LOAD_CNT, Wait.LoadCnt); in applyWaitcnt()
1088 applyWaitcnt(EXP_CNT, Wait.ExpCnt); in applyWaitcnt()
1089 applyWaitcnt(DS_CNT, Wait.DsCnt); in applyWaitcnt()
1090 applyWaitcnt(STORE_CNT, Wait.StoreCnt); in applyWaitcnt()
1091 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); in applyWaitcnt()
1092 applyWaitcnt(BVH_CNT, Wait.BvhCnt); in applyWaitcnt()
1093 applyWaitcnt(KM_CNT, Wait.KmCnt); in applyWaitcnt()
1182 /// from \p Wait that were added by previous passes. Currently this pass
1187 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { in applyPreexistingWaitcnt() argument
1203 // Update required wait count. If this is a soft waitcnt (= it was added in applyPreexistingWaitcnt()
1210 Wait = Wait.combined(OldWait); in applyPreexistingWaitcnt()
1213 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) { in applyPreexistingWaitcnt()
1226 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt); in applyPreexistingWaitcnt()
1228 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) { in applyPreexistingWaitcnt()
1238 AMDGPU::encodeWaitcnt(IV, Wait)); in applyPreexistingWaitcnt()
1241 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt); in applyPreexistingWaitcnt()
1242 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt); in applyPreexistingWaitcnt()
1243 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); in applyPreexistingWaitcnt()
1244 Wait.LoadCnt = ~0u; in applyPreexistingWaitcnt()
1245 Wait.ExpCnt = ~0u; in applyPreexistingWaitcnt()
1246 Wait.DsCnt = ~0u; in applyPreexistingWaitcnt()
1259 AMDGPU::OpName::simm16, Wait.StoreCnt); in applyPreexistingWaitcnt()
1262 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt); in applyPreexistingWaitcnt()
1263 Wait.StoreCnt = ~0u; in applyPreexistingWaitcnt()
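As the hits above show, applyPreexistingWaitcnt folds the newly required wait into waitcnt instructions already at the insertion point, taking the per-counter minimum (Wait.combined(OldWait), or std::min for the vscnt case) and then resetting the handled counters to ~0u so no duplicate wait is emitted later. The sketch below models only that merge step with a hand-rolled Waitcnt; the real simm16 encode/decode (AMDGPU::encodeWaitcnt and friends) and the MachineInstr rewriting are not modelled.

#include <algorithm>
#include <cstdio>

struct Waitcnt {
  unsigned LoadCnt, ExpCnt, DsCnt, StoreCnt;

  // Per-counter minimum: the merged wait is at least as strict as both.
  Waitcnt combined(const Waitcnt &Other) const {
    return {std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt),
            std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt)};
  }
};

int main() {
  Waitcnt Required = {2, ~0u, 0, ~0u};  // new requirement: loadcnt<=2, dscnt==0
  Waitcnt Existing = {1, 3, ~0u, ~0u};  // old s_waitcnt already waits loadcnt<=1
  Waitcnt Merged = Required.combined(Existing);
  std::printf("load=%u exp=%u ds=%u store=%u\n",
              Merged.LoadCnt, Merged.ExpCnt, Merged.DsCnt, Merged.StoreCnt);
  // After rewriting the old instruction with Merged, the pass would reset the
  // counters it now covers to ~0u so createNewWaitcnt() does not emit them again.
}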
1278 /// required counters in \p Wait
1281 AMDGPU::Waitcnt Wait) { in createNewWaitcnt() argument
1290 if (Wait.hasWaitExceptStoreCnt()) { in createNewWaitcnt()
1291 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); in createNewWaitcnt()
1301 if (Wait.hasWaitStoreCnt()) { in createNewWaitcnt()
1307 .addImm(Wait.StoreCnt); in createNewWaitcnt()
1329 /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1334 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { in applyPreexistingWaitcnt() argument
1350 // Update required wait count. If this is a soft waitcnt (= it was added in applyPreexistingWaitcnt()
1367 Wait = Wait.combined(OldWait); in applyPreexistingWaitcnt()
1375 Wait = Wait.combined(OldWait); in applyPreexistingWaitcnt()
1384 addWait(Wait, CT.value(), OldCnt); in applyPreexistingWaitcnt()
1400 // the appropriate single counter wait instruction can be inserted in applyPreexistingWaitcnt()
1402 // createNewWaitcnt(). As a side effect, resetting the wait counts will in applyPreexistingWaitcnt()
1405 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) { in applyPreexistingWaitcnt()
1406 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait); in applyPreexistingWaitcnt()
1410 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt); in applyPreexistingWaitcnt()
1411 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); in applyPreexistingWaitcnt()
1412 Wait.LoadCnt = ~0u; in applyPreexistingWaitcnt()
1413 Wait.DsCnt = ~0u; in applyPreexistingWaitcnt()
1430 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) { in applyPreexistingWaitcnt()
1431 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait); in applyPreexistingWaitcnt()
1435 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt); in applyPreexistingWaitcnt()
1436 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); in applyPreexistingWaitcnt()
1437 Wait.StoreCnt = ~0u; in applyPreexistingWaitcnt()
1438 Wait.DsCnt = ~0u; in applyPreexistingWaitcnt()
1459 if (Wait.DsCnt != ~0u) { in applyPreexistingWaitcnt()
1466 // individual wait count instructions for these. in applyPreexistingWaitcnt()
1468 if (Wait.LoadCnt != ~0u) { in applyPreexistingWaitcnt()
1471 } else if (Wait.StoreCnt != ~0u) { in applyPreexistingWaitcnt()
1490 unsigned NewCnt = getWait(Wait, CT); in applyPreexistingWaitcnt()
1497 setNoWait(Wait, CT); in applyPreexistingWaitcnt()
1515 /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1518 AMDGPU::Waitcnt Wait) { in createNewWaitcnt() argument
1525 // Check for opportunities to use combined wait instructions. in createNewWaitcnt()
1526 if (Wait.DsCnt != ~0u) { in createNewWaitcnt()
1529 if (Wait.LoadCnt != ~0u) { in createNewWaitcnt()
1530 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait); in createNewWaitcnt()
1535 Wait.LoadCnt = ~0u; in createNewWaitcnt()
1536 Wait.DsCnt = ~0u; in createNewWaitcnt()
1537 } else if (Wait.StoreCnt != ~0u) { in createNewWaitcnt()
1538 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait); in createNewWaitcnt()
1544 Wait.StoreCnt = ~0u; in createNewWaitcnt()
1545 Wait.DsCnt = ~0u; in createNewWaitcnt()
1561 unsigned Count = getWait(Wait, CT); in createNewWaitcnt()
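On GFX12, createNewWaitcnt first looks for the combined S_WAIT_LOADCNT_DSCNT / S_WAIT_STORECNT_DSCNT forms when DSCNT is needed together with LOADCNT or STORECNT, and only then falls back to single-counter waits for whatever remains. The sketch below mimics that selection order only: it returns mnemonic strings instead of building MachineInstrs and ignores the immediate packing done by encodeLoadcntDscnt/encodeStorecntDscnt.

#include <cstdio>
#include <string>
#include <vector>

struct Waitcnt { unsigned LoadCnt = ~0u, DsCnt = ~0u, StoreCnt = ~0u; };

std::vector<std::string> selectWaitOps(Waitcnt W) {
  std::vector<std::string> Ops;
  if (W.DsCnt != ~0u) {             // check for a combined-wait opportunity
    if (W.LoadCnt != ~0u) {
      Ops.push_back("s_wait_loadcnt_dscnt");
      W.LoadCnt = W.DsCnt = ~0u;    // both counters are now covered
    } else if (W.StoreCnt != ~0u) {
      Ops.push_back("s_wait_storecnt_dscnt");
      W.StoreCnt = W.DsCnt = ~0u;
    }
  }
  // Single-counter waits for anything still outstanding.
  if (W.LoadCnt != ~0u)  Ops.push_back("s_wait_loadcnt");
  if (W.DsCnt != ~0u)    Ops.push_back("s_wait_dscnt");
  if (W.StoreCnt != ~0u) Ops.push_back("s_wait_storecnt");
  return Ops;
}

int main() {
  Waitcnt W;
  W.LoadCnt = 0;
  W.DsCnt = 0;
  for (const std::string &Op : selectWaitOps(W))
    std::printf("%s\n", Op.c_str()); // one combined wait instead of two
}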
1587 // Currently all conventions wait, but this may not always be the case. in callWaitsOnFunctionEntry()
1590 // sense to omit the wait and do it in the caller. in callWaitsOnFunctionEntry()
1594 /// \returns true if the callee is expected to wait for any outstanding waits
1621 AMDGPU::Waitcnt Wait; in generateWaitcntInstBefore() local
1632 Wait.LoadCnt = 0; in generateWaitcntInstBefore()
1642 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); in generateWaitcntInstBefore()
1644 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM in generateWaitcntInstBefore()
1663 Wait.LoadCnt = 0; in generateWaitcntInstBefore()
1678 Wait.ExpCnt = 0; in generateWaitcntInstBefore()
1683 // The function is going to insert a wait on everything in its prolog. in generateWaitcntInstBefore()
1686 Wait = AMDGPU::Waitcnt(); in generateWaitcntInstBefore()
1697 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait); in generateWaitcntInstBefore()
1707 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait); in generateWaitcntInstBefore()
1728 addWait(Wait, SmemAccessCounter, 0); in generateWaitcntInstBefore()
1735 // No need to wait before load from VMEM to LDS. in generateWaitcntInstBefore()
1746 // will produce a wait using the first (general) LDS DMA wait slot which in generateWaitcntInstBefore()
1747 // will wait on all of them anyway. in generateWaitcntInstBefore()
1753 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait); in generateWaitcntInstBefore()
1758 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); in generateWaitcntInstBefore()
1760 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); in generateWaitcntInstBefore()
1787 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); in generateWaitcntInstBefore()
1788 ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait); in generateWaitcntInstBefore()
1789 ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait); in generateWaitcntInstBefore()
1793 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); in generateWaitcntInstBefore()
1795 ScoreBrackets.determineWait(DS_CNT, RegNo, Wait); in generateWaitcntInstBefore()
1797 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait); in generateWaitcntInstBefore()
1810 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); in generateWaitcntInstBefore()
1818 Wait.DsCnt = 0; in generateWaitcntInstBefore()
1822 // Verify that the wait is actually needed. in generateWaitcntInstBefore()
1823 ScoreBrackets.simplifyWaitcnt(Wait); in generateWaitcntInstBefore()
1826 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false); in generateWaitcntInstBefore()
1829 Wait.LoadCnt = 0; in generateWaitcntInstBefore()
1831 Wait.ExpCnt = 0; in generateWaitcntInstBefore()
1833 Wait.DsCnt = 0; in generateWaitcntInstBefore()
1835 Wait.SampleCnt = 0; in generateWaitcntInstBefore()
1837 Wait.BvhCnt = 0; in generateWaitcntInstBefore()
1839 Wait.KmCnt = 0; in generateWaitcntInstBefore()
1843 Wait.LoadCnt = 0; in generateWaitcntInstBefore()
1845 Wait.SampleCnt = 0; in generateWaitcntInstBefore()
1847 Wait.BvhCnt = 0; in generateWaitcntInstBefore()
1850 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets, in generateWaitcntInstBefore()
1854 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, in generateWaitcnt() argument
1862 // Try to merge the required wait with preexisting waitcnt instructions. in generateWaitcnt()
1865 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); in generateWaitcnt()
1869 ScoreBrackets.applyWaitcnt(Wait); in generateWaitcnt()
1872 if (Wait.ExpCnt != ~0u && It != Block.instr_end() && in generateWaitcnt()
1876 if (Wait.ExpCnt < WaitExp->getImm()) { in generateWaitcnt()
1877 WaitExp->setImm(Wait.ExpCnt); in generateWaitcnt()
1880 Wait.ExpCnt = ~0u; in generateWaitcnt()
1886 if (WCG->createNewWaitcnt(Block, It, Wait)) in generateWaitcnt()
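generateWaitcnt ties these steps together: fold the requirement into any waitcnt instructions already present, record what the wait now guarantees in the score brackets, and emit new instructions only for what is left. The schematic below follows that ordering with hypothetical stand-ins (foldIntoExistingWaits, recordAppliedWait, emitNewWaits) for applyPreexistingWaitcnt, the score-bracket update, and createNewWaitcnt; none of these are the pass's real signatures.

#include <cstdio>

struct Waitcnt { unsigned LoadCnt = ~0u, DsCnt = ~0u; };

// Placeholder: pretend an existing s_waitcnt already covers the DS requirement.
bool foldIntoExistingWaits(Waitcnt &W) {
  if (W.DsCnt != ~0u) { W.DsCnt = ~0u; return true; }
  return false;
}
void recordAppliedWait(const Waitcnt &W) { (void)W; /* update score brackets */ }
bool emitNewWaits(const Waitcnt &W) {
  bool Emitted = false;
  if (W.LoadCnt != ~0u) { std::printf("s_wait_loadcnt %u\n", W.LoadCnt); Emitted = true; }
  if (W.DsCnt != ~0u)   { std::printf("s_wait_dscnt %u\n", W.DsCnt); Emitted = true; }
  return Emitted;
}

// Same ordering as generateWaitcnt: fold, apply, then create what is left.
bool generateWait(Waitcnt W) {
  bool Modified = foldIntoExistingWaits(W); // edit/delete preexisting waits
  recordAppliedWait(W);                     // brackets now reflect the wait
  Modified |= emitNewWaits(W);              // new instructions for the rest
  return Modified;
}

int main() {
  Waitcnt W;
  W.LoadCnt = 0;
  W.DsCnt = 0;
  generateWait(W); // prints only "s_wait_loadcnt 0"; dscnt was already covered
}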
2037 // Act as a wait on everything in updateEventWaitcntAfter()
2042 // May need to wait for anything. in updateEventWaitcntAfter()
2216 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD in insertWaitcntInBlock()
2250 AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt( in insertWaitcntInBlock() local
2252 ScoreBrackets.simplifyWaitcnt(Wait); in insertWaitcntInBlock()
2253 Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block, in insertWaitcntInBlock()
2281 AMDGPU::Waitcnt Wait; in insertWaitcntInBlock() local
2285 Wait.LoadCnt = 0; in insertWaitcntInBlock()
2287 Wait.SampleCnt = 0; in insertWaitcntInBlock()
2289 Wait.BvhCnt = 0; in insertWaitcntInBlock()
2293 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets, in insertWaitcntInBlock()
2461 // Wait for any outstanding memory operations that the input registers may in runOnMachineFunction()
2462 // depend on. We can't track them and it's better to do the wait after the in runOnMachineFunction()