Lines Matching +full:wait +full:- +full:delay
1 //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
22 #define DEBUG_TYPE "amdgpu-insert-delay-alu"
44 // These instruction types wait for VA_VDST==0 before issuing. in instructionWaitsForVALU()
59 // Types of delay that can be encoded in an s_delay_alu instruction.
62 // Get the delay type for an instruction with the specified TSFlags.
74 // regunit. In straight-line code there will only be one such instruction, but
75 // when control flow converges we merge the delay information from each path
76 // to represent the union of the worst-case delays of each type.
78 // One larger than the maximum number of (non-TRANS) VALU instructions we
90 // If it was written by a (non-TRANS) VALU, remember how many clock cycles
91 // are left until it completes, and how many other (non-TRANS) VALU we have
101 // Also remember how many other (non-TRANS) VALU we have seen since it was
103 // non-TRANS VALU, this is used to decide whether to encode a wait for just
127 // Guard against pseudo-instructions like SI_CALL which are marked as in DelayInfo()
143 // worst-case delays of each type.
154 // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
156 // instruction. Return true if there is no longer any useful delay info.
167 VALUCycles -= Cycles; in advance()
180 TRANSCycles -= Cycles; in advance()
189 SALUCycles -= Cycles; in advance()
214 // A map from regunits to the delay info for that regunit.
216 // Merge another DelayState into this one by merging the delay info for each
224 It->second.merge(KV.second); in merge()
228 // Advance the delay info for each regunit, erasing any that are no longer
234 if (I->second.advance(Type, Cycles)) in advance()
252 return A->first < B->first; in dump()
255 dbgs() << " " << printRegUnit(I->first, TRI); in dump()
256 I->second.dump(); in dump()
263 // The saved delay state at the end of each basic block.
267 MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay, in emitDelayAlu() argument
271 // Wait for a TRANS instruction. in emitDelayAlu()
272 if (Delay.TRANSNum < DelayInfo::TRANS_MAX) in emitDelayAlu()
273 Imm |= 4 + Delay.TRANSNum; in emitDelayAlu()
275 // Wait for a VALU instruction (if it's more recent than any TRANS in emitDelayAlu()
277 if (Delay.VALUNum < DelayInfo::VALU_MAX && in emitDelayAlu()
278 Delay.VALUNum <= Delay.TRANSNumVALU) { in emitDelayAlu()
280 Imm |= Delay.VALUNum << 7; in emitDelayAlu()
282 Imm |= Delay.VALUNum; in emitDelayAlu()
285 // Wait for an SALU instruction. in emitDelayAlu()
286 if (Delay.SALUCycles) { in emitDelayAlu()
287 assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX); in emitDelayAlu()
289 // We have already encoded a VALU and a TRANS delay. There's no room in in emitDelayAlu()
290 // the encoding for an SALU delay as well, so just drop it. in emitDelayAlu()
292 Imm |= (Delay.SALUCycles + 8) << 7; in emitDelayAlu()
294 Imm |= Delay.SALUCycles + 8; in emitDelayAlu()
298 // Don't emit the s_delay_alu instruction if there's nothing to wait for. in emitDelayAlu()
302 // If we only need to wait for one instruction, try encoding it in the last in emitDelayAlu()
309 if (!I->isBundle() && !I->isMetaInstruction()) in emitDelayAlu()
313 MachineOperand &Op = LastDelayAlu->getOperand(0); in emitDelayAlu()
316 "Remembered an s_delay_alu with no room for another delay!"); in emitDelayAlu()
325 BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm); in emitDelayAlu()
327 // encode another delay. in emitDelayAlu()
362 DelayInfo Delay; in runOnMachineBasicBlock() local
371 for (MCRegUnit Unit : TRI->regunits(Op.getReg())) { in runOnMachineBasicBlock()
374 Delay.merge(It->second); in runOnMachineBasicBlock()
381 // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or in runOnMachineBasicBlock()
383 LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu); in runOnMachineBasicBlock()
392 for (MCRegUnit Unit : TRI->regunits(Op.getReg())) in runOnMachineBasicBlock()
435 // Calculate the delay state for each basic block, iterating until we reach in runOnMachineFunction()
464 INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",