xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp (revision 770cf0a5f02dc8983a89c6568d741fbc25baa999)
1 //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert s_delay_alu instructions to avoid stalls on GFX11+.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "GCNSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "SIInstrInfo.h"
18 #include "llvm/ADT/SetVector.h"
19 
20 using namespace llvm;
21 
22 #define DEBUG_TYPE "amdgpu-insert-delay-alu"
23 
24 namespace {
25 
26 class AMDGPUInsertDelayAlu {
27 public:
28   const SIInstrInfo *SII;
29   const TargetRegisterInfo *TRI;
30 
31   const TargetSchedModel *SchedModel;
32 
33   // Return true if MI waits for all outstanding VALU instructions to complete.
34   static bool instructionWaitsForVALU(const MachineInstr &MI) {
35     // These instruction types wait for VA_VDST==0 before issuing.
36     const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
37                                SIInstrFlags::FLAT | SIInstrFlags::MIMG |
38                                SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
39     if (MI.getDesc().TSFlags & VA_VDST_0)
40       return true;
41     if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
42         MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
43       return true;
44     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
45         AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)
46       return true;
47     return false;
48   }
49 
50   static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
51     // These instruction types wait for VA_SDST==0 before issuing.
52     uint64_t MIFlags = MI.getDesc().TSFlags;
53     if (MIFlags & SIInstrFlags::SMRD)
54       return true;
55 
56     if (MIFlags & SIInstrFlags::SALU) {
57       for (auto &Op : MI.operands()) {
58         if (Op.isReg())
59           return true;
60       }
61     }
62     return false;
63   }
64 
65   // Types of delay that can be encoded in an s_delay_alu instruction.
66   enum DelayType { VALU, TRANS, SALU, OTHER };
67 
68   // Get the delay type for an instruction with the specified TSFlags.
69   static DelayType getDelayType(uint64_t TSFlags) {
70     if (TSFlags & SIInstrFlags::TRANS)
71       return TRANS;
72     if (TSFlags & SIInstrFlags::VALU)
73       return VALU;
74     if (TSFlags & SIInstrFlags::SALU)
75       return SALU;
76     return OTHER;
77   }
78 
79   // Information about the last instruction(s) that wrote to a particular
80   // regunit. In straight-line code there will only be one such instruction, but
81   // when control flow converges we merge the delay information from each path
82   // to represent the union of the worst-case delays of each type.
83   struct DelayInfo {
84     // One larger than the maximum number of (non-TRANS) VALU instructions we
85     // can encode in an s_delay_alu instruction.
86     static constexpr unsigned VALU_MAX = 5;
87 
88     // One larger than the maximum number of TRANS instructions we can encode in
89     // an s_delay_alu instruction.
90     static constexpr unsigned TRANS_MAX = 4;
91 
92     // One larger than the maximum number of SALU cycles we can encode in an
93     // s_delay_alu instruction.
94     static constexpr unsigned SALU_CYCLES_MAX = 4;
95 
96     // If it was written by a (non-TRANS) VALU, remember how many clock cycles
97     // are left until it completes, and how many other (non-TRANS) VALU we have
98     // seen since it was issued.
99     uint8_t VALUCycles = 0;
100     uint8_t VALUNum = VALU_MAX;
101 
102     // If it was written by a TRANS, remember how many clock cycles are left
103     // until it completes, and how many other TRANS we have seen since it was
104     // issued.
105     uint8_t TRANSCycles = 0;
106     uint8_t TRANSNum = TRANS_MAX;
107     // Also remember how many other (non-TRANS) VALU we have seen since it was
108     // issued. When an instruction depends on both a prior TRANS and a prior
109     // non-TRANS VALU, this is used to decide whether to encode a wait for just
110     // one or both of them.
111     uint8_t TRANSNumVALU = VALU_MAX;
112 
113     // If it was written by an SALU, remember how many clock cycles are left
114     // until it completes.
115     uint8_t SALUCycles = 0;
116 
117     DelayInfo() = default;
118 
119     DelayInfo(DelayType Type, unsigned Cycles) {
120       switch (Type) {
121       default:
122         llvm_unreachable("unexpected type");
123       case VALU:
124         VALUCycles = Cycles;
125         VALUNum = 0;
126         break;
127       case TRANS:
128         TRANSCycles = Cycles;
129         TRANSNum = 0;
130         TRANSNumVALU = 0;
131         break;
132       case SALU:
133         // Guard against pseudo-instructions like SI_CALL which are marked as
134         // SALU but with a very high latency.
135         SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
136         break;
137       }
138     }
139 
140     bool operator==(const DelayInfo &RHS) const {
141       return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
142              TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
143              TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
144     }
145 
146     bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
147 
148     // Merge another DelayInfo into this one, to represent the union of the
149     // worst-case delays of each type.
150     void merge(const DelayInfo &RHS) {
151       VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
152       VALUNum = std::min(VALUNum, RHS.VALUNum);
153       TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
154       TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
155       TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
156       SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
157     }
158 
159     // Update this DelayInfo after issuing an instruction of the specified type.
160     // Cycles is the number of cycles it takes to issue the instruction.  Return
161     // true if there is no longer any useful delay info.
162     bool advance(DelayType Type, unsigned Cycles) {
163       bool Erase = true;
164 
165       VALUNum += (Type == VALU);
166       if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
167         // Forget about the VALU instruction. It was too far back or has
168         // definitely completed by now.
169         VALUNum = VALU_MAX;
170         VALUCycles = 0;
171       } else {
172         VALUCycles -= Cycles;
173         Erase = false;
174       }
175 
176       TRANSNum += (Type == TRANS);
177       TRANSNumVALU += (Type == VALU);
178       if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
179         // Forget about any TRANS instruction. It was too far back or has
180         // definitely completed by now.
181         TRANSNum = TRANS_MAX;
182         TRANSNumVALU = VALU_MAX;
183         TRANSCycles = 0;
184       } else {
185         TRANSCycles -= Cycles;
186         Erase = false;
187       }
188 
189       if (SALUCycles <= Cycles) {
190         // Forget about any SALU instruction. It has definitely completed by
191         // now.
192         SALUCycles = 0;
193       } else {
194         SALUCycles -= Cycles;
195         Erase = false;
196       }
197 
198       return Erase;
199     }
200 
201 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
202     void dump() const {
203       if (VALUCycles)
204         dbgs() << " VALUCycles=" << (int)VALUCycles;
205       if (VALUNum < VALU_MAX)
206         dbgs() << " VALUNum=" << (int)VALUNum;
207       if (TRANSCycles)
208         dbgs() << " TRANSCycles=" << (int)TRANSCycles;
209       if (TRANSNum < TRANS_MAX)
210         dbgs() << " TRANSNum=" << (int)TRANSNum;
211       if (TRANSNumVALU < VALU_MAX)
212         dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
213       if (SALUCycles)
214         dbgs() << " SALUCycles=" << (int)SALUCycles;
215     }
216 #endif
217   };
218 
219   // A map from regunits to the delay info for that regunit.
220   struct DelayState : DenseMap<unsigned, DelayInfo> {
221     // Merge another DelayState into this one by merging the delay info for each
222     // regunit.
223     void merge(const DelayState &RHS) {
224       for (const auto &KV : RHS) {
225         iterator It;
226         bool Inserted;
227         std::tie(It, Inserted) = insert(KV);
228         if (!Inserted)
229           It->second.merge(KV.second);
230       }
231     }
232 
233     // Advance the delay info for each regunit, erasing any that are no longer
234     // useful.
235     void advance(DelayType Type, unsigned Cycles) {
236       iterator Next;
237       for (auto I = begin(), E = end(); I != E; I = Next) {
238         Next = std::next(I);
239         if (I->second.advance(Type, Cycles))
240           erase(I);
241       }
242     }
243 
244     void advanceByVALUNum(unsigned VALUNum) {
245       iterator Next;
246       for (auto I = begin(), E = end(); I != E; I = Next) {
247         Next = std::next(I);
248         if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) {
249           erase(I);
250         }
251       }
252     }
253 
254 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
255     void dump(const TargetRegisterInfo *TRI) const {
256       if (empty()) {
257         dbgs() << "    empty\n";
258         return;
259       }
260 
261       // Dump DelayInfo for each RegUnit in numerical order.
262       SmallVector<const_iterator, 8> Order;
263       Order.reserve(size());
264       for (const_iterator I = begin(), E = end(); I != E; ++I)
265         Order.push_back(I);
266       llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
267         return A->first < B->first;
268       });
269       for (const_iterator I : Order) {
270         dbgs() << "    " << printRegUnit(I->first, TRI);
271         I->second.dump();
272         dbgs() << "\n";
273       }
274     }
275 #endif
276   };
277 
278   // The saved delay state at the end of each basic block.
279   DenseMap<MachineBasicBlock *, DelayState> BlockState;
280 
281   // Emit an s_delay_alu instruction if necessary before MI.
282   MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
283                              MachineInstr *LastDelayAlu) {
284     unsigned Imm = 0;
285 
286     // Wait for a TRANS instruction.
287     if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
288       Imm |= 4 + Delay.TRANSNum;
289 
290     // Wait for a VALU instruction (if it's more recent than any TRANS
291     // instruction that we're also waiting for).
292     if (Delay.VALUNum < DelayInfo::VALU_MAX &&
293         Delay.VALUNum <= Delay.TRANSNumVALU) {
294       if (Imm & 0xf)
295         Imm |= Delay.VALUNum << 7;
296       else
297         Imm |= Delay.VALUNum;
298     }
299 
300     // Wait for an SALU instruction.
301     if (Delay.SALUCycles) {
302       assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
303       if (Imm & 0x780) {
304         // We have already encoded a VALU and a TRANS delay. There's no room in
305         // the encoding for an SALU delay as well, so just drop it.
306       } else if (Imm & 0xf) {
307         Imm |= (Delay.SALUCycles + 8) << 7;
308       } else {
309         Imm |= Delay.SALUCycles + 8;
310       }
311     }
312 
313     // Don't emit the s_delay_alu instruction if there's nothing to wait for.
314     if (!Imm)
315       return LastDelayAlu;
316 
317     // If we only need to wait for one instruction, try encoding it in the last
318     // s_delay_alu that we emitted.
319     if (!(Imm & 0x780) && LastDelayAlu) {
320       unsigned Skip = 0;
321       for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
322                 E = MachineBasicBlock::instr_iterator(MI);
323            ++I != E;) {
324         if (!I->isBundle() && !I->isMetaInstruction())
325           ++Skip;
326       }
327       if (Skip < 6) {
328         MachineOperand &Op = LastDelayAlu->getOperand(0);
329         unsigned LastImm = Op.getImm();
330         assert((LastImm & ~0xf) == 0 &&
331                "Remembered an s_delay_alu with no room for another delay!");
332         LastImm |= Imm << 7 | Skip << 4;
333         Op.setImm(LastImm);
334         return nullptr;
335       }
336     }
337 
338     auto &MBB = *MI.getParent();
339     MachineInstr *DelayAlu =
340         BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
341     // Remember the s_delay_alu for next time if there is still room in it to
342     // encode another delay.
343     return (Imm & 0x780) ? nullptr : DelayAlu;
344   }
345 
346   bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
347     DelayState State;
348     for (auto *Pred : MBB.predecessors())
349       State.merge(BlockState[Pred]);
350 
351     LLVM_DEBUG(dbgs() << "  State at start of " << printMBBReference(MBB)
352                       << "\n";
353                State.dump(TRI););
354 
355     bool Changed = false;
356     MachineInstr *LastDelayAlu = nullptr;
357 
358     MCRegUnit LastSGPRFromVALU = 0;
359     // Iterate over the contents of bundles, but don't emit any instructions
360     // inside a bundle.
361     for (auto &MI : MBB.instrs()) {
362       if (MI.isBundle() || MI.isMetaInstruction())
363         continue;
364 
365       // Ignore some more instructions that do not generate any code.
366       switch (MI.getOpcode()) {
367       case AMDGPU::SI_RETURN_TO_EPILOG:
368         continue;
369       }
370 
371       DelayType Type = getDelayType(MI.getDesc().TSFlags);
372 
373       if (instructionWaitsForSGPRWrites(MI)) {
374         auto It = State.find(LastSGPRFromVALU);
375         if (It != State.end()) {
376           DelayInfo Info = It->getSecond();
377           State.advanceByVALUNum(Info.VALUNum);
378           LastSGPRFromVALU = 0;
379         }
380       }
381 
382       if (instructionWaitsForVALU(MI)) {
383         // Forget about all outstanding VALU delays.
384         // TODO: This is overkill since it also forgets about SALU delays.
385         State = DelayState();
386       } else if (Type != OTHER) {
387         DelayInfo Delay;
388         // TODO: Scan implicit uses too?
389         for (const auto &Op : MI.explicit_uses()) {
390           if (Op.isReg()) {
391             // One of the operands of the writelane is also the output operand.
392             // This creates the insertion of redundant delays. Hence, we have to
393             // ignore this operand.
394             if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
395               continue;
396             for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
397               auto It = State.find(Unit);
398               if (It != State.end()) {
399                 Delay.merge(It->second);
400                 State.erase(Unit);
401               }
402             }
403           }
404         }
405 
406         if (SII->isVALU(MI.getOpcode())) {
407           for (const auto &Op : MI.defs()) {
408             Register Reg = Op.getReg();
409             if (AMDGPU::isSGPR(Reg, TRI)) {
410               LastSGPRFromVALU = *TRI->regunits(Reg).begin();
411               break;
412             }
413           }
414         }
415 
416         if (Emit && !MI.isBundledWithPred()) {
417           // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
418           // just ignore them?
419           LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
420         }
421       }
422 
423       if (Type != OTHER) {
424         // TODO: Scan implicit defs too?
425         for (const auto &Op : MI.defs()) {
426           unsigned Latency = SchedModel->computeOperandLatency(
427               &MI, Op.getOperandNo(), nullptr, 0);
428           for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
429             State[Unit] = DelayInfo(Type, Latency);
430         }
431       }
432 
433       // Advance by the number of cycles it takes to issue this instruction.
434       // TODO: Use a more advanced model that accounts for instructions that
435       // take multiple cycles to issue on a particular pipeline.
436       unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
437       // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
438       // instructions on the assumption that they will usually have to be issued
439       // twice?
440       State.advance(Type, Cycles);
441 
442       LLVM_DEBUG(dbgs() << "  State after " << MI; State.dump(TRI););
443     }
444 
445     if (Emit) {
446       assert(State == BlockState[&MBB] &&
447              "Basic block state should not have changed on final pass!");
448     } else if (DelayState &BS = BlockState[&MBB]; State != BS) {
449       BS = std::move(State);
450       Changed = true;
451     }
452     return Changed;
453   }
454 
455   bool run(MachineFunction &MF) {
456     LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
457                       << "\n");
458 
459     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
460     if (!ST.hasDelayAlu())
461       return false;
462 
463     SII = ST.getInstrInfo();
464     TRI = ST.getRegisterInfo();
465     SchedModel = &SII->getSchedModel();
466 
467     // Calculate the delay state for each basic block, iterating until we reach
468     // a fixed point.
469     SetVector<MachineBasicBlock *> WorkList;
470     for (auto &MBB : reverse(MF))
471       WorkList.insert(&MBB);
472     while (!WorkList.empty()) {
473       auto &MBB = *WorkList.pop_back_val();
474       bool Changed = runOnMachineBasicBlock(MBB, false);
475       if (Changed)
476         WorkList.insert_range(MBB.successors());
477     }
478 
479     LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
480 
481     // Make one last pass over all basic blocks to emit s_delay_alu
482     // instructions.
483     bool Changed = false;
484     for (auto &MBB : MF)
485       Changed |= runOnMachineBasicBlock(MBB, true);
486     return Changed;
487   }
488 };
489 
490 class AMDGPUInsertDelayAluLegacy : public MachineFunctionPass {
491 public:
492   static char ID;
493 
494   AMDGPUInsertDelayAluLegacy() : MachineFunctionPass(ID) {}
495 
496   void getAnalysisUsage(AnalysisUsage &AU) const override {
497     AU.setPreservesCFG();
498     MachineFunctionPass::getAnalysisUsage(AU);
499   }
500 
501   bool runOnMachineFunction(MachineFunction &MF) override {
502     if (skipFunction(MF.getFunction()))
503       return false;
504     AMDGPUInsertDelayAlu Impl;
505     return Impl.run(MF);
506   }
507 };
508 } // namespace
509 
510 PreservedAnalyses
511 AMDGPUInsertDelayAluPass::run(MachineFunction &MF,
512                               MachineFunctionAnalysisManager &MFAM) {
513   if (!AMDGPUInsertDelayAlu().run(MF))
514     return PreservedAnalyses::all();
515   auto PA = getMachineFunctionPassPreservedAnalyses();
516   PA.preserveSet<CFGAnalyses>();
517   return PA;
518 } // end namespace llvm
519 
520 char AMDGPUInsertDelayAluLegacy::ID = 0;
521 
522 char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAluLegacy::ID;
523 
524 INITIALIZE_PASS(AMDGPUInsertDelayAluLegacy, DEBUG_TYPE,
525                 "AMDGPU Insert Delay ALU", false, false)
526