xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 //===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU
11 /// instructions that produce single-use VGPR values. If the value is forwarded
12 /// to the consumer instruction prior to VGPR writeback, the hardware can
13 /// then skip (kill) the VGPR write.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPU.h"
18 #include "AMDGPUGenSearchableTables.inc"
19 #include "GCNSubtarget.h"
20 #include "SIInstrInfo.h"
21 #include "SIRegisterInfo.h"
22 #include "llvm/ADT/DenseMap.h"
23 #include "llvm/ADT/STLExtras.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/ADT/StringRef.h"
26 #include "llvm/CodeGen/MachineBasicBlock.h"
27 #include "llvm/CodeGen/MachineFunction.h"
28 #include "llvm/CodeGen/MachineFunctionPass.h"
29 #include "llvm/CodeGen/MachineInstr.h"
30 #include "llvm/CodeGen/MachineInstrBuilder.h"
31 #include "llvm/CodeGen/MachineOperand.h"
32 #include "llvm/CodeGen/Register.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/MC/MCRegister.h"
35 #include "llvm/MC/MCRegisterInfo.h"
36 #include "llvm/Pass.h"
37 #include <array>
38 
39 using namespace llvm;
40 
41 #define DEBUG_TYPE "amdgpu-insert-single-use-vdst"
42 
43 namespace {
44 class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
45 private:
46   const SIInstrInfo *SII;
47   class SingleUseInstruction {
48   private:
49     static const unsigned MaxSkipRange = 0b111;
50     static const unsigned MaxNumberOfSkipRegions = 2;
51 
52     unsigned LastEncodedPositionEnd;
53     MachineInstr *ProducerInstr;
54 
55     std::array<unsigned, MaxNumberOfSkipRegions + 1> SingleUseRegions;
56     SmallVector<unsigned, MaxNumberOfSkipRegions> SkipRegions;
57 
58     // Adds a skip region into the instruction.
59     void skip(const unsigned ProducerPosition) {
60       while (LastEncodedPositionEnd + MaxSkipRange < ProducerPosition) {
61         SkipRegions.push_back(MaxSkipRange);
62         LastEncodedPositionEnd += MaxSkipRange;
63       }
64       SkipRegions.push_back(ProducerPosition - LastEncodedPositionEnd);
65       LastEncodedPositionEnd = ProducerPosition;
66     }
67 
68     bool currentRegionHasSpace() {
69       const auto Region = SkipRegions.size();
70       // The first region has an extra bit of encoding space.
71       return SingleUseRegions[Region] <
72              ((Region == MaxNumberOfSkipRegions) ? 0b1111U : 0b111U);
73     }
74 
75     unsigned encodeImm() {
76       // Handle the first Single Use Region separately as it has an extra bit
77       // of encoding space.
78       unsigned Imm = SingleUseRegions[SkipRegions.size()];
79       unsigned ShiftAmount = 4;
80       for (unsigned i = SkipRegions.size(); i > 0; i--) {
81         Imm |= SkipRegions[i - 1] << ShiftAmount;
82         ShiftAmount += 3;
83         Imm |= SingleUseRegions[i - 1] << ShiftAmount;
84         ShiftAmount += 3;
85       }
86       return Imm;
87     }
88 
89   public:
90     SingleUseInstruction(const unsigned ProducerPosition,
91                          MachineInstr *Producer)
92         : LastEncodedPositionEnd(ProducerPosition + 1), ProducerInstr(Producer),
93           SingleUseRegions({1, 0, 0}) {}
94 
95     // Returns false if adding a new single use producer failed. This happens
96     // because it could not be encoded, either because there is no room to
97     // encode another single use producer region or that this single use
98     // producer is too far away to encode the amount of instructions to skip.
99     bool tryAddProducer(const unsigned ProducerPosition, MachineInstr *MI) {
100       // Producer is too far away to encode into this instruction or another
101       // skip region is needed and SkipRegions.size() = 2 so there's no room for
102       // another skip region, therefore a new instruction is needed.
103       if (LastEncodedPositionEnd +
104               (MaxSkipRange * (MaxNumberOfSkipRegions - SkipRegions.size())) <
105           ProducerPosition)
106         return false;
107 
108       // If a skip region is needed.
109       if (LastEncodedPositionEnd != ProducerPosition ||
110           !currentRegionHasSpace()) {
111         // If the current region is out of space therefore a skip region would
112         // be needed, but there is no room for another skip region.
113         if (SkipRegions.size() == MaxNumberOfSkipRegions)
114           return false;
115         skip(ProducerPosition);
116       }
117 
118       SingleUseRegions[SkipRegions.size()]++;
119       LastEncodedPositionEnd = ProducerPosition + 1;
120       ProducerInstr = MI;
121       return true;
122     }
123 
124     auto emit(const SIInstrInfo *SII) {
125       return BuildMI(*ProducerInstr->getParent(), ProducerInstr, DebugLoc(),
126                      SII->get(AMDGPU::S_SINGLEUSE_VDST))
127           .addImm(encodeImm());
128     }
129   };
130 
131 public:
132   static char ID;
133 
134   AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
135 
136   void insertSingleUseInstructions(
137       ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const {
138     SmallVector<SingleUseInstruction> Instructions;
139 
140     for (auto &[Position, MI] : SingleUseProducers) {
141       // Encode this position into the last single use instruction if possible.
142       if (Instructions.empty() ||
143           !Instructions.back().tryAddProducer(Position, MI)) {
144         // If not, add a new instruction.
145         Instructions.push_back(SingleUseInstruction(Position, MI));
146       }
147     }
148 
149     for (auto &Instruction : Instructions)
150       Instruction.emit(SII);
151   }
152 
153   bool runOnMachineFunction(MachineFunction &MF) override {
154     const auto &ST = MF.getSubtarget<GCNSubtarget>();
155     if (!ST.hasVGPRSingleUseHintInsts())
156       return false;
157 
158     SII = ST.getInstrInfo();
159     const auto *TRI = &SII->getRegisterInfo();
160     bool InstructionEmitted = false;
161 
162     for (MachineBasicBlock &MBB : MF) {
163       DenseMap<MCRegUnit, unsigned> RegisterUseCount;
164 
165       // Handle boundaries at the end of basic block separately to avoid
166       // false positives. If they are live at the end of a basic block then
167       // assume it has more uses later on.
168       for (const auto &Liveout : MBB.liveouts()) {
169         for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid();
170              ++Units) {
171           const auto [Unit, Mask] = *Units;
172           if ((Mask & Liveout.LaneMask).any())
173             RegisterUseCount[Unit] = 2;
174         }
175       }
176 
177       SmallVector<std::pair<unsigned, MachineInstr *>>
178           SingleUseProducerPositions;
179 
180       unsigned VALUInstrCount = 0;
181       for (MachineInstr &MI : reverse(MBB.instrs())) {
182         // All registers in all operands need to be single use for an
183         // instruction to be marked as a single use producer.
184         bool AllProducerOperandsAreSingleUse = true;
185 
186         // Gather a list of Registers used before updating use counts to avoid
187         // double counting registers that appear multiple times in a single
188         // MachineInstr.
189         SmallVector<MCRegUnit> RegistersUsed;
190 
191         for (const auto &Operand : MI.all_defs()) {
192           const auto Reg = Operand.getReg();
193 
194           const auto RegUnits = TRI->regunits(Reg);
195           if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit Unit) {
196                 return RegisterUseCount[Unit] > 1;
197               }))
198             AllProducerOperandsAreSingleUse = false;
199 
200           // Reset uses count when a register is no longer live.
201           for (const MCRegUnit Unit : RegUnits)
202             RegisterUseCount.erase(Unit);
203         }
204 
205         for (const auto &Operand : MI.all_uses()) {
206           const auto Reg = Operand.getReg();
207 
208           // Count the number of times each register is read.
209           for (const MCRegUnit Unit : TRI->regunits(Reg)) {
210             if (!is_contained(RegistersUsed, Unit))
211               RegistersUsed.push_back(Unit);
212           }
213         }
214         for (const MCRegUnit Unit : RegistersUsed)
215           RegisterUseCount[Unit]++;
216 
217         // Do not attempt to optimise across exec mask changes.
218         if (MI.modifiesRegister(AMDGPU::EXEC, TRI) ||
219             AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) {
220           for (auto &UsedReg : RegisterUseCount)
221             UsedReg.second = 2;
222         }
223 
224         if (!SIInstrInfo::isVALU(MI) ||
225             AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode()))
226           continue;
227         if (AllProducerOperandsAreSingleUse) {
228           SingleUseProducerPositions.push_back({VALUInstrCount, &MI});
229           InstructionEmitted = true;
230         }
231         VALUInstrCount++;
232       }
233       insertSingleUseInstructions(SingleUseProducerPositions);
234     }
235     return InstructionEmitted;
236   }
237 };
238 } // namespace
239 
// Storage for the pass identifier declared in the class; the class's
// constructor passes it to the MachineFunctionPass constructor.
240 char AMDGPUInsertSingleUseVDST::ID = 0;
241 
// Name in namespace llvm that refers to the very same ID object, so code
// outside this file's anonymous namespace can refer to the pass.
242 char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID;
243 
// Registers the pass using DEBUG_TYPE ("amdgpu-insert-single-use-vdst") as its
// argument string and "AMDGPU Insert SingleUseVDST" as its descriptive name.
// NOTE(review): the two trailing `false` arguments are presumably the macro's
// "CFG only" and "is analysis" flags - confirm against llvm/PassSupport.h.
244 INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE,
245                 "AMDGPU Insert SingleUseVDST", false, false)
246