xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
15f757f3fSDimitry Andric //===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==//
25f757f3fSDimitry Andric //
35f757f3fSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
45f757f3fSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
55f757f3fSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
65f757f3fSDimitry Andric //
75f757f3fSDimitry Andric //===----------------------------------------------------------------------===//
85f757f3fSDimitry Andric //
95f757f3fSDimitry Andric /// \file
105f757f3fSDimitry Andric /// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU
115f757f3fSDimitry Andric /// instructions that produce single-use VGPR values. If the value is forwarded
125f757f3fSDimitry Andric /// to the consumer instruction prior to VGPR writeback, the hardware can
135f757f3fSDimitry Andric /// then skip (kill) the VGPR write.
145f757f3fSDimitry Andric //
155f757f3fSDimitry Andric //===----------------------------------------------------------------------===//
165f757f3fSDimitry Andric 
175f757f3fSDimitry Andric #include "AMDGPU.h"
18*0fca6ea1SDimitry Andric #include "AMDGPUGenSearchableTables.inc"
195f757f3fSDimitry Andric #include "GCNSubtarget.h"
205f757f3fSDimitry Andric #include "SIInstrInfo.h"
21*0fca6ea1SDimitry Andric #include "SIRegisterInfo.h"
225f757f3fSDimitry Andric #include "llvm/ADT/DenseMap.h"
235f757f3fSDimitry Andric #include "llvm/ADT/STLExtras.h"
24*0fca6ea1SDimitry Andric #include "llvm/ADT/SmallVector.h"
255f757f3fSDimitry Andric #include "llvm/ADT/StringRef.h"
265f757f3fSDimitry Andric #include "llvm/CodeGen/MachineBasicBlock.h"
275f757f3fSDimitry Andric #include "llvm/CodeGen/MachineFunction.h"
285f757f3fSDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
295f757f3fSDimitry Andric #include "llvm/CodeGen/MachineInstr.h"
305f757f3fSDimitry Andric #include "llvm/CodeGen/MachineInstrBuilder.h"
315f757f3fSDimitry Andric #include "llvm/CodeGen/MachineOperand.h"
325f757f3fSDimitry Andric #include "llvm/CodeGen/Register.h"
335f757f3fSDimitry Andric #include "llvm/IR/DebugLoc.h"
345f757f3fSDimitry Andric #include "llvm/MC/MCRegister.h"
35*0fca6ea1SDimitry Andric #include "llvm/MC/MCRegisterInfo.h"
365f757f3fSDimitry Andric #include "llvm/Pass.h"
37*0fca6ea1SDimitry Andric #include <array>
385f757f3fSDimitry Andric 
395f757f3fSDimitry Andric using namespace llvm;
405f757f3fSDimitry Andric 
415f757f3fSDimitry Andric #define DEBUG_TYPE "amdgpu-insert-single-use-vdst"
425f757f3fSDimitry Andric 
435f757f3fSDimitry Andric namespace {
445f757f3fSDimitry Andric class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
455f757f3fSDimitry Andric private:
465f757f3fSDimitry Andric   const SIInstrInfo *SII;
47*0fca6ea1SDimitry Andric   class SingleUseInstruction {
48*0fca6ea1SDimitry Andric   private:
49*0fca6ea1SDimitry Andric     static const unsigned MaxSkipRange = 0b111;
50*0fca6ea1SDimitry Andric     static const unsigned MaxNumberOfSkipRegions = 2;
51*0fca6ea1SDimitry Andric 
52*0fca6ea1SDimitry Andric     unsigned LastEncodedPositionEnd;
53*0fca6ea1SDimitry Andric     MachineInstr *ProducerInstr;
54*0fca6ea1SDimitry Andric 
55*0fca6ea1SDimitry Andric     std::array<unsigned, MaxNumberOfSkipRegions + 1> SingleUseRegions;
56*0fca6ea1SDimitry Andric     SmallVector<unsigned, MaxNumberOfSkipRegions> SkipRegions;
57*0fca6ea1SDimitry Andric 
58*0fca6ea1SDimitry Andric     // Adds a skip region into the instruction.
skip(const unsigned ProducerPosition)59*0fca6ea1SDimitry Andric     void skip(const unsigned ProducerPosition) {
60*0fca6ea1SDimitry Andric       while (LastEncodedPositionEnd + MaxSkipRange < ProducerPosition) {
61*0fca6ea1SDimitry Andric         SkipRegions.push_back(MaxSkipRange);
62*0fca6ea1SDimitry Andric         LastEncodedPositionEnd += MaxSkipRange;
63*0fca6ea1SDimitry Andric       }
64*0fca6ea1SDimitry Andric       SkipRegions.push_back(ProducerPosition - LastEncodedPositionEnd);
65*0fca6ea1SDimitry Andric       LastEncodedPositionEnd = ProducerPosition;
66*0fca6ea1SDimitry Andric     }
67*0fca6ea1SDimitry Andric 
currentRegionHasSpace()68*0fca6ea1SDimitry Andric     bool currentRegionHasSpace() {
69*0fca6ea1SDimitry Andric       const auto Region = SkipRegions.size();
70*0fca6ea1SDimitry Andric       // The first region has an extra bit of encoding space.
71*0fca6ea1SDimitry Andric       return SingleUseRegions[Region] <
72*0fca6ea1SDimitry Andric              ((Region == MaxNumberOfSkipRegions) ? 0b1111U : 0b111U);
73*0fca6ea1SDimitry Andric     }
74*0fca6ea1SDimitry Andric 
encodeImm()75*0fca6ea1SDimitry Andric     unsigned encodeImm() {
76*0fca6ea1SDimitry Andric       // Handle the first Single Use Region separately as it has an extra bit
77*0fca6ea1SDimitry Andric       // of encoding space.
78*0fca6ea1SDimitry Andric       unsigned Imm = SingleUseRegions[SkipRegions.size()];
79*0fca6ea1SDimitry Andric       unsigned ShiftAmount = 4;
80*0fca6ea1SDimitry Andric       for (unsigned i = SkipRegions.size(); i > 0; i--) {
81*0fca6ea1SDimitry Andric         Imm |= SkipRegions[i - 1] << ShiftAmount;
82*0fca6ea1SDimitry Andric         ShiftAmount += 3;
83*0fca6ea1SDimitry Andric         Imm |= SingleUseRegions[i - 1] << ShiftAmount;
84*0fca6ea1SDimitry Andric         ShiftAmount += 3;
85*0fca6ea1SDimitry Andric       }
86*0fca6ea1SDimitry Andric       return Imm;
87*0fca6ea1SDimitry Andric     }
88*0fca6ea1SDimitry Andric 
89*0fca6ea1SDimitry Andric   public:
SingleUseInstruction(const unsigned ProducerPosition,MachineInstr * Producer)90*0fca6ea1SDimitry Andric     SingleUseInstruction(const unsigned ProducerPosition,
91*0fca6ea1SDimitry Andric                          MachineInstr *Producer)
92*0fca6ea1SDimitry Andric         : LastEncodedPositionEnd(ProducerPosition + 1), ProducerInstr(Producer),
93*0fca6ea1SDimitry Andric           SingleUseRegions({1, 0, 0}) {}
94*0fca6ea1SDimitry Andric 
95*0fca6ea1SDimitry Andric     // Returns false if adding a new single use producer failed. This happens
96*0fca6ea1SDimitry Andric     // because it could not be encoded, either because there is no room to
97*0fca6ea1SDimitry Andric     // encode another single use producer region or that this single use
98*0fca6ea1SDimitry Andric     // producer is too far away to encode the amount of instructions to skip.
tryAddProducer(const unsigned ProducerPosition,MachineInstr * MI)99*0fca6ea1SDimitry Andric     bool tryAddProducer(const unsigned ProducerPosition, MachineInstr *MI) {
100*0fca6ea1SDimitry Andric       // Producer is too far away to encode into this instruction or another
101*0fca6ea1SDimitry Andric       // skip region is needed and SkipRegions.size() = 2 so there's no room for
102*0fca6ea1SDimitry Andric       // another skip region, therefore a new instruction is needed.
103*0fca6ea1SDimitry Andric       if (LastEncodedPositionEnd +
104*0fca6ea1SDimitry Andric               (MaxSkipRange * (MaxNumberOfSkipRegions - SkipRegions.size())) <
105*0fca6ea1SDimitry Andric           ProducerPosition)
106*0fca6ea1SDimitry Andric         return false;
107*0fca6ea1SDimitry Andric 
108*0fca6ea1SDimitry Andric       // If a skip region is needed.
109*0fca6ea1SDimitry Andric       if (LastEncodedPositionEnd != ProducerPosition ||
110*0fca6ea1SDimitry Andric           !currentRegionHasSpace()) {
111*0fca6ea1SDimitry Andric         // If the current region is out of space therefore a skip region would
112*0fca6ea1SDimitry Andric         // be needed, but there is no room for another skip region.
113*0fca6ea1SDimitry Andric         if (SkipRegions.size() == MaxNumberOfSkipRegions)
114*0fca6ea1SDimitry Andric           return false;
115*0fca6ea1SDimitry Andric         skip(ProducerPosition);
116*0fca6ea1SDimitry Andric       }
117*0fca6ea1SDimitry Andric 
118*0fca6ea1SDimitry Andric       SingleUseRegions[SkipRegions.size()]++;
119*0fca6ea1SDimitry Andric       LastEncodedPositionEnd = ProducerPosition + 1;
120*0fca6ea1SDimitry Andric       ProducerInstr = MI;
121*0fca6ea1SDimitry Andric       return true;
122*0fca6ea1SDimitry Andric     }
123*0fca6ea1SDimitry Andric 
emit(const SIInstrInfo * SII)124*0fca6ea1SDimitry Andric     auto emit(const SIInstrInfo *SII) {
125*0fca6ea1SDimitry Andric       return BuildMI(*ProducerInstr->getParent(), ProducerInstr, DebugLoc(),
126*0fca6ea1SDimitry Andric                      SII->get(AMDGPU::S_SINGLEUSE_VDST))
127*0fca6ea1SDimitry Andric           .addImm(encodeImm());
128*0fca6ea1SDimitry Andric     }
129*0fca6ea1SDimitry Andric   };
1305f757f3fSDimitry Andric 
1315f757f3fSDimitry Andric public:
1325f757f3fSDimitry Andric   static char ID;
1335f757f3fSDimitry Andric 
AMDGPUInsertSingleUseVDST()1345f757f3fSDimitry Andric   AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
1355f757f3fSDimitry Andric 
insertSingleUseInstructions(ArrayRef<std::pair<unsigned,MachineInstr * >> SingleUseProducers) const136*0fca6ea1SDimitry Andric   void insertSingleUseInstructions(
137*0fca6ea1SDimitry Andric       ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const {
138*0fca6ea1SDimitry Andric     SmallVector<SingleUseInstruction> Instructions;
139*0fca6ea1SDimitry Andric 
140*0fca6ea1SDimitry Andric     for (auto &[Position, MI] : SingleUseProducers) {
141*0fca6ea1SDimitry Andric       // Encode this position into the last single use instruction if possible.
142*0fca6ea1SDimitry Andric       if (Instructions.empty() ||
143*0fca6ea1SDimitry Andric           !Instructions.back().tryAddProducer(Position, MI)) {
144*0fca6ea1SDimitry Andric         // If not, add a new instruction.
145*0fca6ea1SDimitry Andric         Instructions.push_back(SingleUseInstruction(Position, MI));
146*0fca6ea1SDimitry Andric       }
147*0fca6ea1SDimitry Andric     }
148*0fca6ea1SDimitry Andric 
149*0fca6ea1SDimitry Andric     for (auto &Instruction : Instructions)
150*0fca6ea1SDimitry Andric       Instruction.emit(SII);
1515f757f3fSDimitry Andric   }
1525f757f3fSDimitry Andric 
runOnMachineFunction(MachineFunction & MF)1535f757f3fSDimitry Andric   bool runOnMachineFunction(MachineFunction &MF) override {
1545f757f3fSDimitry Andric     const auto &ST = MF.getSubtarget<GCNSubtarget>();
1555f757f3fSDimitry Andric     if (!ST.hasVGPRSingleUseHintInsts())
1565f757f3fSDimitry Andric       return false;
1575f757f3fSDimitry Andric 
1585f757f3fSDimitry Andric     SII = ST.getInstrInfo();
1595f757f3fSDimitry Andric     const auto *TRI = &SII->getRegisterInfo();
1605f757f3fSDimitry Andric     bool InstructionEmitted = false;
1615f757f3fSDimitry Andric 
1625f757f3fSDimitry Andric     for (MachineBasicBlock &MBB : MF) {
163*0fca6ea1SDimitry Andric       DenseMap<MCRegUnit, unsigned> RegisterUseCount;
1645f757f3fSDimitry Andric 
1655f757f3fSDimitry Andric       // Handle boundaries at the end of basic block separately to avoid
1665f757f3fSDimitry Andric       // false positives. If they are live at the end of a basic block then
1675f757f3fSDimitry Andric       // assume it has more uses later on.
168*0fca6ea1SDimitry Andric       for (const auto &Liveout : MBB.liveouts()) {
169*0fca6ea1SDimitry Andric         for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid();
170*0fca6ea1SDimitry Andric              ++Units) {
171*0fca6ea1SDimitry Andric           const auto [Unit, Mask] = *Units;
172*0fca6ea1SDimitry Andric           if ((Mask & Liveout.LaneMask).any())
173*0fca6ea1SDimitry Andric             RegisterUseCount[Unit] = 2;
174*0fca6ea1SDimitry Andric         }
175*0fca6ea1SDimitry Andric       }
1765f757f3fSDimitry Andric 
177*0fca6ea1SDimitry Andric       SmallVector<std::pair<unsigned, MachineInstr *>>
178*0fca6ea1SDimitry Andric           SingleUseProducerPositions;
179*0fca6ea1SDimitry Andric 
180*0fca6ea1SDimitry Andric       unsigned VALUInstrCount = 0;
1815f757f3fSDimitry Andric       for (MachineInstr &MI : reverse(MBB.instrs())) {
1825f757f3fSDimitry Andric         // All registers in all operands need to be single use for an
1835f757f3fSDimitry Andric         // instruction to be marked as a single use producer.
1845f757f3fSDimitry Andric         bool AllProducerOperandsAreSingleUse = true;
1855f757f3fSDimitry Andric 
186*0fca6ea1SDimitry Andric         // Gather a list of Registers used before updating use counts to avoid
187*0fca6ea1SDimitry Andric         // double counting registers that appear multiple times in a single
188*0fca6ea1SDimitry Andric         // MachineInstr.
189*0fca6ea1SDimitry Andric         SmallVector<MCRegUnit> RegistersUsed;
190*0fca6ea1SDimitry Andric 
191*0fca6ea1SDimitry Andric         for (const auto &Operand : MI.all_defs()) {
192*0fca6ea1SDimitry Andric           const auto Reg = Operand.getReg();
193*0fca6ea1SDimitry Andric 
194*0fca6ea1SDimitry Andric           const auto RegUnits = TRI->regunits(Reg);
195*0fca6ea1SDimitry Andric           if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit Unit) {
196*0fca6ea1SDimitry Andric                 return RegisterUseCount[Unit] > 1;
197*0fca6ea1SDimitry Andric               }))
198*0fca6ea1SDimitry Andric             AllProducerOperandsAreSingleUse = false;
199*0fca6ea1SDimitry Andric 
200*0fca6ea1SDimitry Andric           // Reset uses count when a register is no longer live.
201*0fca6ea1SDimitry Andric           for (const MCRegUnit Unit : RegUnits)
202*0fca6ea1SDimitry Andric             RegisterUseCount.erase(Unit);
203*0fca6ea1SDimitry Andric         }
204*0fca6ea1SDimitry Andric 
205*0fca6ea1SDimitry Andric         for (const auto &Operand : MI.all_uses()) {
2065f757f3fSDimitry Andric           const auto Reg = Operand.getReg();
2075f757f3fSDimitry Andric 
2085f757f3fSDimitry Andric           // Count the number of times each register is read.
209*0fca6ea1SDimitry Andric           for (const MCRegUnit Unit : TRI->regunits(Reg)) {
210*0fca6ea1SDimitry Andric             if (!is_contained(RegistersUsed, Unit))
211*0fca6ea1SDimitry Andric               RegistersUsed.push_back(Unit);
212*0fca6ea1SDimitry Andric           }
213*0fca6ea1SDimitry Andric         }
214*0fca6ea1SDimitry Andric         for (const MCRegUnit Unit : RegistersUsed)
215*0fca6ea1SDimitry Andric           RegisterUseCount[Unit]++;
2165f757f3fSDimitry Andric 
2175f757f3fSDimitry Andric         // Do not attempt to optimise across exec mask changes.
218*0fca6ea1SDimitry Andric         if (MI.modifiesRegister(AMDGPU::EXEC, TRI) ||
219*0fca6ea1SDimitry Andric             AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) {
2205f757f3fSDimitry Andric           for (auto &UsedReg : RegisterUseCount)
2215f757f3fSDimitry Andric             UsedReg.second = 2;
2225f757f3fSDimitry Andric         }
2235f757f3fSDimitry Andric 
224*0fca6ea1SDimitry Andric         if (!SIInstrInfo::isVALU(MI) ||
225*0fca6ea1SDimitry Andric             AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode()))
2265f757f3fSDimitry Andric           continue;
227*0fca6ea1SDimitry Andric         if (AllProducerOperandsAreSingleUse) {
228*0fca6ea1SDimitry Andric           SingleUseProducerPositions.push_back({VALUInstrCount, &MI});
2295f757f3fSDimitry Andric           InstructionEmitted = true;
2305f757f3fSDimitry Andric         }
231*0fca6ea1SDimitry Andric         VALUInstrCount++;
2325f757f3fSDimitry Andric       }
233*0fca6ea1SDimitry Andric       insertSingleUseInstructions(SingleUseProducerPositions);
2345f757f3fSDimitry Andric     }
2355f757f3fSDimitry Andric     return InstructionEmitted;
2365f757f3fSDimitry Andric   }
2375f757f3fSDimitry Andric };
2385f757f3fSDimitry Andric } // namespace
2395f757f3fSDimitry Andric 
2405f757f3fSDimitry Andric char AMDGPUInsertSingleUseVDST::ID = 0;
2415f757f3fSDimitry Andric 
2425f757f3fSDimitry Andric char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID;
2435f757f3fSDimitry Andric 
2445f757f3fSDimitry Andric INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE,
2455f757f3fSDimitry Andric                 "AMDGPU Insert SingleUseVDST", false, false)
246