1 //===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU 11 /// instructions that produce single-use VGPR values. If the value is forwarded 12 /// to the consumer instruction prior to VGPR writeback, the hardware can 13 /// then skip (kill) the VGPR write. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "AMDGPU.h" 18 #include "AMDGPUGenSearchableTables.inc" 19 #include "GCNSubtarget.h" 20 #include "SIInstrInfo.h" 21 #include "SIRegisterInfo.h" 22 #include "llvm/ADT/DenseMap.h" 23 #include "llvm/ADT/STLExtras.h" 24 #include "llvm/ADT/SmallVector.h" 25 #include "llvm/ADT/StringRef.h" 26 #include "llvm/CodeGen/MachineBasicBlock.h" 27 #include "llvm/CodeGen/MachineFunction.h" 28 #include "llvm/CodeGen/MachineFunctionPass.h" 29 #include "llvm/CodeGen/MachineInstr.h" 30 #include "llvm/CodeGen/MachineInstrBuilder.h" 31 #include "llvm/CodeGen/MachineOperand.h" 32 #include "llvm/CodeGen/Register.h" 33 #include "llvm/IR/DebugLoc.h" 34 #include "llvm/MC/MCRegister.h" 35 #include "llvm/MC/MCRegisterInfo.h" 36 #include "llvm/Pass.h" 37 #include <array> 38 39 using namespace llvm; 40 41 #define DEBUG_TYPE "amdgpu-insert-single-use-vdst" 42 43 namespace { 44 class AMDGPUInsertSingleUseVDST : public MachineFunctionPass { 45 private: 46 const SIInstrInfo *SII; 47 class SingleUseInstruction { 48 private: 49 static const unsigned MaxSkipRange = 0b111; 50 static const unsigned MaxNumberOfSkipRegions = 2; 51 52 unsigned LastEncodedPositionEnd; 53 MachineInstr *ProducerInstr; 54 55 std::array<unsigned, MaxNumberOfSkipRegions + 1> SingleUseRegions; 56 SmallVector<unsigned, MaxNumberOfSkipRegions> SkipRegions; 57 58 // Adds a skip region into the instruction. 59 void skip(const unsigned ProducerPosition) { 60 while (LastEncodedPositionEnd + MaxSkipRange < ProducerPosition) { 61 SkipRegions.push_back(MaxSkipRange); 62 LastEncodedPositionEnd += MaxSkipRange; 63 } 64 SkipRegions.push_back(ProducerPosition - LastEncodedPositionEnd); 65 LastEncodedPositionEnd = ProducerPosition; 66 } 67 68 bool currentRegionHasSpace() { 69 const auto Region = SkipRegions.size(); 70 // The first region has an extra bit of encoding space. 71 return SingleUseRegions[Region] < 72 ((Region == MaxNumberOfSkipRegions) ? 0b1111U : 0b111U); 73 } 74 75 unsigned encodeImm() { 76 // Handle the first Single Use Region separately as it has an extra bit 77 // of encoding space. 78 unsigned Imm = SingleUseRegions[SkipRegions.size()]; 79 unsigned ShiftAmount = 4; 80 for (unsigned i = SkipRegions.size(); i > 0; i--) { 81 Imm |= SkipRegions[i - 1] << ShiftAmount; 82 ShiftAmount += 3; 83 Imm |= SingleUseRegions[i - 1] << ShiftAmount; 84 ShiftAmount += 3; 85 } 86 return Imm; 87 } 88 89 public: 90 SingleUseInstruction(const unsigned ProducerPosition, 91 MachineInstr *Producer) 92 : LastEncodedPositionEnd(ProducerPosition + 1), ProducerInstr(Producer), 93 SingleUseRegions({1, 0, 0}) {} 94 95 // Returns false if adding a new single use producer failed. This happens 96 // because it could not be encoded, either because there is no room to 97 // encode another single use producer region or that this single use 98 // producer is too far away to encode the amount of instructions to skip. 99 bool tryAddProducer(const unsigned ProducerPosition, MachineInstr *MI) { 100 // Producer is too far away to encode into this instruction or another 101 // skip region is needed and SkipRegions.size() = 2 so there's no room for 102 // another skip region, therefore a new instruction is needed. 103 if (LastEncodedPositionEnd + 104 (MaxSkipRange * (MaxNumberOfSkipRegions - SkipRegions.size())) < 105 ProducerPosition) 106 return false; 107 108 // If a skip region is needed. 109 if (LastEncodedPositionEnd != ProducerPosition || 110 !currentRegionHasSpace()) { 111 // If the current region is out of space therefore a skip region would 112 // be needed, but there is no room for another skip region. 113 if (SkipRegions.size() == MaxNumberOfSkipRegions) 114 return false; 115 skip(ProducerPosition); 116 } 117 118 SingleUseRegions[SkipRegions.size()]++; 119 LastEncodedPositionEnd = ProducerPosition + 1; 120 ProducerInstr = MI; 121 return true; 122 } 123 124 auto emit(const SIInstrInfo *SII) { 125 return BuildMI(*ProducerInstr->getParent(), ProducerInstr, DebugLoc(), 126 SII->get(AMDGPU::S_SINGLEUSE_VDST)) 127 .addImm(encodeImm()); 128 } 129 }; 130 131 public: 132 static char ID; 133 134 AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {} 135 136 void insertSingleUseInstructions( 137 ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const { 138 SmallVector<SingleUseInstruction> Instructions; 139 140 for (auto &[Position, MI] : SingleUseProducers) { 141 // Encode this position into the last single use instruction if possible. 142 if (Instructions.empty() || 143 !Instructions.back().tryAddProducer(Position, MI)) { 144 // If not, add a new instruction. 145 Instructions.push_back(SingleUseInstruction(Position, MI)); 146 } 147 } 148 149 for (auto &Instruction : Instructions) 150 Instruction.emit(SII); 151 } 152 153 bool runOnMachineFunction(MachineFunction &MF) override { 154 const auto &ST = MF.getSubtarget<GCNSubtarget>(); 155 if (!ST.hasVGPRSingleUseHintInsts()) 156 return false; 157 158 SII = ST.getInstrInfo(); 159 const auto *TRI = &SII->getRegisterInfo(); 160 bool InstructionEmitted = false; 161 162 for (MachineBasicBlock &MBB : MF) { 163 DenseMap<MCRegUnit, unsigned> RegisterUseCount; 164 165 // Handle boundaries at the end of basic block separately to avoid 166 // false positives. If they are live at the end of a basic block then 167 // assume it has more uses later on. 168 for (const auto &Liveout : MBB.liveouts()) { 169 for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid(); 170 ++Units) { 171 const auto [Unit, Mask] = *Units; 172 if ((Mask & Liveout.LaneMask).any()) 173 RegisterUseCount[Unit] = 2; 174 } 175 } 176 177 SmallVector<std::pair<unsigned, MachineInstr *>> 178 SingleUseProducerPositions; 179 180 unsigned VALUInstrCount = 0; 181 for (MachineInstr &MI : reverse(MBB.instrs())) { 182 // All registers in all operands need to be single use for an 183 // instruction to be marked as a single use producer. 184 bool AllProducerOperandsAreSingleUse = true; 185 186 // Gather a list of Registers used before updating use counts to avoid 187 // double counting registers that appear multiple times in a single 188 // MachineInstr. 189 SmallVector<MCRegUnit> RegistersUsed; 190 191 for (const auto &Operand : MI.all_defs()) { 192 const auto Reg = Operand.getReg(); 193 194 const auto RegUnits = TRI->regunits(Reg); 195 if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit Unit) { 196 return RegisterUseCount[Unit] > 1; 197 })) 198 AllProducerOperandsAreSingleUse = false; 199 200 // Reset uses count when a register is no longer live. 201 for (const MCRegUnit Unit : RegUnits) 202 RegisterUseCount.erase(Unit); 203 } 204 205 for (const auto &Operand : MI.all_uses()) { 206 const auto Reg = Operand.getReg(); 207 208 // Count the number of times each register is read. 209 for (const MCRegUnit Unit : TRI->regunits(Reg)) { 210 if (!is_contained(RegistersUsed, Unit)) 211 RegistersUsed.push_back(Unit); 212 } 213 } 214 for (const MCRegUnit Unit : RegistersUsed) 215 RegisterUseCount[Unit]++; 216 217 // Do not attempt to optimise across exec mask changes. 218 if (MI.modifiesRegister(AMDGPU::EXEC, TRI) || 219 AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) { 220 for (auto &UsedReg : RegisterUseCount) 221 UsedReg.second = 2; 222 } 223 224 if (!SIInstrInfo::isVALU(MI) || 225 AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode())) 226 continue; 227 if (AllProducerOperandsAreSingleUse) { 228 SingleUseProducerPositions.push_back({VALUInstrCount, &MI}); 229 InstructionEmitted = true; 230 } 231 VALUInstrCount++; 232 } 233 insertSingleUseInstructions(SingleUseProducerPositions); 234 } 235 return InstructionEmitted; 236 } 237 }; 238 } // namespace 239 240 char AMDGPUInsertSingleUseVDST::ID = 0; 241 242 char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID; 243 244 INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE, 245 "AMDGPU Insert SingleUseVDST", false, false) 246