//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU
/// instructions that produce single-use VGPR values. If the value is forwarded
/// to the consumer instruction prior to VGPR writeback, the hardware can
/// then skip (kill) the VGPR write.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUGenSearchableTables.inc"
#include "GCNSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
315f757f3fSDimitry Andric #include "llvm/CodeGen/MachineOperand.h" 325f757f3fSDimitry Andric #include "llvm/CodeGen/Register.h" 335f757f3fSDimitry Andric #include "llvm/IR/DebugLoc.h" 345f757f3fSDimitry Andric #include "llvm/MC/MCRegister.h" 35*0fca6ea1SDimitry Andric #include "llvm/MC/MCRegisterInfo.h" 365f757f3fSDimitry Andric #include "llvm/Pass.h" 37*0fca6ea1SDimitry Andric #include <array> 385f757f3fSDimitry Andric 395f757f3fSDimitry Andric using namespace llvm; 405f757f3fSDimitry Andric 415f757f3fSDimitry Andric #define DEBUG_TYPE "amdgpu-insert-single-use-vdst" 425f757f3fSDimitry Andric 435f757f3fSDimitry Andric namespace { 445f757f3fSDimitry Andric class AMDGPUInsertSingleUseVDST : public MachineFunctionPass { 455f757f3fSDimitry Andric private: 465f757f3fSDimitry Andric const SIInstrInfo *SII; 47*0fca6ea1SDimitry Andric class SingleUseInstruction { 48*0fca6ea1SDimitry Andric private: 49*0fca6ea1SDimitry Andric static const unsigned MaxSkipRange = 0b111; 50*0fca6ea1SDimitry Andric static const unsigned MaxNumberOfSkipRegions = 2; 51*0fca6ea1SDimitry Andric 52*0fca6ea1SDimitry Andric unsigned LastEncodedPositionEnd; 53*0fca6ea1SDimitry Andric MachineInstr *ProducerInstr; 54*0fca6ea1SDimitry Andric 55*0fca6ea1SDimitry Andric std::array<unsigned, MaxNumberOfSkipRegions + 1> SingleUseRegions; 56*0fca6ea1SDimitry Andric SmallVector<unsigned, MaxNumberOfSkipRegions> SkipRegions; 57*0fca6ea1SDimitry Andric 58*0fca6ea1SDimitry Andric // Adds a skip region into the instruction. 
skip(const unsigned ProducerPosition)59*0fca6ea1SDimitry Andric void skip(const unsigned ProducerPosition) { 60*0fca6ea1SDimitry Andric while (LastEncodedPositionEnd + MaxSkipRange < ProducerPosition) { 61*0fca6ea1SDimitry Andric SkipRegions.push_back(MaxSkipRange); 62*0fca6ea1SDimitry Andric LastEncodedPositionEnd += MaxSkipRange; 63*0fca6ea1SDimitry Andric } 64*0fca6ea1SDimitry Andric SkipRegions.push_back(ProducerPosition - LastEncodedPositionEnd); 65*0fca6ea1SDimitry Andric LastEncodedPositionEnd = ProducerPosition; 66*0fca6ea1SDimitry Andric } 67*0fca6ea1SDimitry Andric currentRegionHasSpace()68*0fca6ea1SDimitry Andric bool currentRegionHasSpace() { 69*0fca6ea1SDimitry Andric const auto Region = SkipRegions.size(); 70*0fca6ea1SDimitry Andric // The first region has an extra bit of encoding space. 71*0fca6ea1SDimitry Andric return SingleUseRegions[Region] < 72*0fca6ea1SDimitry Andric ((Region == MaxNumberOfSkipRegions) ? 0b1111U : 0b111U); 73*0fca6ea1SDimitry Andric } 74*0fca6ea1SDimitry Andric encodeImm()75*0fca6ea1SDimitry Andric unsigned encodeImm() { 76*0fca6ea1SDimitry Andric // Handle the first Single Use Region separately as it has an extra bit 77*0fca6ea1SDimitry Andric // of encoding space. 
78*0fca6ea1SDimitry Andric unsigned Imm = SingleUseRegions[SkipRegions.size()]; 79*0fca6ea1SDimitry Andric unsigned ShiftAmount = 4; 80*0fca6ea1SDimitry Andric for (unsigned i = SkipRegions.size(); i > 0; i--) { 81*0fca6ea1SDimitry Andric Imm |= SkipRegions[i - 1] << ShiftAmount; 82*0fca6ea1SDimitry Andric ShiftAmount += 3; 83*0fca6ea1SDimitry Andric Imm |= SingleUseRegions[i - 1] << ShiftAmount; 84*0fca6ea1SDimitry Andric ShiftAmount += 3; 85*0fca6ea1SDimitry Andric } 86*0fca6ea1SDimitry Andric return Imm; 87*0fca6ea1SDimitry Andric } 88*0fca6ea1SDimitry Andric 89*0fca6ea1SDimitry Andric public: SingleUseInstruction(const unsigned ProducerPosition,MachineInstr * Producer)90*0fca6ea1SDimitry Andric SingleUseInstruction(const unsigned ProducerPosition, 91*0fca6ea1SDimitry Andric MachineInstr *Producer) 92*0fca6ea1SDimitry Andric : LastEncodedPositionEnd(ProducerPosition + 1), ProducerInstr(Producer), 93*0fca6ea1SDimitry Andric SingleUseRegions({1, 0, 0}) {} 94*0fca6ea1SDimitry Andric 95*0fca6ea1SDimitry Andric // Returns false if adding a new single use producer failed. This happens 96*0fca6ea1SDimitry Andric // because it could not be encoded, either because there is no room to 97*0fca6ea1SDimitry Andric // encode another single use producer region or that this single use 98*0fca6ea1SDimitry Andric // producer is too far away to encode the amount of instructions to skip. tryAddProducer(const unsigned ProducerPosition,MachineInstr * MI)99*0fca6ea1SDimitry Andric bool tryAddProducer(const unsigned ProducerPosition, MachineInstr *MI) { 100*0fca6ea1SDimitry Andric // Producer is too far away to encode into this instruction or another 101*0fca6ea1SDimitry Andric // skip region is needed and SkipRegions.size() = 2 so there's no room for 102*0fca6ea1SDimitry Andric // another skip region, therefore a new instruction is needed. 
103*0fca6ea1SDimitry Andric if (LastEncodedPositionEnd + 104*0fca6ea1SDimitry Andric (MaxSkipRange * (MaxNumberOfSkipRegions - SkipRegions.size())) < 105*0fca6ea1SDimitry Andric ProducerPosition) 106*0fca6ea1SDimitry Andric return false; 107*0fca6ea1SDimitry Andric 108*0fca6ea1SDimitry Andric // If a skip region is needed. 109*0fca6ea1SDimitry Andric if (LastEncodedPositionEnd != ProducerPosition || 110*0fca6ea1SDimitry Andric !currentRegionHasSpace()) { 111*0fca6ea1SDimitry Andric // If the current region is out of space therefore a skip region would 112*0fca6ea1SDimitry Andric // be needed, but there is no room for another skip region. 113*0fca6ea1SDimitry Andric if (SkipRegions.size() == MaxNumberOfSkipRegions) 114*0fca6ea1SDimitry Andric return false; 115*0fca6ea1SDimitry Andric skip(ProducerPosition); 116*0fca6ea1SDimitry Andric } 117*0fca6ea1SDimitry Andric 118*0fca6ea1SDimitry Andric SingleUseRegions[SkipRegions.size()]++; 119*0fca6ea1SDimitry Andric LastEncodedPositionEnd = ProducerPosition + 1; 120*0fca6ea1SDimitry Andric ProducerInstr = MI; 121*0fca6ea1SDimitry Andric return true; 122*0fca6ea1SDimitry Andric } 123*0fca6ea1SDimitry Andric emit(const SIInstrInfo * SII)124*0fca6ea1SDimitry Andric auto emit(const SIInstrInfo *SII) { 125*0fca6ea1SDimitry Andric return BuildMI(*ProducerInstr->getParent(), ProducerInstr, DebugLoc(), 126*0fca6ea1SDimitry Andric SII->get(AMDGPU::S_SINGLEUSE_VDST)) 127*0fca6ea1SDimitry Andric .addImm(encodeImm()); 128*0fca6ea1SDimitry Andric } 129*0fca6ea1SDimitry Andric }; 1305f757f3fSDimitry Andric 1315f757f3fSDimitry Andric public: 1325f757f3fSDimitry Andric static char ID; 1335f757f3fSDimitry Andric AMDGPUInsertSingleUseVDST()1345f757f3fSDimitry Andric AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {} 1355f757f3fSDimitry Andric insertSingleUseInstructions(ArrayRef<std::pair<unsigned,MachineInstr * >> SingleUseProducers) const136*0fca6ea1SDimitry Andric void insertSingleUseInstructions( 137*0fca6ea1SDimitry Andric 
ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const { 138*0fca6ea1SDimitry Andric SmallVector<SingleUseInstruction> Instructions; 139*0fca6ea1SDimitry Andric 140*0fca6ea1SDimitry Andric for (auto &[Position, MI] : SingleUseProducers) { 141*0fca6ea1SDimitry Andric // Encode this position into the last single use instruction if possible. 142*0fca6ea1SDimitry Andric if (Instructions.empty() || 143*0fca6ea1SDimitry Andric !Instructions.back().tryAddProducer(Position, MI)) { 144*0fca6ea1SDimitry Andric // If not, add a new instruction. 145*0fca6ea1SDimitry Andric Instructions.push_back(SingleUseInstruction(Position, MI)); 146*0fca6ea1SDimitry Andric } 147*0fca6ea1SDimitry Andric } 148*0fca6ea1SDimitry Andric 149*0fca6ea1SDimitry Andric for (auto &Instruction : Instructions) 150*0fca6ea1SDimitry Andric Instruction.emit(SII); 1515f757f3fSDimitry Andric } 1525f757f3fSDimitry Andric runOnMachineFunction(MachineFunction & MF)1535f757f3fSDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override { 1545f757f3fSDimitry Andric const auto &ST = MF.getSubtarget<GCNSubtarget>(); 1555f757f3fSDimitry Andric if (!ST.hasVGPRSingleUseHintInsts()) 1565f757f3fSDimitry Andric return false; 1575f757f3fSDimitry Andric 1585f757f3fSDimitry Andric SII = ST.getInstrInfo(); 1595f757f3fSDimitry Andric const auto *TRI = &SII->getRegisterInfo(); 1605f757f3fSDimitry Andric bool InstructionEmitted = false; 1615f757f3fSDimitry Andric 1625f757f3fSDimitry Andric for (MachineBasicBlock &MBB : MF) { 163*0fca6ea1SDimitry Andric DenseMap<MCRegUnit, unsigned> RegisterUseCount; 1645f757f3fSDimitry Andric 1655f757f3fSDimitry Andric // Handle boundaries at the end of basic block separately to avoid 1665f757f3fSDimitry Andric // false positives. If they are live at the end of a basic block then 1675f757f3fSDimitry Andric // assume it has more uses later on. 
168*0fca6ea1SDimitry Andric for (const auto &Liveout : MBB.liveouts()) { 169*0fca6ea1SDimitry Andric for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid(); 170*0fca6ea1SDimitry Andric ++Units) { 171*0fca6ea1SDimitry Andric const auto [Unit, Mask] = *Units; 172*0fca6ea1SDimitry Andric if ((Mask & Liveout.LaneMask).any()) 173*0fca6ea1SDimitry Andric RegisterUseCount[Unit] = 2; 174*0fca6ea1SDimitry Andric } 175*0fca6ea1SDimitry Andric } 1765f757f3fSDimitry Andric 177*0fca6ea1SDimitry Andric SmallVector<std::pair<unsigned, MachineInstr *>> 178*0fca6ea1SDimitry Andric SingleUseProducerPositions; 179*0fca6ea1SDimitry Andric 180*0fca6ea1SDimitry Andric unsigned VALUInstrCount = 0; 1815f757f3fSDimitry Andric for (MachineInstr &MI : reverse(MBB.instrs())) { 1825f757f3fSDimitry Andric // All registers in all operands need to be single use for an 1835f757f3fSDimitry Andric // instruction to be marked as a single use producer. 1845f757f3fSDimitry Andric bool AllProducerOperandsAreSingleUse = true; 1855f757f3fSDimitry Andric 186*0fca6ea1SDimitry Andric // Gather a list of Registers used before updating use counts to avoid 187*0fca6ea1SDimitry Andric // double counting registers that appear multiple times in a single 188*0fca6ea1SDimitry Andric // MachineInstr. 189*0fca6ea1SDimitry Andric SmallVector<MCRegUnit> RegistersUsed; 190*0fca6ea1SDimitry Andric 191*0fca6ea1SDimitry Andric for (const auto &Operand : MI.all_defs()) { 192*0fca6ea1SDimitry Andric const auto Reg = Operand.getReg(); 193*0fca6ea1SDimitry Andric 194*0fca6ea1SDimitry Andric const auto RegUnits = TRI->regunits(Reg); 195*0fca6ea1SDimitry Andric if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit Unit) { 196*0fca6ea1SDimitry Andric return RegisterUseCount[Unit] > 1; 197*0fca6ea1SDimitry Andric })) 198*0fca6ea1SDimitry Andric AllProducerOperandsAreSingleUse = false; 199*0fca6ea1SDimitry Andric 200*0fca6ea1SDimitry Andric // Reset uses count when a register is no longer live. 
201*0fca6ea1SDimitry Andric for (const MCRegUnit Unit : RegUnits) 202*0fca6ea1SDimitry Andric RegisterUseCount.erase(Unit); 203*0fca6ea1SDimitry Andric } 204*0fca6ea1SDimitry Andric 205*0fca6ea1SDimitry Andric for (const auto &Operand : MI.all_uses()) { 2065f757f3fSDimitry Andric const auto Reg = Operand.getReg(); 2075f757f3fSDimitry Andric 2085f757f3fSDimitry Andric // Count the number of times each register is read. 209*0fca6ea1SDimitry Andric for (const MCRegUnit Unit : TRI->regunits(Reg)) { 210*0fca6ea1SDimitry Andric if (!is_contained(RegistersUsed, Unit)) 211*0fca6ea1SDimitry Andric RegistersUsed.push_back(Unit); 212*0fca6ea1SDimitry Andric } 213*0fca6ea1SDimitry Andric } 214*0fca6ea1SDimitry Andric for (const MCRegUnit Unit : RegistersUsed) 215*0fca6ea1SDimitry Andric RegisterUseCount[Unit]++; 2165f757f3fSDimitry Andric 2175f757f3fSDimitry Andric // Do not attempt to optimise across exec mask changes. 218*0fca6ea1SDimitry Andric if (MI.modifiesRegister(AMDGPU::EXEC, TRI) || 219*0fca6ea1SDimitry Andric AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) { 2205f757f3fSDimitry Andric for (auto &UsedReg : RegisterUseCount) 2215f757f3fSDimitry Andric UsedReg.second = 2; 2225f757f3fSDimitry Andric } 2235f757f3fSDimitry Andric 224*0fca6ea1SDimitry Andric if (!SIInstrInfo::isVALU(MI) || 225*0fca6ea1SDimitry Andric AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode())) 2265f757f3fSDimitry Andric continue; 227*0fca6ea1SDimitry Andric if (AllProducerOperandsAreSingleUse) { 228*0fca6ea1SDimitry Andric SingleUseProducerPositions.push_back({VALUInstrCount, &MI}); 2295f757f3fSDimitry Andric InstructionEmitted = true; 2305f757f3fSDimitry Andric } 231*0fca6ea1SDimitry Andric VALUInstrCount++; 2325f757f3fSDimitry Andric } 233*0fca6ea1SDimitry Andric insertSingleUseInstructions(SingleUseProducerPositions); 2345f757f3fSDimitry Andric } 2355f757f3fSDimitry Andric return InstructionEmitted; 2365f757f3fSDimitry Andric } 2375f757f3fSDimitry Andric }; 2385f757f3fSDimitry Andric 
} // namespace 2395f757f3fSDimitry Andric 2405f757f3fSDimitry Andric char AMDGPUInsertSingleUseVDST::ID = 0; 2415f757f3fSDimitry Andric 2425f757f3fSDimitry Andric char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID; 2435f757f3fSDimitry Andric 2445f757f3fSDimitry Andric INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE, 2455f757f3fSDimitry Andric "AMDGPU Insert SingleUseVDST", false, false) 246