//===- GCNVOPDUtils.cpp - GCN VOPD Utils ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains the AMDGPU DAG scheduling
/// mutation to pair VOPD instructions back to back. It also contains
/// subroutines useful in the creation of VOPD instructions.
//
//===----------------------------------------------------------------------===//

#include "GCNVOPDUtils.h"
#include "AMDGPUSubtarget.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MacroFusion.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCInst.h"

using namespace llvm;

#define DEBUG_TYPE "gcn-vopd-utils"

bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
                                   const MachineInstr &FirstMI,
                                   const MachineInstr &SecondMI) {
  const MachineFunction *MF = FirstMI.getMF();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = dyn_cast<SIRegisterInfo>(ST.getRegisterInfo());
  const MachineRegisterInfo &MRI = MF->getRegInfo();
  const unsigned NumVGPRBanks = 4;
  // Literals also count against the scalar bus limit.
  SmallVector<const MachineOperand *> UniqueLiterals;
  auto addLiteral = [&](const MachineOperand &Op) {
    for (auto &Literal : UniqueLiterals) {
      if (Literal->isIdenticalTo(Op))
        return;
    }
    UniqueLiterals.push_back(&Op);
  };
  SmallVector<Register> UniqueScalarRegs;
  assert([&]() -> bool {
    for (auto MII = MachineBasicBlock::const_iterator(&FirstMI);
         MII != FirstMI.getParent()->instr_end(); ++MII) {
      if (&*MII == &SecondMI)
        return true;
    }
    return false;
  }() && "Expected FirstMI to precede SecondMI");
  // Cannot pair dependent instructions.
  for (const auto &Use : SecondMI.uses())
    if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg()))
      return false;

  struct ComponentInfo {
    ComponentInfo(const MachineInstr &MI) : MI(MI) {}
    Register Dst, Reg0, Reg1, Reg2;
    const MachineInstr &MI;
  };
  ComponentInfo CInfo[] = {ComponentInfo(FirstMI), ComponentInfo(SecondMI)};

  for (ComponentInfo &Comp : CInfo) {
    switch (Comp.MI.getOpcode()) {
    case AMDGPU::V_FMAMK_F32:
      // Cannot inline the fixed literal in fmamk.
      addLiteral(Comp.MI.getOperand(2));
      Comp.Reg2 = Comp.MI.getOperand(3).getReg();
      break;
    case AMDGPU::V_FMAAK_F32:
      // Cannot inline the fixed literal in fmaak.
      addLiteral(Comp.MI.getOperand(3));
      Comp.Reg1 = Comp.MI.getOperand(2).getReg();
      break;
    case AMDGPU::V_FMAC_F32_e32:
    case AMDGPU::V_DOT2_F32_F16:
    case AMDGPU::V_DOT2_F32_BF16:
      Comp.Reg1 = Comp.MI.getOperand(2).getReg();
      Comp.Reg2 = Comp.MI.getOperand(0).getReg();
      break;
    case AMDGPU::V_CNDMASK_B32_e32:
      UniqueScalarRegs.push_back(AMDGPU::VCC_LO);
      Comp.Reg1 = Comp.MI.getOperand(2).getReg();
      break;
    case AMDGPU::V_MOV_B32_e32:
      break;
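    // All remaining VOPD candidates are VOP2-style; src1 lives in operand 2.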
    default:
      Comp.Reg1 = Comp.MI.getOperand(2).getReg();
      break;
    }

    Comp.Dst = Comp.MI.getOperand(0).getReg();

    const MachineOperand &Op0 = Comp.MI.getOperand(1);
    if (Op0.isReg()) {
      if (!TRI->isVectorRegister(MRI, Op0.getReg())) {
        if (!is_contained(UniqueScalarRegs, Op0.getReg()))
          UniqueScalarRegs.push_back(Op0.getReg());
      } else
        Comp.Reg0 = Op0.getReg();
    } else {
      if (!TII.isInlineConstant(Comp.MI, 1))
        addLiteral(Op0);
    }
  }

  if (UniqueLiterals.size() > 1)
    return false;
  if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2)
    return false;

  // Check port 0.
  if (CInfo[0].Reg0 && CInfo[1].Reg0 &&
      CInfo[0].Reg0 % NumVGPRBanks == CInfo[1].Reg0 % NumVGPRBanks)
    return false;
  // Check port 1.
  if (CInfo[0].Reg1 && CInfo[1].Reg1 &&
      CInfo[0].Reg1 % NumVGPRBanks == CInfo[1].Reg1 % NumVGPRBanks)
    return false;
  // Check port 2.
  if (CInfo[0].Reg2 && CInfo[1].Reg2 &&
      !((CInfo[0].Reg2 ^ CInfo[1].Reg2) & 0x1))
    return false;
  // The two destinations must have opposite even/odd parity.
  if (!((CInfo[0].Dst ^ CInfo[1].Dst) & 0x1))
    return false;

  LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI
                    << "\n\tY: " << SecondMI << "\n");
  return true;
}

/// Check if the instruction pair, FirstMI and SecondMI, should be scheduled
/// together. When FirstMI is unspecified, check whether SecondMI may be part
/// of a fused pair at all.
static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII,
                                       const TargetSubtargetInfo &TSI,
                                       const MachineInstr *FirstMI,
                                       const MachineInstr &SecondMI) {
  const SIInstrInfo &STII = static_cast<const SIInstrInfo &>(TII);
  unsigned Opc2 = SecondMI.getOpcode();
  auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2);

  // One instruction case.
  if (!FirstMI)
    return SecondCanBeVOPD.Y;

  unsigned Opc = FirstMI->getOpcode();
  auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc);

  if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) ||
        (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)))
    return false;

  return checkVOPDRegConstraints(STII, *FirstMI, SecondMI);
}

/// Adapts design from MacroFusion.
/// Puts valid candidate instructions back-to-back so they can easily
/// be turned into VOPD instructions.
/// Greedily pairs instruction candidates. O(n^2) algorithm.
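/// The pairing relies on the MacroFusion helpers hasLessThanNumFused() and
/// fuseInstructionPair() (from llvm/CodeGen/MacroFusion.h) to add the cluster
/// edges that keep a chosen pair back-to-back.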
struct VOPDPairingMutation : ScheduleDAGMutation {
  ShouldSchedulePredTy shouldScheduleAdjacent; // NOLINT: function pointer

  VOPDPairingMutation(
      ShouldSchedulePredTy shouldScheduleAdjacent) // NOLINT: function pointer
      : shouldScheduleAdjacent(shouldScheduleAdjacent) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    const TargetInstrInfo &TII = *DAG->TII;
    const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
    if (!AMDGPU::hasVOPD(ST) || !ST.isWave32()) {
      LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n");
      return;
    }

    std::vector<SUnit>::iterator ISUI, JSUI;
    for (ISUI = DAG->SUnits.begin(); ISUI != DAG->SUnits.end(); ++ISUI) {
      const MachineInstr *IMI = ISUI->getInstr();
      if (!shouldScheduleAdjacent(TII, ST, nullptr, *IMI))
        continue;
      if (!hasLessThanNumFused(*ISUI, 2))
        continue;

      for (JSUI = ISUI + 1; JSUI != DAG->SUnits.end(); ++JSUI) {
        if (JSUI->isBoundaryNode())
          continue;
        const MachineInstr *JMI = JSUI->getInstr();
        if (!hasLessThanNumFused(*JSUI, 2) ||
            !shouldScheduleAdjacent(TII, ST, IMI, *JMI))
          continue;
        if (fuseInstructionPair(*DAG, *ISUI, *JSUI))
          break;
      }
    }
    LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n");
  }
};

std::unique_ptr<ScheduleDAGMutation> llvm::createVOPDPairingMutation() {
  return std::make_unique<VOPDPairingMutation>(shouldScheduleVOPDAdjacent);
}
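
// Illustrative sketch only, not code from this file: a target would typically
// attach this mutation while constructing its post-RA scheduler. The
// createGenericSchedPostRA() helper from llvm/CodeGen/MachineScheduler.h is
// assumed here for the sake of the example:
//
//   ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
//   DAG->addMutation(createVOPDPairingMutation());
//   return DAG;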