1 //===-- GCNPreRAOptimizations.cpp -----------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// This pass combines split register tuple initialization into a single pseudo: 11 /// 12 /// undef %0.sub1:sreg_64 = S_MOV_B32 1 13 /// %0.sub0:sreg_64 = S_MOV_B32 2 14 /// => 15 /// %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 0x200000001 16 /// 17 /// This is to allow rematerialization of a value instead of spilling. It is 18 /// supposed to be done after register coalescer to allow it to do its job and 19 /// before actual register allocation to allow rematerialization. 20 /// 21 /// Right now the pass only handles 64 bit SGPRs with immediate initializers, 22 /// although the same shall be possible with other register classes and 23 /// instructions if necessary. 24 /// 25 /// This pass also adds register allocation hints to COPY. 26 /// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints. 27 /// When using True16, we often see COPY moving a 16-bit value between a VGPR_32 28 /// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of 29 /// the VGPR_32, the COPY can be completely eliminated. 
///
//===----------------------------------------------------------------------===//

#include "GCNPreRAOptimizations.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"

namespace {

/// Shared implementation driven by both the legacy and the new pass manager
/// wrappers. run() walks every virtual register and dispatches to processReg.
class GCNPreRAOptimizationsImpl {
private:
  const SIInstrInfo *TII;    // Initialized in run() from the GCN subtarget.
  const SIRegisterInfo *TRI; // Initialized in run() from the GCN subtarget.
  MachineRegisterInfo *MRI;  // Initialized in run() from the function.
  LiveIntervals *LIS;        // Supplied by the owning pass wrapper.

  /// Examine all definitions of \p Reg and rewrite them when profitable
  /// (combine split SGPR immediate init, forward AGPR copy sources).
  /// Returns true if any instruction was changed.
  bool processReg(Register Reg);

public:
  GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
  bool run(MachineFunction &MF);
};

/// Legacy pass-manager wrapper; requires LiveIntervals and delegates all work
/// to GCNPreRAOptimizationsImpl.
class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
public:
  static char ID;

  GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {
    initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU Pre-RA optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervalsWrapperPass>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
} // End anonymous namespace.
83 84 INITIALIZE_PASS_BEGIN(GCNPreRAOptimizationsLegacy, DEBUG_TYPE, 85 "AMDGPU Pre-RA optimizations", false, false) 86 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) 87 INITIALIZE_PASS_END(GCNPreRAOptimizationsLegacy, DEBUG_TYPE, 88 "Pre-RA optimizations", false, false) 89 90 char GCNPreRAOptimizationsLegacy::ID = 0; 91 92 char &llvm::GCNPreRAOptimizationsID = GCNPreRAOptimizationsLegacy::ID; 93 94 FunctionPass *llvm::createGCNPreRAOptimizationsLegacyPass() { 95 return new GCNPreRAOptimizationsLegacy(); 96 } 97 98 bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { 99 MachineInstr *Def0 = nullptr; 100 MachineInstr *Def1 = nullptr; 101 uint64_t Init = 0; 102 bool Changed = false; 103 SmallSet<Register, 32> ModifiedRegs; 104 bool IsAGPRDst = TRI->isAGPRClass(MRI->getRegClass(Reg)); 105 106 for (MachineInstr &I : MRI->def_instructions(Reg)) { 107 switch (I.getOpcode()) { 108 default: 109 return false; 110 case AMDGPU::V_ACCVGPR_WRITE_B32_e64: 111 break; 112 case AMDGPU::COPY: { 113 // Some subtargets cannot do an AGPR to AGPR copy directly, and need an 114 // intermdiate temporary VGPR register. Try to find the defining 115 // accvgpr_write to avoid temporary registers. 116 117 if (!IsAGPRDst) 118 return false; 119 120 Register SrcReg = I.getOperand(1).getReg(); 121 122 if (!SrcReg.isVirtual()) 123 break; 124 125 // Check if source of copy is from another AGPR. 126 bool IsAGPRSrc = TRI->isAGPRClass(MRI->getRegClass(SrcReg)); 127 if (!IsAGPRSrc) 128 break; 129 130 // def_instructions() does not look at subregs so it may give us a 131 // different instruction that defines the same vreg but different subreg 132 // so we have to manually check subreg. 
133 Register SrcSubReg = I.getOperand(1).getSubReg(); 134 for (auto &Def : MRI->def_instructions(SrcReg)) { 135 if (SrcSubReg != Def.getOperand(0).getSubReg()) 136 continue; 137 138 if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { 139 MachineOperand DefSrcMO = Def.getOperand(1); 140 141 // Immediates are not an issue and can be propagated in 142 // postrapseudos pass. Only handle cases where defining 143 // accvgpr_write source is a vreg. 144 if (DefSrcMO.isReg() && DefSrcMO.getReg().isVirtual()) { 145 // Propagate source reg of accvgpr write to this copy instruction 146 I.getOperand(1).setReg(DefSrcMO.getReg()); 147 I.getOperand(1).setSubReg(DefSrcMO.getSubReg()); 148 149 // Reg uses were changed, collect unique set of registers to update 150 // live intervals at the end. 151 ModifiedRegs.insert(DefSrcMO.getReg()); 152 ModifiedRegs.insert(SrcReg); 153 154 Changed = true; 155 } 156 157 // Found the defining accvgpr_write, stop looking any further. 158 break; 159 } 160 } 161 break; 162 } 163 case AMDGPU::S_MOV_B32: 164 if (I.getOperand(0).getReg() != Reg || !I.getOperand(1).isImm() || 165 I.getNumOperands() != 2) 166 return false; 167 168 switch (I.getOperand(0).getSubReg()) { 169 default: 170 return false; 171 case AMDGPU::sub0: 172 if (Def0) 173 return false; 174 Def0 = &I; 175 Init |= Lo_32(I.getOperand(1).getImm()); 176 break; 177 case AMDGPU::sub1: 178 if (Def1) 179 return false; 180 Def1 = &I; 181 Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32; 182 break; 183 } 184 break; 185 } 186 } 187 188 // For AGPR reg, check if live intervals need to be updated. 189 if (IsAGPRDst) { 190 if (Changed) { 191 for (Register RegToUpdate : ModifiedRegs) { 192 LIS->removeInterval(RegToUpdate); 193 LIS->createAndComputeVirtRegInterval(RegToUpdate); 194 } 195 } 196 197 return Changed; 198 } 199 200 // For SGPR reg, check if we can combine instructions. 
201 if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent()) 202 return Changed; 203 204 LLVM_DEBUG(dbgs() << "Combining:\n " << *Def0 << " " << *Def1 205 << " =>\n"); 206 207 if (SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*Def1), 208 LIS->getInstructionIndex(*Def0))) 209 std::swap(Def0, Def1); 210 211 LIS->RemoveMachineInstrFromMaps(*Def0); 212 LIS->RemoveMachineInstrFromMaps(*Def1); 213 auto NewI = BuildMI(*Def0->getParent(), *Def0, Def0->getDebugLoc(), 214 TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), Reg) 215 .addImm(Init); 216 217 Def0->eraseFromParent(); 218 Def1->eraseFromParent(); 219 LIS->InsertMachineInstrInMaps(*NewI); 220 LIS->removeInterval(Reg); 221 LIS->createAndComputeVirtRegInterval(Reg); 222 223 LLVM_DEBUG(dbgs() << " " << *NewI); 224 225 return true; 226 } 227 228 bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) { 229 if (skipFunction(MF.getFunction())) 230 return false; 231 LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS(); 232 return GCNPreRAOptimizationsImpl(LIS).run(MF); 233 } 234 235 PreservedAnalyses 236 GCNPreRAOptimizationsPass::run(MachineFunction &MF, 237 MachineFunctionAnalysisManager &MFAM) { 238 LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF); 239 GCNPreRAOptimizationsImpl(LIS).run(MF); 240 return PreservedAnalyses::all(); 241 } 242 243 bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { 244 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 245 TII = ST.getInstrInfo(); 246 MRI = &MF.getRegInfo(); 247 TRI = ST.getRegisterInfo(); 248 249 bool Changed = false; 250 251 for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) { 252 Register Reg = Register::index2VirtReg(I); 253 if (!LIS->hasInterval(Reg)) 254 continue; 255 const TargetRegisterClass *RC = MRI->getRegClass(Reg); 256 if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) && 257 (ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC))) 258 continue; 259 260 Changed |= processReg(Reg); 261 } 
262 263 if (!ST.useRealTrue16Insts()) 264 return Changed; 265 266 // Add RA hints to improve True16 COPY elimination. 267 for (const MachineBasicBlock &MBB : MF) { 268 for (const MachineInstr &MI : MBB) { 269 if (MI.getOpcode() != AMDGPU::COPY) 270 continue; 271 Register Dst = MI.getOperand(0).getReg(); 272 Register Src = MI.getOperand(1).getReg(); 273 if (Dst.isVirtual() && 274 MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass && 275 Src.isPhysical() && 276 TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass) 277 MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16)); 278 if (Src.isVirtual() && 279 MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass && 280 Dst.isPhysical() && 281 TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass) 282 MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16)); 283 if (!Dst.isVirtual() || !Src.isVirtual()) 284 continue; 285 if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass && 286 MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) { 287 MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src); 288 MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst); 289 } 290 if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass && 291 MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass) 292 MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src); 293 } 294 } 295 296 return Changed; 297 } 298