1 //===----------------------- SIFrameLowering.cpp --------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 9 #include "SIFrameLowering.h" 10 #include "AMDGPU.h" 11 #include "GCNSubtarget.h" 12 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 13 #include "SIMachineFunctionInfo.h" 14 #include "llvm/CodeGen/LiveRegUnits.h" 15 #include "llvm/CodeGen/MachineFrameInfo.h" 16 #include "llvm/CodeGen/RegisterScavenging.h" 17 #include "llvm/Target/TargetMachine.h" 18 19 using namespace llvm; 20 21 #define DEBUG_TYPE "frame-info" 22 23 static cl::opt<bool> EnableSpillVGPRToAGPR( 24 "amdgpu-spill-vgpr-to-agpr", 25 cl::desc("Enable spilling VGPRs to AGPRs"), 26 cl::ReallyHidden, 27 cl::init(true)); 28 29 // Find a register matching \p RC from \p LiveUnits which is unused and 30 // available throughout the function. On failure, returns AMDGPU::NoRegister. 31 // TODO: Rewrite the loop here to iterate over MCRegUnits instead of 32 // MCRegisters. This should reduce the number of iterations and avoid redundant 33 // checking. 34 static MCRegister findUnusedRegister(MachineRegisterInfo &MRI, 35 const LiveRegUnits &LiveUnits, 36 const TargetRegisterClass &RC) { 37 for (MCRegister Reg : RC) { 38 if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) && 39 !MRI.isReserved(Reg)) 40 return Reg; 41 } 42 return MCRegister(); 43 } 44 45 // Find a scratch register that we can use in the prologue. We avoid using 46 // callee-save registers since they may appear to be free when this is called 47 // from canUseAsPrologue (during shrink wrapping), but then no longer be free 48 // when this is called from emitPrologue. 49 static MCRegister findScratchNonCalleeSaveRegister( 50 MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits, 51 const TargetRegisterClass &RC, bool Unused = false) { 52 // Mark callee saved registers as used so we will not choose them. 53 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); 54 for (unsigned i = 0; CSRegs[i]; ++i) 55 LiveUnits.addReg(CSRegs[i]); 56 57 // We are looking for a register that can be used throughout the entire 58 // function, so any use is unacceptable. 59 if (Unused) 60 return findUnusedRegister(MRI, LiveUnits, RC); 61 62 for (MCRegister Reg : RC) { 63 if (LiveUnits.available(Reg) && !MRI.isReserved(Reg)) 64 return Reg; 65 } 66 67 return MCRegister(); 68 } 69 70 /// Query target location for spilling SGPRs 71 /// \p IncludeScratchCopy : Also look for free scratch SGPRs 72 static void getVGPRSpillLaneOrTempRegister( 73 MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR, 74 const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass, 75 bool IncludeScratchCopy = true) { 76 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 77 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 78 79 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 80 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 81 unsigned Size = TRI->getSpillSize(RC); 82 Align Alignment = TRI->getSpillAlign(RC); 83 84 // We need to save and restore the given SGPR. 85 86 Register ScratchSGPR; 87 // 1: Try to save the given register into an unused scratch SGPR. The 88 // LiveUnits should have all the callee saved registers marked as used. For 89 // certain cases we skip copy to scratch SGPR. 
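  // Illustration only (not emitted here): the save kind chosen below is later
  // lowered by PrologEpilogSGPRSpillBuilder roughly as follows, where the
  // register names and the lane are placeholders:
  //   COPY_TO_SCRATCH_SGPR: s_mov_b32 s_scratch, s_saved
  //   SPILL_TO_VGPR_LANE:   v_writelane_b32 v_spill, s_saved, lane
  //   SPILL_TO_MEM:         v_mov_b32 v_tmp, s_saved
  //                         buffer_store_dword v_tmp, ... (or scratch_store)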
90 if (IncludeScratchCopy) 91 ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC); 92 93 if (!ScratchSGPR) { 94 int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr, 95 TargetStackID::SGPRSpill); 96 97 if (TRI->spillSGPRToVGPR() && 98 MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) { 99 // 2: There's no free lane to spill, and no free register to save the 100 // SGPR, so we're forced to take another VGPR to use for the spill. 101 MFI->addToPrologEpilogSGPRSpills( 102 SGPR, PrologEpilogSGPRSaveRestoreInfo( 103 SGPRSaveKind::SPILL_TO_VGPR_LANE, FI)); 104 105 LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front(); 106 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " 107 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane 108 << '\n';); 109 } else { 110 // Remove dead <FI> index 111 MF.getFrameInfo().RemoveStackObject(FI); 112 // 3: If all else fails, spill the register to memory. 113 FI = FrameInfo.CreateSpillStackObject(Size, Alignment); 114 MFI->addToPrologEpilogSGPRSpills( 115 SGPR, 116 PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI)); 117 LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling " 118 << printReg(SGPR, TRI) << '\n'); 119 } 120 } else { 121 MFI->addToPrologEpilogSGPRSpills( 122 SGPR, PrologEpilogSGPRSaveRestoreInfo( 123 SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR)); 124 LiveUnits.addReg(ScratchSGPR); 125 LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to " 126 << printReg(ScratchSGPR, TRI) << '\n'); 127 } 128 } 129 130 // We need to specially emit stack operations here because a different frame 131 // register is used than in the rest of the function, as getFrameRegister would 132 // use. 133 static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, 134 const SIMachineFunctionInfo &FuncInfo, 135 LiveRegUnits &LiveUnits, MachineFunction &MF, 136 MachineBasicBlock &MBB, 137 MachineBasicBlock::iterator I, const DebugLoc &DL, 138 Register SpillReg, int FI, Register FrameReg, 139 int64_t DwordOff = 0) { 140 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 141 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 142 143 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 144 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 145 MachineMemOperand *MMO = MF.getMachineMemOperand( 146 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI), 147 FrameInfo.getObjectAlign(FI)); 148 LiveUnits.addReg(SpillReg); 149 bool IsKill = !MBB.isLiveIn(SpillReg); 150 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg, 151 DwordOff, MMO, nullptr, &LiveUnits); 152 if (IsKill) 153 LiveUnits.removeReg(SpillReg); 154 } 155 156 static void buildEpilogRestore(const GCNSubtarget &ST, 157 const SIRegisterInfo &TRI, 158 const SIMachineFunctionInfo &FuncInfo, 159 LiveRegUnits &LiveUnits, MachineFunction &MF, 160 MachineBasicBlock &MBB, 161 MachineBasicBlock::iterator I, 162 const DebugLoc &DL, Register SpillReg, int FI, 163 Register FrameReg, int64_t DwordOff = 0) { 164 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR 165 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 166 167 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 168 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 169 MachineMemOperand *MMO = MF.getMachineMemOperand( 170 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI), 171 FrameInfo.getObjectAlign(FI)); 172 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg, 173 DwordOff, MMO, nullptr, &LiveUnits); 174 } 175 176 static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 177 const DebugLoc &DL, const SIInstrInfo *TII, 178 Register TargetReg) { 179 MachineFunction *MF = MBB.getParent(); 180 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 181 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 182 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); 183 Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0); 184 Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1); 185 186 if (MFI->getGITPtrHigh() != 0xffffffff) { 187 BuildMI(MBB, I, DL, SMovB32, TargetHi) 188 .addImm(MFI->getGITPtrHigh()) 189 .addReg(TargetReg, RegState::ImplicitDefine); 190 } else { 191 const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); 192 BuildMI(MBB, I, DL, GetPC64, TargetReg); 193 } 194 Register GitPtrLo = MFI->getGITPtrLoReg(*MF); 195 MF->getRegInfo().addLiveIn(GitPtrLo); 196 MBB.addLiveIn(GitPtrLo); 197 BuildMI(MBB, I, DL, SMovB32, TargetLo) 198 .addReg(GitPtrLo); 199 } 200 201 static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI, 202 const SIMachineFunctionInfo *FuncInfo, 203 MachineFunction &MF, MachineBasicBlock &MBB, 204 MachineBasicBlock::iterator MBBI, bool IsProlog) { 205 if (LiveUnits.empty()) { 206 LiveUnits.init(TRI); 207 if (IsProlog) { 208 LiveUnits.addLiveIns(MBB); 209 } else { 210 // In epilog. 211 LiveUnits.addLiveOuts(MBB); 212 LiveUnits.stepBackward(*MBBI); 213 } 214 } 215 } 216 217 namespace llvm { 218 219 // SpillBuilder to save/restore special SGPR spills like the one needed for FP, 220 // BP, etc. These spills are delayed until the current function's frame is 221 // finalized. For a given register, the builder uses the 222 // PrologEpilogSGPRSaveRestoreInfo to decide the spill method. 223 class PrologEpilogSGPRSpillBuilder { 224 MachineBasicBlock::iterator MI; 225 MachineBasicBlock &MBB; 226 MachineFunction &MF; 227 const GCNSubtarget &ST; 228 MachineFrameInfo &MFI; 229 SIMachineFunctionInfo *FuncInfo; 230 const SIInstrInfo *TII; 231 const SIRegisterInfo &TRI; 232 Register SuperReg; 233 const PrologEpilogSGPRSaveRestoreInfo SI; 234 LiveRegUnits &LiveUnits; 235 const DebugLoc &DL; 236 Register FrameReg; 237 ArrayRef<int16_t> SplitParts; 238 unsigned NumSubRegs; 239 unsigned EltSize = 4; 240 241 void saveToMemory(const int FI) const { 242 MachineRegisterInfo &MRI = MF.getRegInfo(); 243 assert(!MFI.isDeadObjectIndex(FI)); 244 245 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true); 246 247 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 248 MRI, LiveUnits, AMDGPU::VGPR_32RegClass); 249 if (!TmpVGPR) 250 report_fatal_error("failed to find free scratch register"); 251 252 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { 253 Register SubReg = NumSubRegs == 1 254 ? 
SuperReg 255 : Register(TRI.getSubReg(SuperReg, SplitParts[I])); 256 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 257 .addReg(SubReg); 258 259 buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR, 260 FI, FrameReg, DwordOff); 261 DwordOff += 4; 262 } 263 } 264 265 void saveToVGPRLane(const int FI) const { 266 assert(!MFI.isDeadObjectIndex(FI)); 267 268 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); 269 ArrayRef<SIRegisterInfo::SpilledReg> Spill = 270 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); 271 assert(Spill.size() == NumSubRegs); 272 273 for (unsigned I = 0; I < NumSubRegs; ++I) { 274 Register SubReg = NumSubRegs == 1 275 ? SuperReg 276 : Register(TRI.getSubReg(SuperReg, SplitParts[I])); 277 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR), 278 Spill[I].VGPR) 279 .addReg(SubReg) 280 .addImm(Spill[I].Lane) 281 .addReg(Spill[I].VGPR, RegState::Undef); 282 } 283 } 284 285 void copyToScratchSGPR(Register DstReg) const { 286 BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg) 287 .addReg(SuperReg) 288 .setMIFlag(MachineInstr::FrameSetup); 289 } 290 291 void restoreFromMemory(const int FI) { 292 MachineRegisterInfo &MRI = MF.getRegInfo(); 293 294 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false); 295 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 296 MRI, LiveUnits, AMDGPU::VGPR_32RegClass); 297 if (!TmpVGPR) 298 report_fatal_error("failed to find free scratch register"); 299 300 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { 301 Register SubReg = NumSubRegs == 1 302 ? SuperReg 303 : Register(TRI.getSubReg(SuperReg, SplitParts[I])); 304 305 buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, 306 TmpVGPR, FI, FrameReg, DwordOff); 307 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) 308 .addReg(TmpVGPR, RegState::Kill); 309 DwordOff += 4; 310 } 311 } 312 313 void restoreFromVGPRLane(const int FI) { 314 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); 315 ArrayRef<SIRegisterInfo::SpilledReg> Spill = 316 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); 317 assert(Spill.size() == NumSubRegs); 318 319 for (unsigned I = 0; I < NumSubRegs; ++I) { 320 Register SubReg = NumSubRegs == 1 321 ? SuperReg 322 : Register(TRI.getSubReg(SuperReg, SplitParts[I])); 323 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) 324 .addReg(Spill[I].VGPR) 325 .addImm(Spill[I].Lane); 326 } 327 } 328 329 void copyFromScratchSGPR(Register SrcReg) const { 330 BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg) 331 .addReg(SrcReg) 332 .setMIFlag(MachineInstr::FrameDestroy); 333 } 334 335 public: 336 PrologEpilogSGPRSpillBuilder(Register Reg, 337 const PrologEpilogSGPRSaveRestoreInfo SI, 338 MachineBasicBlock &MBB, 339 MachineBasicBlock::iterator MI, 340 const DebugLoc &DL, const SIInstrInfo *TII, 341 const SIRegisterInfo &TRI, 342 LiveRegUnits &LiveUnits, Register FrameReg) 343 : MI(MI), MBB(MBB), MF(*MBB.getParent()), 344 ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()), 345 FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), 346 SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL), 347 FrameReg(FrameReg) { 348 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg); 349 SplitParts = TRI.getRegSplitParts(RC, EltSize); 350 NumSubRegs = SplitParts.empty() ? 
1 : SplitParts.size(); 351 352 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 353 } 354 355 void save() { 356 switch (SI.getKind()) { 357 case SGPRSaveKind::SPILL_TO_MEM: 358 return saveToMemory(SI.getIndex()); 359 case SGPRSaveKind::SPILL_TO_VGPR_LANE: 360 return saveToVGPRLane(SI.getIndex()); 361 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: 362 return copyToScratchSGPR(SI.getReg()); 363 } 364 } 365 366 void restore() { 367 switch (SI.getKind()) { 368 case SGPRSaveKind::SPILL_TO_MEM: 369 return restoreFromMemory(SI.getIndex()); 370 case SGPRSaveKind::SPILL_TO_VGPR_LANE: 371 return restoreFromVGPRLane(SI.getIndex()); 372 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: 373 return copyFromScratchSGPR(SI.getReg()); 374 } 375 } 376 }; 377 378 } // namespace llvm 379 380 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` 381 void SIFrameLowering::emitEntryFunctionFlatScratchInit( 382 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 383 const DebugLoc &DL, Register ScratchWaveOffsetReg) const { 384 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 385 const SIInstrInfo *TII = ST.getInstrInfo(); 386 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 387 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 388 389 // We don't need this if we only have spills since there is no user facing 390 // scratch. 391 392 // TODO: If we know we don't have flat instructions earlier, we can omit 393 // this from the input registers. 394 // 395 // TODO: We only need to know if we access scratch space through a flat 396 // pointer. Because we only detect if flat instructions are used at all, 397 // this will be used more often than necessary on VI. 398 399 Register FlatScrInitLo; 400 Register FlatScrInitHi; 401 402 if (ST.isAmdPalOS()) { 403 // Extract the scratch offset from the descriptor in the GIT 404 LiveRegUnits LiveUnits; 405 LiveUnits.init(*TRI); 406 LiveUnits.addLiveIns(MBB); 407 408 // Find unused reg to load flat scratch init into 409 MachineRegisterInfo &MRI = MF.getRegInfo(); 410 Register FlatScrInit = AMDGPU::NoRegister; 411 ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF); 412 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2; 413 AllSGPR64s = AllSGPR64s.slice( 414 std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded)); 415 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 416 for (MCPhysReg Reg : AllSGPR64s) { 417 if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) && 418 MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { 419 FlatScrInit = Reg; 420 break; 421 } 422 } 423 assert(FlatScrInit && "Failed to find free register for scratch init"); 424 425 FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0); 426 FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1); 427 428 buildGitPtr(MBB, I, DL, TII, FlatScrInit); 429 430 // We now have the GIT ptr - now get the scratch descriptor from the entry 431 // at offset 0 (or offset 16 for a compute shader). 432 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 433 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); 434 auto *MMO = MF.getMachineMemOperand( 435 PtrInfo, 436 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | 437 MachineMemOperand::MODereferenceable, 438 8, Align(4)); 439 unsigned Offset = 440 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 
16 : 0; 441 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); 442 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); 443 BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit) 444 .addReg(FlatScrInit) 445 .addImm(EncodedOffset) // offset 446 .addImm(0) // cpol 447 .addMemOperand(MMO); 448 449 // Mask the offset in [47:0] of the descriptor 450 const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32); 451 auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi) 452 .addReg(FlatScrInitHi) 453 .addImm(0xffff); 454 And->getOperand(3).setIsDead(); // Mark SCC as dead. 455 } else { 456 Register FlatScratchInitReg = 457 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); 458 assert(FlatScratchInitReg); 459 460 MachineRegisterInfo &MRI = MF.getRegInfo(); 461 MRI.addLiveIn(FlatScratchInitReg); 462 MBB.addLiveIn(FlatScratchInitReg); 463 464 FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); 465 FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); 466 } 467 468 // Do a 64-bit pointer add. 469 if (ST.flatScratchIsPointer()) { 470 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 471 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) 472 .addReg(FlatScrInitLo) 473 .addReg(ScratchWaveOffsetReg); 474 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), 475 FlatScrInitHi) 476 .addReg(FlatScrInitHi) 477 .addImm(0); 478 Addc->getOperand(3).setIsDead(); // Mark SCC as dead. 479 480 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). 481 addReg(FlatScrInitLo). 482 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO | 483 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); 484 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). 485 addReg(FlatScrInitHi). 486 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | 487 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); 488 return; 489 } 490 491 // For GFX9. 492 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) 493 .addReg(FlatScrInitLo) 494 .addReg(ScratchWaveOffsetReg); 495 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), 496 AMDGPU::FLAT_SCR_HI) 497 .addReg(FlatScrInitHi) 498 .addImm(0); 499 Addc->getOperand(3).setIsDead(); // Mark SCC as dead. 500 501 return; 502 } 503 504 assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); 505 506 // Copy the size in bytes. 507 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) 508 .addReg(FlatScrInitHi, RegState::Kill); 509 510 // Add wave offset in bytes to private base offset. 511 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. 512 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo) 513 .addReg(FlatScrInitLo) 514 .addReg(ScratchWaveOffsetReg); 515 516 // Convert offset to 256-byte units. 517 auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), 518 AMDGPU::FLAT_SCR_HI) 519 .addReg(FlatScrInitLo, RegState::Kill) 520 .addImm(8); 521 LShr->getOperand(3).setIsDead(); // Mark SCC as dead. 522 } 523 524 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not 525 // memory. They should have been removed by now. 526 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { 527 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 528 I != E; ++I) { 529 if (!MFI.isDeadObjectIndex(I)) 530 return false; 531 } 532 533 return true; 534 } 535 536 // Shift down registers reserved for the scratch RSRC. 
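// For example (illustrative numbers only): with 6 preloaded user/system SGPRs,
// NumPreloaded below rounds up to 2 quads, so the search starts at s[8:11] and
// picks the first unused, allocatable SGPR_128 quad that does not overlap the
// PAL GIT pointer register.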
537 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( 538 MachineFunction &MF) const { 539 540 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 541 const SIInstrInfo *TII = ST.getInstrInfo(); 542 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 543 MachineRegisterInfo &MRI = MF.getRegInfo(); 544 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 545 546 assert(MFI->isEntryFunction()); 547 548 Register ScratchRsrcReg = MFI->getScratchRSrcReg(); 549 550 if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) && 551 allStackObjectsAreDead(MF.getFrameInfo()))) 552 return Register(); 553 554 if (ST.hasSGPRInitBug() || 555 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF)) 556 return ScratchRsrcReg; 557 558 // We reserved the last registers for this. Shift it down to the end of those 559 // which were actually used. 560 // 561 // FIXME: It might be safer to use a pseudoregister before replacement. 562 563 // FIXME: We should be able to eliminate unused input registers. We only 564 // cannot do this for the resources required for scratch access. For now we 565 // skip over user SGPRs and may leave unused holes. 566 567 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; 568 ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF); 569 AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded)); 570 571 // Skip the last N reserved elements because they should have already been 572 // reserved for VCC etc. 573 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 574 for (MCPhysReg Reg : AllSGPR128s) { 575 // Pick the first unallocated one. Make sure we don't clobber the other 576 // reserved input we needed. Also for PAL, make sure we don't clobber 577 // the GIT pointer passed in SGPR0 or SGPR8. 578 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && 579 (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) { 580 MRI.replaceRegWith(ScratchRsrcReg, Reg); 581 MFI->setScratchRSrcReg(Reg); 582 return Reg; 583 } 584 } 585 586 return ScratchRsrcReg; 587 } 588 589 static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { 590 return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize(); 591 } 592 593 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, 594 MachineBasicBlock &MBB) const { 595 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); 596 597 // FIXME: If we only have SGPR spills, we won't actually be using scratch 598 // memory since these spill to VGPRs. We should be cleaning up these unused 599 // SGPR spill frame indices somewhere. 600 601 // FIXME: We still have implicit uses on SGPR spill instructions in case they 602 // need to spill to vector memory. It's likely that will not happen, but at 603 // this point it appears we need the setup. This part of the prolog should be 604 // emitted after frame indices are eliminated. 
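  // Illustrative shape of the emitted entry prologue (the registers and the
  // pieces present depend on the subtarget and on what the function uses):
  //   s_mov_b32 s32, <stack_size * scale>  ; SP, if a stack pointer is needed
  //   s_mov_b32 s33, 0                     ; FP, if hasFP()
  //   <flat scratch init>                  ; if flat scratch init is required
  //   <scratch rsrc setup into s[N:N+3]>   ; if not using flat scratch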
605 606 // FIXME: Remove all of the isPhysRegUsed checks 607 608 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 609 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 610 const SIInstrInfo *TII = ST.getInstrInfo(); 611 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 612 MachineRegisterInfo &MRI = MF.getRegInfo(); 613 const Function &F = MF.getFunction(); 614 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 615 616 assert(MFI->isEntryFunction()); 617 618 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( 619 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); 620 621 // We need to do the replacement of the private segment buffer register even 622 // if there are no stack objects. There could be stores to undef or a 623 // constant without an associated object. 624 // 625 // This will return `Register()` in cases where there are no actual 626 // uses of the SRSRC. 627 Register ScratchRsrcReg; 628 if (!ST.enableFlatScratch()) 629 ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); 630 631 // Make the selected register live throughout the function. 632 if (ScratchRsrcReg) { 633 for (MachineBasicBlock &OtherBB : MF) { 634 if (&OtherBB != &MBB) { 635 OtherBB.addLiveIn(ScratchRsrcReg); 636 } 637 } 638 } 639 640 // Now that we have fixed the reserved SRSRC we need to locate the 641 // (potentially) preloaded SRSRC. 642 Register PreloadedScratchRsrcReg; 643 if (ST.isAmdHsaOrMesa(F)) { 644 PreloadedScratchRsrcReg = 645 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); 646 if (ScratchRsrcReg && PreloadedScratchRsrcReg) { 647 // We added live-ins during argument lowering, but since they were not 648 // used they were deleted. We're adding the uses now, so add them back. 649 MRI.addLiveIn(PreloadedScratchRsrcReg); 650 MBB.addLiveIn(PreloadedScratchRsrcReg); 651 } 652 } 653 654 // Debug location must be unknown since the first debug location is used to 655 // determine the end of the prologue. 656 DebugLoc DL; 657 MachineBasicBlock::iterator I = MBB.begin(); 658 659 // We found the SRSRC first because it needs four registers and has an 660 // alignment requirement. If the SRSRC that we found is clobbering with 661 // the scratch wave offset, which may be in a fixed SGPR or a free SGPR 662 // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch 663 // wave offset to a free SGPR. 
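  // For example (hypothetical assignment): if the reserved SRSRC ended up as
  // s[4:7] while the wave offset was preloaded in s6, the loop below copies s6
  // into some unused SGPR outside that quad before s[4:7] is overwritten.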
664 Register ScratchWaveOffsetReg; 665 if (PreloadedScratchWaveOffsetReg && 666 TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) { 667 ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF); 668 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); 669 AllSGPRs = AllSGPRs.slice( 670 std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded)); 671 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 672 for (MCPhysReg Reg : AllSGPRs) { 673 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && 674 !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) { 675 ScratchWaveOffsetReg = Reg; 676 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) 677 .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); 678 break; 679 } 680 } 681 } else { 682 ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg; 683 } 684 assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg); 685 686 if (requiresStackPointerReference(MF)) { 687 Register SPReg = MFI->getStackPtrOffsetReg(); 688 assert(SPReg != AMDGPU::SP_REG); 689 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) 690 .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST)); 691 } 692 693 if (hasFP(MF)) { 694 Register FPReg = MFI->getFrameOffsetReg(); 695 assert(FPReg != AMDGPU::FP_REG); 696 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); 697 } 698 699 bool NeedsFlatScratchInit = 700 MFI->getUserSGPRInfo().hasFlatScratchInit() && 701 (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || 702 (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); 703 704 if ((NeedsFlatScratchInit || ScratchRsrcReg) && 705 PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) { 706 MRI.addLiveIn(PreloadedScratchWaveOffsetReg); 707 MBB.addLiveIn(PreloadedScratchWaveOffsetReg); 708 } 709 710 if (NeedsFlatScratchInit) { 711 emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); 712 } 713 714 if (ScratchRsrcReg) { 715 emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, 716 PreloadedScratchRsrcReg, 717 ScratchRsrcReg, ScratchWaveOffsetReg); 718 } 719 } 720 721 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` 722 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( 723 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 724 const DebugLoc &DL, Register PreloadedScratchRsrcReg, 725 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { 726 727 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 728 const SIInstrInfo *TII = ST.getInstrInfo(); 729 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 730 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 731 const Function &Fn = MF.getFunction(); 732 733 if (ST.isAmdPalOS()) { 734 // The pointer to the GIT is formed from the offset passed in and either 735 // the amdgpu-git-ptr-high function attribute or the top part of the PC 736 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); 737 Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); 738 739 buildGitPtr(MBB, I, DL, TII, Rsrc01); 740 741 // We now have the GIT ptr - now get the scratch descriptor from the entry 742 // at offset 0 (or offset 16 for a compute shader). 
743 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 744 const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM); 745 auto MMO = MF.getMachineMemOperand(PtrInfo, 746 MachineMemOperand::MOLoad | 747 MachineMemOperand::MOInvariant | 748 MachineMemOperand::MODereferenceable, 749 16, Align(4)); 750 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; 751 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); 752 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); 753 BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) 754 .addReg(Rsrc01) 755 .addImm(EncodedOffset) // offset 756 .addImm(0) // cpol 757 .addReg(ScratchRsrcReg, RegState::ImplicitDefine) 758 .addMemOperand(MMO); 759 760 // The driver will always set the SRD for wave 64 (bits 118:117 of 761 // descriptor / bits 22:21 of third sub-reg will be 0b11) 762 // If the shader is actually wave32 we have to modify the const_index_stride 763 // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The 764 // reason the driver does this is that there can be cases where it presents 765 // 2 shaders with different wave size (e.g. VsFs). 766 // TODO: convert to using SCRATCH instructions or multiple SRD buffers 767 if (ST.isWave32()) { 768 const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32); 769 BuildMI(MBB, I, DL, SBitsetB32, Rsrc03) 770 .addImm(21) 771 .addReg(Rsrc03); 772 } 773 } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) { 774 assert(!ST.isAmdHsaOrMesa(Fn)); 775 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); 776 777 Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); 778 Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); 779 780 // Use relocations to get the pointer, and setup the other bits manually. 
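    // Illustration only (the exact constant comes from getScratchRsrcWords23
    // and is subtarget-dependent): the low word typically sets NUM_RECORDS to
    // 0xffffffff while the high word carries the buffer format/stride flags;
    // the two s_mov_b32 further below split this 64-bit constant across
    // Rsrc2 and Rsrc3.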
781 uint64_t Rsrc23 = TII->getScratchRsrcWords23(); 782 783 if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) { 784 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); 785 786 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { 787 const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); 788 789 BuildMI(MBB, I, DL, Mov64, Rsrc01) 790 .addReg(MFI->getImplicitBufferPtrUserSGPR()) 791 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 792 } else { 793 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); 794 795 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 796 auto MMO = MF.getMachineMemOperand( 797 PtrInfo, 798 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | 799 MachineMemOperand::MODereferenceable, 800 8, Align(4)); 801 BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) 802 .addReg(MFI->getImplicitBufferPtrUserSGPR()) 803 .addImm(0) // offset 804 .addImm(0) // cpol 805 .addMemOperand(MMO) 806 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 807 808 MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); 809 MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); 810 } 811 } else { 812 Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); 813 Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); 814 815 BuildMI(MBB, I, DL, SMovB32, Rsrc0) 816 .addExternalSymbol("SCRATCH_RSRC_DWORD0") 817 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 818 819 BuildMI(MBB, I, DL, SMovB32, Rsrc1) 820 .addExternalSymbol("SCRATCH_RSRC_DWORD1") 821 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 822 } 823 824 BuildMI(MBB, I, DL, SMovB32, Rsrc2) 825 .addImm(Rsrc23 & 0xffffffff) 826 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 827 828 BuildMI(MBB, I, DL, SMovB32, Rsrc3) 829 .addImm(Rsrc23 >> 32) 830 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 831 } else if (ST.isAmdHsaOrMesa(Fn)) { 832 assert(PreloadedScratchRsrcReg); 833 834 if (ScratchRsrcReg != PreloadedScratchRsrcReg) { 835 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) 836 .addReg(PreloadedScratchRsrcReg, RegState::Kill); 837 } 838 } 839 840 // Add the scratch wave offset into the scratch RSRC. 841 // 842 // We only want to update the first 48 bits, which is the base address 843 // pointer, without touching the adjacent 16 bits of flags. We know this add 844 // cannot carry-out from bit 47, otherwise the scratch allocation would be 845 // impossible to fit in the 48-bit global address space. 846 // 847 // TODO: Evaluate if it is better to just construct an SRD using the flat 848 // scratch init and some constants rather than update the one we are passed. 849 Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); 850 Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); 851 852 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in 853 // the kernel body via inreg arguments. 854 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0) 855 .addReg(ScratchRsrcSub0) 856 .addReg(ScratchWaveOffsetReg) 857 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 858 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1) 859 .addReg(ScratchRsrcSub1) 860 .addImm(0) 861 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 862 Addc->getOperand(3).setIsDead(); // Mark SCC as dead. 
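  // Worked example (illustrative values): if the resource base held in
  // Rsrc0/Rsrc1 is 0x0000'1234'5678 and ScratchWaveOffsetReg holds 0x1000, the
  // add/addc pair above produces 0x0000'1234'6678 in bits [47:0] while bits
  // [63:48] (stride and swizzle flags) are preserved, because the add cannot
  // carry past bit 47.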
863 } 864 865 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { 866 switch (ID) { 867 case TargetStackID::Default: 868 case TargetStackID::NoAlloc: 869 case TargetStackID::SGPRSpill: 870 return true; 871 case TargetStackID::ScalableVector: 872 case TargetStackID::WasmLocal: 873 return false; 874 } 875 llvm_unreachable("Invalid TargetStackID::Value"); 876 } 877 878 // Activate only the inactive lanes when \p EnableInactiveLanes is true. 879 // Otherwise, activate all lanes. It returns the saved exec. 880 static Register buildScratchExecCopy(LiveRegUnits &LiveUnits, 881 MachineFunction &MF, 882 MachineBasicBlock &MBB, 883 MachineBasicBlock::iterator MBBI, 884 const DebugLoc &DL, bool IsProlog, 885 bool EnableInactiveLanes) { 886 Register ScratchExecCopy; 887 MachineRegisterInfo &MRI = MF.getRegInfo(); 888 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 889 const SIInstrInfo *TII = ST.getInstrInfo(); 890 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 891 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 892 893 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); 894 895 ScratchExecCopy = findScratchNonCalleeSaveRegister( 896 MRI, LiveUnits, *TRI.getWaveMaskRegClass()); 897 if (!ScratchExecCopy) 898 report_fatal_error("failed to find free scratch register"); 899 900 LiveUnits.addReg(ScratchExecCopy); 901 902 const unsigned SaveExecOpc = 903 ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32 904 : AMDGPU::S_OR_SAVEEXEC_B32) 905 : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64 906 : AMDGPU::S_OR_SAVEEXEC_B64); 907 auto SaveExec = 908 BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1); 909 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. 910 911 return ScratchExecCopy; 912 } 913 914 void SIFrameLowering::emitCSRSpillStores( 915 MachineFunction &MF, MachineBasicBlock &MBB, 916 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, 917 Register FrameReg, Register FramePtrRegScratchCopy) const { 918 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 919 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 920 const SIInstrInfo *TII = ST.getInstrInfo(); 921 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 922 923 // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch 924 // registers. However, save all lanes of callee-saved VGPRs. Due to this, we 925 // might end up flipping the EXEC bits twice. 926 Register ScratchExecCopy; 927 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; 928 FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); 929 if (!WWMScratchRegs.empty()) 930 ScratchExecCopy = 931 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, 932 /*IsProlog*/ true, /*EnableInactiveLanes*/ true); 933 934 auto StoreWWMRegisters = 935 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { 936 for (const auto &Reg : WWMRegs) { 937 Register VGPR = Reg.first; 938 int FI = Reg.second; 939 buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL, 940 VGPR, FI, FrameReg); 941 } 942 }; 943 944 StoreWWMRegisters(WWMScratchRegs); 945 if (!WWMCalleeSavedRegs.empty()) { 946 if (ScratchExecCopy) { 947 unsigned MovOpc = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 948 BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); 949 } else { 950 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, 951 /*IsProlog*/ true, 952 /*EnableInactiveLanes*/ false); 953 } 954 } 955 956 StoreWWMRegisters(WWMCalleeSavedRegs); 957 if (ScratchExecCopy) { 958 // FIXME: Split block and make terminator. 959 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 960 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) 961 .addReg(ScratchExecCopy, RegState::Kill); 962 LiveUnits.addReg(ScratchExecCopy); 963 } 964 965 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 966 967 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { 968 // Special handle FP spill: 969 // Skip if FP is saved to a scratch SGPR, the save has already been emitted. 970 // Otherwise, FP has been moved to a temporary register and spill it 971 // instead. 972 Register Reg = 973 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first; 974 if (!Reg) 975 continue; 976 977 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, 978 LiveUnits, FrameReg); 979 SB.save(); 980 } 981 982 // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make 983 // such scratch registers live throughout the function. 984 SmallVector<Register, 1> ScratchSGPRs; 985 FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs); 986 if (!ScratchSGPRs.empty()) { 987 for (MachineBasicBlock &MBB : MF) { 988 for (MCPhysReg Reg : ScratchSGPRs) 989 MBB.addLiveIn(Reg); 990 991 MBB.sortUniqueLiveIns(); 992 } 993 if (!LiveUnits.empty()) { 994 for (MCPhysReg Reg : ScratchSGPRs) 995 LiveUnits.addReg(Reg); 996 } 997 } 998 } 999 1000 void SIFrameLowering::emitCSRSpillRestores( 1001 MachineFunction &MF, MachineBasicBlock &MBB, 1002 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, 1003 Register FrameReg, Register FramePtrRegScratchCopy) const { 1004 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1005 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1006 const SIInstrInfo *TII = ST.getInstrInfo(); 1007 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 1008 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1009 1010 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { 1011 // Special handle FP restore: 1012 // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore 1013 // the FP value to a temporary register. The frame pointer should be 1014 // overwritten only at the end when all other spills are restored from 1015 // current frame. 1016 Register Reg = 1017 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first; 1018 if (!Reg) 1019 continue; 1020 1021 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, 1022 LiveUnits, FrameReg); 1023 SB.restore(); 1024 } 1025 1026 // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the 1027 // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to 1028 // this, we might end up flipping the EXEC bits twice. 
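  // Illustrative wave64 shape of the code below (the SGPR pair is a
  // placeholder):
  //   s_xor_saveexec_b64 s[N:N+1], -1   ; save exec, run on the inactive lanes
  //   <reload WWM scratch VGPRs>
  //   s_mov_b64 exec, -1                ; then enable all lanes
  //   <reload WWM callee-saved VGPRs>
  //   s_mov_b64 exec, s[N:N+1]          ; restore the original exec mask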
1029 Register ScratchExecCopy; 1030 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; 1031 FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); 1032 if (!WWMScratchRegs.empty()) 1033 ScratchExecCopy = 1034 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, 1035 /*IsProlog*/ false, /*EnableInactiveLanes*/ true); 1036 1037 auto RestoreWWMRegisters = 1038 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { 1039 for (const auto &Reg : WWMRegs) { 1040 Register VGPR = Reg.first; 1041 int FI = Reg.second; 1042 buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL, 1043 VGPR, FI, FrameReg); 1044 } 1045 }; 1046 1047 RestoreWWMRegisters(WWMScratchRegs); 1048 if (!WWMCalleeSavedRegs.empty()) { 1049 if (ScratchExecCopy) { 1050 unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 1051 BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); 1052 } else { 1053 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, 1054 /*IsProlog*/ false, 1055 /*EnableInactiveLanes*/ false); 1056 } 1057 } 1058 1059 RestoreWWMRegisters(WWMCalleeSavedRegs); 1060 if (ScratchExecCopy) { 1061 // FIXME: Split block and make terminator. 1062 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 1063 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) 1064 .addReg(ScratchExecCopy, RegState::Kill); 1065 } 1066 } 1067 1068 void SIFrameLowering::emitPrologue(MachineFunction &MF, 1069 MachineBasicBlock &MBB) const { 1070 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1071 if (FuncInfo->isEntryFunction()) { 1072 emitEntryFunctionPrologue(MF, MBB); 1073 return; 1074 } 1075 1076 MachineFrameInfo &MFI = MF.getFrameInfo(); 1077 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1078 const SIInstrInfo *TII = ST.getInstrInfo(); 1079 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 1080 MachineRegisterInfo &MRI = MF.getRegInfo(); 1081 1082 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 1083 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1084 Register BasePtrReg = 1085 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); 1086 LiveRegUnits LiveUnits; 1087 1088 MachineBasicBlock::iterator MBBI = MBB.begin(); 1089 // DebugLoc must be unknown since the first instruction with DebugLoc is used 1090 // to determine the end of the prologue. 1091 DebugLoc DL; 1092 1093 if (FuncInfo->isChainFunction()) { 1094 // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but 1095 // are free to set one up if they need it. 1096 bool UseSP = requiresStackPointerReference(MF); 1097 if (UseSP) { 1098 assert(StackPtrReg != AMDGPU::SP_REG); 1099 1100 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg) 1101 .addImm(MFI.getStackSize() * getScratchScaleFactor(ST)); 1102 } 1103 } 1104 1105 bool HasFP = false; 1106 bool HasBP = false; 1107 uint32_t NumBytes = MFI.getStackSize(); 1108 uint32_t RoundedSize = NumBytes; 1109 1110 if (TRI.hasStackRealignment(MF)) 1111 HasFP = true; 1112 1113 Register FramePtrRegScratchCopy; 1114 if (!HasFP && !hasFP(MF)) { 1115 // Emit the CSR spill stores with SP base register. 1116 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, 1117 FuncInfo->isChainFunction() ? Register() : StackPtrReg, 1118 FramePtrRegScratchCopy); 1119 } else { 1120 // CSR spill stores will use FP as base register. 
1121 Register SGPRForFPSaveRestoreCopy = 1122 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); 1123 1124 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); 1125 if (SGPRForFPSaveRestoreCopy) { 1126 // Copy FP to the scratch register now and emit the CFI entry. It avoids 1127 // the extra FP copy needed in the other two cases when FP is spilled to 1128 // memory or to a VGPR lane. 1129 PrologEpilogSGPRSpillBuilder SB( 1130 FramePtrReg, 1131 FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI, 1132 DL, TII, TRI, LiveUnits, FramePtrReg); 1133 SB.save(); 1134 LiveUnits.addReg(SGPRForFPSaveRestoreCopy); 1135 } else { 1136 // Copy FP into a new scratch register so that its previous value can be 1137 // spilled after setting up the new frame. 1138 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( 1139 MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass); 1140 if (!FramePtrRegScratchCopy) 1141 report_fatal_error("failed to find free scratch register"); 1142 1143 LiveUnits.addReg(FramePtrRegScratchCopy); 1144 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy) 1145 .addReg(FramePtrReg); 1146 } 1147 } 1148 1149 if (HasFP) { 1150 const unsigned Alignment = MFI.getMaxAlign().value(); 1151 1152 RoundedSize += Alignment; 1153 if (LiveUnits.empty()) { 1154 LiveUnits.init(TRI); 1155 LiveUnits.addLiveIns(MBB); 1156 } 1157 1158 // s_add_i32 s33, s32, NumBytes 1159 // s_and_b32 s33, s33, 0b111...0000 1160 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg) 1161 .addReg(StackPtrReg) 1162 .addImm((Alignment - 1) * getScratchScaleFactor(ST)) 1163 .setMIFlag(MachineInstr::FrameSetup); 1164 auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) 1165 .addReg(FramePtrReg, RegState::Kill) 1166 .addImm(-Alignment * getScratchScaleFactor(ST)) 1167 .setMIFlag(MachineInstr::FrameSetup); 1168 And->getOperand(3).setIsDead(); // Mark SCC as dead. 1169 FuncInfo->setIsStackRealigned(true); 1170 } else if ((HasFP = hasFP(MF))) { 1171 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 1172 .addReg(StackPtrReg) 1173 .setMIFlag(MachineInstr::FrameSetup); 1174 } 1175 1176 // If FP is used, emit the CSR spills with FP base register. 1177 if (HasFP) { 1178 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg, 1179 FramePtrRegScratchCopy); 1180 if (FramePtrRegScratchCopy) 1181 LiveUnits.removeReg(FramePtrRegScratchCopy); 1182 } 1183 1184 // If we need a base pointer, set it up here. It's whatever the value of 1185 // the stack pointer is at this point. Any variable size objects will be 1186 // allocated after this, so we can still use the base pointer to reference 1187 // the incoming arguments. 1188 if ((HasBP = TRI.hasBasePointer(MF))) { 1189 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) 1190 .addReg(StackPtrReg) 1191 .setMIFlag(MachineInstr::FrameSetup); 1192 } 1193 1194 if (HasFP && RoundedSize != 0) { 1195 auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) 1196 .addReg(StackPtrReg) 1197 .addImm(RoundedSize * getScratchScaleFactor(ST)) 1198 .setMIFlag(MachineInstr::FrameSetup); 1199 Add->getOperand(3).setIsDead(); // Mark SCC as dead. 1200 } 1201 1202 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg); 1203 (void)FPSaved; 1204 assert((!HasFP || FPSaved) && 1205 "Needed to save FP but didn't save it anywhere"); 1206 1207 // If we allow spilling to AGPRs we may have saved FP but then spill 1208 // everything into AGPRs instead of the stack. 
1209 assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) && 1210 "Saved FP but didn't need it"); 1211 1212 bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg); 1213 (void)BPSaved; 1214 assert((!HasBP || BPSaved) && 1215 "Needed to save BP but didn't save it anywhere"); 1216 1217 assert((HasBP || !BPSaved) && "Saved BP but didn't need it"); 1218 } 1219 1220 void SIFrameLowering::emitEpilogue(MachineFunction &MF, 1221 MachineBasicBlock &MBB) const { 1222 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1223 if (FuncInfo->isEntryFunction()) 1224 return; 1225 1226 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1227 const SIInstrInfo *TII = ST.getInstrInfo(); 1228 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 1229 MachineRegisterInfo &MRI = MF.getRegInfo(); 1230 LiveRegUnits LiveUnits; 1231 // Get the insert location for the epilogue. If there were no terminators in 1232 // the block, get the last instruction. 1233 MachineBasicBlock::iterator MBBI = MBB.end(); 1234 DebugLoc DL; 1235 if (!MBB.empty()) { 1236 MBBI = MBB.getLastNonDebugInstr(); 1237 if (MBBI != MBB.end()) 1238 DL = MBBI->getDebugLoc(); 1239 1240 MBBI = MBB.getFirstTerminator(); 1241 } 1242 1243 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1244 uint32_t NumBytes = MFI.getStackSize(); 1245 uint32_t RoundedSize = FuncInfo->isStackRealigned() 1246 ? NumBytes + MFI.getMaxAlign().value() 1247 : NumBytes; 1248 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 1249 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1250 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg); 1251 1252 Register FramePtrRegScratchCopy; 1253 Register SGPRForFPSaveRestoreCopy = 1254 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); 1255 if (FPSaved) { 1256 // CSR spill restores should use FP as base register. If 1257 // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP 1258 // into a new scratch register and copy to FP later when other registers are 1259 // restored from the current stack frame. 1260 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); 1261 if (SGPRForFPSaveRestoreCopy) { 1262 LiveUnits.addReg(SGPRForFPSaveRestoreCopy); 1263 } else { 1264 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( 1265 MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass); 1266 if (!FramePtrRegScratchCopy) 1267 report_fatal_error("failed to find free scratch register"); 1268 1269 LiveUnits.addReg(FramePtrRegScratchCopy); 1270 } 1271 1272 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg, 1273 FramePtrRegScratchCopy); 1274 } 1275 1276 if (RoundedSize != 0 && hasFP(MF)) { 1277 auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) 1278 .addReg(StackPtrReg) 1279 .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST))) 1280 .setMIFlag(MachineInstr::FrameDestroy); 1281 Add->getOperand(3).setIsDead(); // Mark SCC as dead. 1282 } 1283 1284 if (FPSaved) { 1285 // Insert the copy to restore FP. 1286 Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy 1287 : FramePtrRegScratchCopy; 1288 MachineInstrBuilder MIB = 1289 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 1290 .addReg(SrcReg); 1291 if (SGPRForFPSaveRestoreCopy) 1292 MIB.setMIFlag(MachineInstr::FrameDestroy); 1293 } else { 1294 // Insert the CSR spill restores with SP as the base register. 
1295 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg, 1296 FramePtrRegScratchCopy); 1297 } 1298 } 1299 1300 #ifndef NDEBUG 1301 static bool allSGPRSpillsAreDead(const MachineFunction &MF) { 1302 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1303 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1304 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 1305 I != E; ++I) { 1306 if (!MFI.isDeadObjectIndex(I) && 1307 MFI.getStackID(I) == TargetStackID::SGPRSpill && 1308 !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) { 1309 return false; 1310 } 1311 } 1312 1313 return true; 1314 } 1315 #endif 1316 1317 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, 1318 int FI, 1319 Register &FrameReg) const { 1320 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); 1321 1322 FrameReg = RI->getFrameRegister(MF); 1323 return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI)); 1324 } 1325 1326 void SIFrameLowering::processFunctionBeforeFrameFinalized( 1327 MachineFunction &MF, 1328 RegScavenger *RS) const { 1329 MachineFrameInfo &MFI = MF.getFrameInfo(); 1330 1331 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1332 const SIInstrInfo *TII = ST.getInstrInfo(); 1333 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1334 MachineRegisterInfo &MRI = MF.getRegInfo(); 1335 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1336 1337 // Allocate spill slots for WWM reserved VGPRs. 1338 // For chain functions, we only need to do this if we have calls to 1339 // llvm.amdgcn.cs.chain. 1340 bool IsChainWithoutCalls = 1341 FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall(); 1342 if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) { 1343 for (Register Reg : FuncInfo->getWWMReservedRegs()) { 1344 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); 1345 FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC), 1346 TRI->getSpillAlign(*RC)); 1347 } 1348 } 1349 1350 const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() 1351 && EnableSpillVGPRToAGPR; 1352 1353 if (SpillVGPRToAGPR) { 1354 // To track the spill frame indices handled in this pass. 1355 BitVector SpillFIs(MFI.getObjectIndexEnd(), false); 1356 BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false); 1357 1358 bool SeenDbgInstr = false; 1359 1360 for (MachineBasicBlock &MBB : MF) { 1361 for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { 1362 int FrameIndex; 1363 if (MI.isDebugInstr()) 1364 SeenDbgInstr = true; 1365 1366 if (TII->isVGPRSpill(MI)) { 1367 // Try to eliminate stack used by VGPR spills before frame 1368 // finalization. 1369 unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 1370 AMDGPU::OpName::vaddr); 1371 int FI = MI.getOperand(FIOp).getIndex(); 1372 Register VReg = 1373 TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); 1374 if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, 1375 TRI->isAGPR(MRI, VReg))) { 1376 assert(RS != nullptr); 1377 RS->enterBasicBlockEnd(MBB); 1378 RS->backward(std::next(MI.getIterator())); 1379 TRI->eliminateFrameIndex(MI, 0, FIOp, RS); 1380 SpillFIs.set(FI); 1381 continue; 1382 } 1383 } else if (TII->isStoreToStackSlot(MI, FrameIndex) || 1384 TII->isLoadFromStackSlot(MI, FrameIndex)) 1385 if (!MFI.isFixedObjectIndex(FrameIndex)) 1386 NonVGPRSpillFIs.set(FrameIndex); 1387 } 1388 } 1389 1390 // Stack slot coloring may assign different objects to the same stack slot. 
    // If not, then the VGPR to AGPR spill slot is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead update it with the
        // correct register value. But it is not clear that the register value
        // alone is enough.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
              !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
              SpillFIs[MI.getOperand(0).getIndex()]) {
            MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot.
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
    }
  }
}

void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we initially reserved the highest available VGPR for the AGPR
    // copy. Now that RA is done, check if there exists an unused VGPR that is
    // lower than the one reserved before RA. If one exists, use it for the
    // AGPR copy instead.
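    // For example (hypothetical registers): if v255 was reserved before RA and
    // v32 turns out to be unused after RA, switch the AGPR-copy register to
    // v32 and re-reserve it.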
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Reserve the newly identified VGPR for the AGPR copy. Reserved
      // registers should already be frozen at this point, so we can avoid
      // calling MRI.freezeReservedRegs and just use MRI.reserveReg.
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.reserveReg(UnusedLowVGPR, TRI);
    }
  }
  // We initially reserved the highest available SGPR pair for long branches;
  // now, after RA, shift down to a lower unused pair if one exists.
  Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
  Register UnusedLowSGPR =
      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
  // If LongBranchReservedReg is null, we never found a long branch and never
  // reserved a register to begin with, so there is nothing to shift down. If
  // UnusedLowSGPR is null, there is no lower register available, so keep the
  // one originally reserved.
  if (LongBranchReservedReg && UnusedLowSGPR) {
    FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
    MRI.reserveReg(UnusedLowSGPR, TRI);
  }
}

// Special SGPR spills, such as those needed for the FP, BP, or any other
// reserved registers, are delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs,
    bool NeedExecCopyReservedReg) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  LiveRegUnits LiveUnits;
  LiveUnits.init(*TRI);
  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveUnits.addReg(CSRegs[I]);

  const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();

  if (NeedExecCopyReservedReg) {
    Register ReservedReg = MFI->getSGPRForEXECCopy();
    assert(ReservedReg && "Should have reserved an SGPR for EXEC copy.");
    Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
    if (UnusedScratchReg) {
      // If we found an unused scratch SGPR, reserve that register itself for
      // the EXEC copy; no spill is needed in that case.
      MFI->setSGPRForEXECCopy(UnusedScratchReg);
      LiveUnits.addReg(UnusedScratchReg);
    } else {
      // Needs spill.
      assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) &&
             "Re-reserving spill slot for EXEC copy register");
      getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedReg, RC,
                                     /*IncludeScratchCopy=*/false);
    }
  }

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
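  //
  // As an illustration of why the prediction is needed: a function that makes
  // calls and will spill CSR VGPRs, but whose MachineFrameInfo currently has
  // no live stack objects, already needs an FP, yet hasFP (which keys off the
  // current stack size for functions with calls) cannot see the spill slots
  // that are about to be created.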
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
  }

  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
  }
}

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If this is a function with the amdgpu_cs_chain[_preserve] calling
  // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
  // we don't need to save and restore anything.
  if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
    return;

  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool NeedExecCopyReservedReg = false;

  MachineInstr *ReturnMI = nullptr;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // WRITELANE instructions used for SGPR spills can overwrite the inactive
      // lanes of VGPRs, so the callee must spill and restore those VGPRs even
      // if they are marked as caller-saved.

      // TODO: Handle this elsewhere at an early point. Walking through all
      // MBBs here would be a bad heuristic. A better way would be to call
      // allocateWWMSpill during the regalloc pipeline whenever a physical
      // register is allocated for the intended virtual registers.
      if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
        MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
      else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
        MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
      else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
        NeedExecCopyReservedReg = true;
      else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
               MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
               (MFI->isChainFunction() &&
                TII->isChainCallOpcode(MI.getOpcode()))) {
        // We expect all returns to be the same size.
        assert(!ReturnMI ||
               (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
                count_if(ReturnMI->operands(),
                         [](auto Op) { return Op.isReg(); })));
        ReturnMI = &MI;
      }
    }
  }

  // Remove any VGPRs used in the return value because these do not need to be
  // saved. This prevents CSR restore from clobbering return VGPRs.
  if (ReturnMI) {
    for (auto &Op : ReturnMI->operands()) {
      if (Op.isReg())
        SavedVGPRs.reset(Op.getReg());
    }
  }

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
  // On gfx908 there are no AGPR loads and stores, so spilling them would also
  // require a temporary VGPR.
  if (!ST.hasGFX90AInsts())
    SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());

  determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);

  // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
  // allow the default insertion to handle them.
  for (auto &Reg : MFI->getWWMSpills())
    SavedVGPRs.reset(Reg.first);

  // Mark all lane VGPRs as BB LiveIns.
  for (MachineBasicBlock &MBB : MF) {
    for (auto &Reg : MFI->getWWMSpills())
      MBB.addLiveIn(Reg.first);

    MBB.sortUniqueLiveIns();
  }
}

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());

  const BitVector AllSavedRegs = SavedRegs;
  SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

  // We have to anticipate introducing CSR VGPR spills, or a spill of the
  // caller-saved VGPR reserved for SGPR spills, since we now always create a
  // stack entry for it even if there are no other stack objects: an FP is
  // required whenever there is both a call and a stack. A VGPR is allocated
  // for SGPR spills whenever there are any SGPR spills, whether they are CSR
  // spills or not.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const bool WillHaveFP =
      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

  // FP will be specially managed like SP.
  if (WillHaveFP || hasFP(MF))
    SavedRegs.reset(MFI->getFrameOffsetReg());

  // The return address use in the return instruction is hidden by the
  // SI_RETURN pseudo. Since IPRA computes actual register usage and does not
  // use the CSR list, clobbering of the return address by function calls
  // (D117243) or otherwise (D120922) is not seen by IPRA's register usage
  // collection. Explicitly marking the sub-registers here ensures the return
  // address is saved and restored in those scenarios.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  Register RetAddrReg = TRI->getReturnAddressReg(MF);
  if (!MFI->isEntryFunction() &&
      (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
  }
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!
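
  // If a scratch SGPR copy was chosen earlier as the save location for the FP
  // and/or BP, redirect those CalleeSavedInfo entries to that register so they
  // are saved by a copy rather than to a stack slot. Returning false
  // afterwards leaves spill slot assignment for the remaining CSRs to the
  // generic code.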
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg = RI->getBaseRegister();
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  Register SGPRForBPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
  if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
    return false;

  unsigned NumModifiedRegs = 0;

  if (SGPRForFPSaveRestoreCopy)
    NumModifiedRegs++;
  if (SGPRForBPSaveRestoreCopy)
    NumModifiedRegs++;

  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(SGPRForFPSaveRestoreCopy);
      if (--NumModifiedRegs)
        break;
    } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs)
        break;
    }
  }

  return false;
}

bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
    const MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  uint64_t EstStackSize = MFI.estimateStackSize(MF);
  uint64_t MaxOffset = EstStackSize - 1;

  // We need the emergency stack slots to be allocated in range of the
  // MUBUF/flat scratch immediate offset from the base register, so assign
  // these first at the incoming SP position.
  //
  // TODO: We could try sorting the objects to find a hole in the first bytes
  // rather than allocating as close as possible. This could save a lot of
  // space on frames with alignment requirements.
  if (ST.enableFlatScratch()) {
    if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                               SIInstrFlags::FlatScratch))
      return false;
  } else {
    if (TII->isLegalMUBUFImmOffset(MaxOffset))
      return false;
  }

  return true;
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF,
    MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

    Amount *= getScratchScaleFactor(ST);
    if (IsDestroy)
      Amount = -Amount;
    auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
                   .addReg(SPReg)
                   .addImm(Amount);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

/// Returns true if the frame will require a reference to the stack pointer.
///
/// This is the set of conditions common to setting up the stack pointer in a
/// kernel, and for using a frame pointer in a callable function.
///
/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
/// references SP.
static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
  return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
}

// The FP for kernels is always known to be 0, so we never really need to set
// up an explicit register for it. However, DisableFramePointerElim will force
// us to use a register for it.
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry & chain functions we can use an immediate offset in most cases,
  // so the presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
    // All offsets are unsigned, so they need to be addressed in the same
    // direction as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
         MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
             MF) ||
         MF.getTarget().Options.DisableFramePointerElim(MF);
}

// This is essentially a reduced version of hasFP for entry functions. Since
// the stack pointer is known to be 0 on entry to kernels, we never really need
// an FP register. We may need to initialize the stack pointer depending on the
// frame properties, which logically overlaps many of the cases where an
// ordinary function would require an FP.
// Also used for chain functions. While not technically entry functions, chain
// functions may need to set up a stack pointer in some situations.
bool SIFrameLowering::requiresStackPointerReference(
    const MachineFunction &MF) const {
  // Callable functions always require a stack pointer reference.
  assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
          MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
         "only expected to call this for entry points and chain functions");

  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Entry points ordinarily don't need to initialize SP. We have to set it up
  // for callees if there are any. Also note tail calls are impossible/don't
  // make any sense for kernels.
  if (MFI.hasCalls())
    return true;

  // We still need to initialize the SP if we're doing anything weird that
  // references the SP, like variable sized stack objects.
  return frameTriviallyRequiresSP(MFI);
}