//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableSpillVGPRToAGPR(
    "amdgpu-spill-vgpr-to-agpr",
    cl::desc("Enable spilling VGPRs to AGPRs"),
    cl::ReallyHidden,
    cl::init(true));

// Find a register matching \p RC from \p LiveUnits which is unused and
// available throughout the function. On failure, returns AMDGPU::NoRegister.
// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
// MCRegisters. This should reduce the number of iterations and avoid redundant
// checking.
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
                                     const LiveRegUnits &LiveUnits,
                                     const TargetRegisterClass &RC) {
  for (MCRegister Reg : RC) {
    if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
        !MRI.isReserved(Reg))
      return Reg;
  }
  return MCRegister();
}

// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(
    MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
    const TargetRegisterClass &RC, bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveUnits.addReg(CSRegs[i]);

  // We are looking for a register that can be used throughout the entire
  // function, so any use is unacceptable.
  if (Unused)
    return findUnusedRegister(MRI, LiveUnits, RC);

  for (MCRegister Reg : RC) {
    if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
      return Reg;
  }

  return MCRegister();
}
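// For orientation: the prolog/epilog SGPR save logic below tries three
// strategies in order (the numbered comments 1/2/3 in
// getVGPRSpillLaneOrTempRegister): copy the SGPR to a free scratch SGPR, else
// spill it to a physical VGPR lane, else spill it to scratch memory.
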
/// Query target location for spilling SGPRs
/// \p IncludeScratchCopy : Also look for free scratch SGPRs
static void getVGPRSpillLaneOrTempRegister(
    MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
    const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
    bool IncludeScratchCopy = true) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);

  // We need to save and restore the given SGPR.

  Register ScratchSGPR;
  // 1: Try to save the given register into an unused scratch SGPR. The
  // LiveUnits should have all the callee saved registers marked as used. For
  // certain cases we skip copy to scratch SGPR.
  if (IncludeScratchCopy)
    ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);

  if (!ScratchSGPR) {
    int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
                                         TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() &&
        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
                                         /*IsPrologEpilog=*/true)) {
      // 2: There's no free scratch SGPR to save the register to, so we're
      // forced to use a VGPR lane (possibly taking another VGPR) for the
      // spill.
      MFI->addToPrologEpilogSGPRSpills(
          SGPR, PrologEpilogSGPRSaveRestoreInfo(
                    SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

      LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
                 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                        << '\n';);
    } else {
      // Remove dead <FI> index
      MF.getFrameInfo().RemoveStackObject(FI);
      // 3: If all else fails, spill the register to memory.
      FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
      MFI->addToPrologEpilogSGPRSpills(
          SGPR,
          PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
                        << printReg(SGPR, TRI) << '\n');
    }
  } else {
    MFI->addToPrologEpilogSGPRSpills(
        SGPR, PrologEpilogSGPRSaveRestoreInfo(
                  SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
    LiveUnits.addReg(ScratchSGPR);
    LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
                      << printReg(ScratchSGPR, TRI) << '\n');
  }
}
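// A rough sketch of the two store forms chosen below, in MIR-like notation
// (illustrative only; the exact operands are filled in by
// buildSpillLoadStore):
//   SCRATCH_STORE_DWORD_SADDR %vgpr, %frame_sgpr, offset         ; flat scratch
//   BUFFER_STORE_DWORD_OFFSET %vgpr, %srsrc, %frame_sgpr, offset ; MUBUF
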
// We need to emit stack operations specially here because the prologue and
// epilogue use a different frame register than the one getFrameRegister would
// return for the rest of the function.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LiveRegUnits &LiveUnits, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             Register SpillReg, int FI, Register FrameReg,
                             int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  LiveUnits.addReg(SpillReg);
  bool IsKill = !MBB.isLiveIn(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
  if (IsKill)
    LiveUnits.removeReg(SpillReg);
}

static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LiveRegUnits &LiveUnits, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, Register SpillReg, int FI,
                               Register FrameReg, int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
}

static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
      .addReg(GitPtrLo);
}

static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
                          const SIMachineFunctionInfo *FuncInfo,
                          MachineFunction &MF, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveUnits.empty()) {
    LiveUnits.init(TRI);
    if (IsProlog) {
      LiveUnits.addLiveIns(MBB);
    } else {
      // In epilog.
      LiveUnits.addLiveOuts(MBB);
      LiveUnits.stepBackward(*MBBI);
    }
  }
}
namespace llvm {

// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
// BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
class PrologEpilogSGPRSpillBuilder {
  MachineBasicBlock::iterator MI;
  MachineBasicBlock &MBB;
  MachineFunction &MF;
  const GCNSubtarget &ST;
  MachineFrameInfo &MFI;
  SIMachineFunctionInfo *FuncInfo;
  const SIInstrInfo *TII;
  const SIRegisterInfo &TRI;
  Register SuperReg;
  const PrologEpilogSGPRSaveRestoreInfo SI;
  LiveRegUnits &LiveUnits;
  const DebugLoc &DL;
  Register FrameReg;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  unsigned EltSize = 4;

  void saveToMemory(const int FI) const {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SubReg);

      buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
                       FI, FrameReg, DwordOff);
      DwordOff += 4;
    }
  }

  void saveToVGPRLane(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
              Spill[I].VGPR)
          .addReg(SubReg)
          .addImm(Spill[I].Lane)
          .addReg(Spill[I].VGPR, RegState::Undef);
    }
  }

  void copyToScratchSGPR(Register DstReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(SuperReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  void restoreFromMemory(const int FI) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));

      buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
                         TmpVGPR, FI, FrameReg, DwordOff);
      MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
          .addReg(TmpVGPR, RegState::Kill);
      DwordOff += 4;
    }
  }

  void restoreFromVGPRLane(const int FI) {
    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
          .addReg(Spill[I].VGPR)
          .addImm(Spill[I].Lane);
    }
  }

  void copyFromScratchSGPR(Register SrcReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
        .addReg(SrcReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

public:
  PrologEpilogSGPRSpillBuilder(Register Reg,
                               const PrologEpilogSGPRSaveRestoreInfo SI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, const SIInstrInfo *TII,
                               const SIRegisterInfo &TRI,
                               LiveRegUnits &LiveUnits, Register FrameReg)
      : MI(MI), MBB(MBB), MF(*MBB.getParent()),
        ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
        FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
        FrameReg(FrameReg) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  }

  void save() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return saveToMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return saveToVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyToScratchSGPR(SI.getReg());
    }
  }

  void restore() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return restoreFromMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return restoreFromVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyFromScratchSGPR(SI.getReg());
    }
  }
};

} // namespace llvm
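// A minimal usage sketch (the real call sites are in emitCSRSpillStores and
// emitCSRSpillRestores below):
//   PrologEpilogSGPRSpillBuilder SB(Reg, Info, MBB, MBBI, DL, TII, TRI,
//                                   LiveUnits, FrameReg);
//   SB.save();    // in the prologue
//   SB.restore(); // in the epilogue
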
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LiveRegUnits LiveUnits;
    LiveUnits.init(*TRI);
    LiveUnits.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
          MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
                   .addReg(FlatScrInitHi)
                   .addImm(0xffff);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else {
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }
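  // On targets where flat_scratch is a pointer (GFX9+), the code below folds
  // the wave offset into the base; on GFX10+ the result is written via
  // s_setreg. A rough sketch of the GFX10+ sequence (illustrative only):
  //   s_add_u32  lo, lo, scratch_wave_offset
  //   s_addc_u32 hi, hi, 0
  //   s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO, 0, 32), lo
  //   s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI, 0, 32), hi
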
  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
          .addReg(FlatScrInitLo)
          .addReg(ScratchWaveOffsetReg);
      auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                          FlatScrInitHi)
                      .addReg(FlatScrInitHi)
                      .addImm(0);
      Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

      using namespace AMDGPU::Hwreg;
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitLo)
          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitHi)
          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
    auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                        AMDGPU::FLAT_SCR_HI)
                    .addReg(FlatScrInitHi)
                    .addImm(0);
    Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
                      AMDGPU::FLAT_SCR_HI)
                  .addReg(FlatScrInitLo, RegState::Kill)
                  .addImm(8);
  LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
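  // For example, with 10 preloaded SGPRs the slice above skips
  // (10 + 3) / 4 = 3 four-register tuples, so the search below starts at the
  // first 128-bit aligned tuple past the SGPRs holding preloaded arguments.
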
  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      MRI.reserveReg(Reg, TRI);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}
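// Roughly: MUBUF scratch offsets address the whole wave's swizzled scratch, so
// per-lane frame sizes must be scaled by the wavefront size (e.g. a 16-byte
// frame advances SP by 16 * 64 = 1024 bytes on wave64), while flat scratch
// addressing is already per-lane and needs no scaling.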
static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found clobbers the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
    // FIXME: We can spill incoming arguments and restore at the end of the
    // prolog.
    if (!ScratchWaveOffsetReg)
      report_fatal_error(
          "could not find temporary scratch offset register in prolog");
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST);
  if (!mayReserveScratchForCWSR(MF)) {
    if (hasFP(MF)) {
      Register FPReg = MFI->getFrameOffsetReg();
      assert(FPReg != AMDGPU::FP_REG);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
    }

    if (requiresStackPointerReference(MF)) {
      Register SPReg = MFI->getStackPtrOffsetReg();
      assert(SPReg != AMDGPU::SP_REG);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset);
    }
  } else {
    // We need to check if we're on a compute queue - if we are, then the CWSR
    // trap handler may need to store some VGPRs on the stack. The first VGPR
    // block is saved separately, so we only need to allocate space for any
    // additional VGPR blocks used. For now, we will make sure there's enough
    // room for the theoretical maximum number of VGPRs that can be allocated.
    // FIXME: Figure out if the shader uses fewer VGPRs in practice.
    assert(hasFP(MF));
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    unsigned VGPRSize = llvm::alignTo(
        (ST.getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()) -
         AMDGPU::IsaInfo::getVGPRAllocGranule(
             &ST, MFI->getDynamicVGPRBlockSize())) *
            4,
        FrameInfo.getMaxAlign());
    MFI->setScratchReservedForDynamicVGPRs(VGPRSize);

    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
        .addImm(AMDGPU::Hwreg::HwregEncoding::encode(
            AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
    // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
    // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
    // SCC, so we need to check for 0 manually.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize);
    if (requiresStackPointerReference(MF)) {
      Register SPReg = MFI->getStackPtrOffsetReg();
      assert(SPReg != AMDGPU::SP_REG);

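      // SCC still holds "ME_ID != 0" from the S_CMP_LG_U32 above, so both
      // forms below can select on it directly. A rough reading of the emitted
      // sequence: SP = on a compute queue ? Offset + VGPRSize : Offset.
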
      // If at least one of the constants can be inlined, then we can use
      // s_cselect. Otherwise, use a mov and cmovk.
      if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm()) ||
          AMDGPU::isInlinableLiteral32(Offset + VGPRSize,
                                       ST.hasInv2PiInlineImm())) {
        BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CSELECT_B32), SPReg)
            .addImm(Offset + VGPRSize)
            .addImm(Offset);
      } else {
        BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset);
        BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), SPReg)
            .addImm(Offset + VGPRSize);
      }
    }
  }

  bool NeedsFlatScratchInit =
      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
        .addReg(Rsrc01)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
        .addMemOperand(MMO);
    // The driver will always set the SRD for wave 64 (bits 118:117 of
    // descriptor / bits 22:21 of third sub-reg will be 0b11)
    // If the shader is actually wave32 we have to modify the const_index_stride
    // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
    // reason the driver does this is that there can be cases where it presents
    // 2 shaders with different wave size (e.g. VsFs).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
    if (ST.isWave32()) {
      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
          .addImm(21)
          .addReg(Rsrc03);
    }
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto *MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addImm(0) // offset
            .addImm(0) // cpol
            .addMemOperand(MMO)
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
          .addExternalSymbol("SCRATCH_RSRC_DWORD0")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
          .addExternalSymbol("SCRATCH_RSRC_DWORD1")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
        .addImm(Lo_32(Rsrc23))
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
        .addImm(Hi_32(Rsrc23))
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
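  // A rough sketch of the update below (assuming no carry out of bit 47):
  //   s_add_u32  rsrc.sub0, rsrc.sub0, scratch_wave_offset ; low 32 bits
  //   s_addc_u32 rsrc.sub1, rsrc.sub1, 0                   ; bits 47:32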
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
                  .addReg(ScratchRsrcSub1)
                  .addImm(0)
                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
}

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::ScalableVector:
  case TargetStackID::WasmLocal:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. It returns the saved exec.
static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const DebugLoc &DL, bool IsProlog,
                                     bool EnableInactiveLanes) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveUnits, *TRI.getWaveMaskRegClass());
  if (!ScratchExecCopy)
    report_fatal_error("failed to find free scratch register");

  LiveUnits.addReg(ScratchExecCopy);

  const unsigned SaveExecOpc =
      ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
                                           : AMDGPU::S_OR_SAVEEXEC_B32)
                    : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
                                           : AMDGPU::S_OR_SAVEEXEC_B64);
  auto SaveExec =
      BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
  SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.

  return ScratchExecCopy;
}

void SIFrameLowering::emitCSRSpillStores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
  // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
  // might end up flipping the EXEC bits twice.
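  // In wave32 terms (wave64 uses the B64 forms), buildScratchExecCopy emits
  // roughly one of:
  //   s_xor_saveexec_b32 sCopy, -1 ; save exec, exec = ~exec (inactive lanes)
  //   s_or_saveexec_b32  sCopy, -1 ; save exec, exec = -1    (all lanes)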
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ true, /*EnableInactiveLanes*/ true);

  auto StoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                           VGPR, FI, FrameReg);
        }
      };

  StoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ true,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  StoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveUnits.addReg(ScratchExecCopy);
  }

  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP spill:
    // Skip if FP is saved to a scratch SGPR; that save has already been
    // emitted. Otherwise, FP has been moved to a temporary register, so spill
    // that register instead.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.save();
  }

  // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
  // such scratch registers live throughout the function.
  SmallVector<Register, 1> ScratchSGPRs;
  FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
  if (!ScratchSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : ScratchSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
    if (!LiveUnits.empty()) {
      for (MCPhysReg Reg : ScratchSGPRs)
        LiveUnits.addReg(Reg);
    }
  }
}

void SIFrameLowering::emitCSRSpillRestores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP restore:
    // Skip if FP needs to be restored from the scratch SGPR. Otherwise,
    // restore the FP value into a temporary register. The frame pointer should
    // be overwritten only at the end, when all other spills are restored from
    // the current frame.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.restore();
  }

  // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
  // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
  // this, we might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ false, /*EnableInactiveLanes*/ true);

  auto RestoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                             VGPR, FI, FrameReg);
        }
      };

  RestoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ false,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  RestoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LiveRegUnits LiveUnits;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  // DebugLoc must be unknown since the first instruction with DebugLoc is used
  // to determine the end of the prologue.
  DebugLoc DL;

  if (FuncInfo->isChainFunction()) {
    // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
    // are free to set one up if they need it.
    bool UseSP = requiresStackPointerReference(MF);
    if (UseSP) {
      assert(StackPtrReg != AMDGPU::SP_REG);

      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
          .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
    }
  }

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;

  if (TRI.hasStackRealignment(MF))
    HasFP = true;

  Register FramePtrRegScratchCopy;
  if (!HasFP && !hasFP(MF)) {
    // Emit the CSR spill stores with SP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
                       FuncInfo->isChainFunction() ? Register() : StackPtrReg,
                       FramePtrRegScratchCopy);
  } else {
    // CSR spill stores will use FP as base register.
    Register SGPRForFPSaveRestoreCopy =
        FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
    if (SGPRForFPSaveRestoreCopy) {
      // Copy FP to the scratch register now and emit the CFI entry. It avoids
      // the extra FP copy needed in the other two cases when FP is spilled to
      // memory or to a VGPR lane.
      PrologEpilogSGPRSpillBuilder SB(
          FramePtrReg,
          FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
          DL, TII, TRI, LiveUnits, FramePtrReg);
      SB.save();
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      // Copy FP into a new scratch register so that its previous value can be
      // spilled after setting up the new frame.
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
          .addReg(FramePtrReg);
    }
  }

  if (HasFP) {
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveUnits.empty()) {
      LiveUnits.init(TRI);
      LiveUnits.addLiveIns(MBB);
    }

    // s_add_i32 s33, s32, NumBytes
    // s_and_b32 s33, s33, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
                   .addReg(FramePtrReg, RegState::Kill)
                   .addImm(-Alignment * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If FP is used, emit the CSR spills with FP base register.
  if (HasFP) {
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                       FramePtrRegScratchCopy);
    if (FramePtrRegScratchCopy)
      LiveUnits.removeReg(FramePtrRegScratchCopy);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
                   .addReg(StackPtrReg)
                   .addImm(RoundedSize * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
  (void)FPSaved;
  assert((!HasFP || FPSaved) &&
         "Needed to save FP but didn't save it anywhere");

  // If we allow spilling to AGPRs we may have saved FP but then spill
  // everything into AGPRs instead of the stack.
  assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
         "Saved FP but didn't need it");

  bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
  (void)BPSaved;
  assert((!HasBP || BPSaved) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LiveRegUnits LiveUnits;
  // Get the insert location for the epilogue. If there were no terminators in
  // the block, get the last instruction.
  MachineBasicBlock::iterator MBBI = MBB.end();
  DebugLoc DL;
  if (!MBB.empty()) {
    MBBI = MBB.getLastNonDebugInstr();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    MBBI = MBB.getFirstTerminator();
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);

  if (RoundedSize != 0) {
    if (TRI.hasBasePointer(MF)) {
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
          .addReg(TRI.getBaseRegister())
          .setMIFlag(MachineInstr::FrameDestroy);
    } else if (hasFP(MF)) {
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
          .addReg(FramePtrReg)
          .setMIFlag(MachineInstr::FrameDestroy);
    }
  }

  Register FramePtrRegScratchCopy;
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  if (FPSaved) {
    // CSR spill restores should use FP as base register. If
    // SGPRForFPSaveRestoreCopy is not valid, restore the previous value of FP
    // into a new scratch register and copy it to FP later, when the other
    // registers are restored from the current stack frame.
    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
    if (SGPRForFPSaveRestoreCopy) {
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
    }

    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                         FramePtrRegScratchCopy);
  }

  if (FPSaved) {
    // Insert the copy to restore FP.
    Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
                                               : FramePtrRegScratchCopy;
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
            .addReg(SrcReg);
    if (SGPRForFPSaveRestoreCopy)
      MIB.setMIFlag(MachineInstr::FrameDestroy);
  } else {
    // Insert the CSR spill restores with SP as the base register.
    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits,
                         FuncInfo->isChainFunction() ? Register() : StackPtrReg,
                         FramePtrRegScratchCopy);
  }
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
      return false;
    }
  }

  return true;
}
#endif

StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
                                                    int FI,
                                                    Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF,
    RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
                               && EnableSpillVGPRToAGPR;

  if (SpillVGPRToAGPR) {
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
    BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);

    bool SeenDbgInstr = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
        int FrameIndex;
        if (MI.isDebugInstr())
          SeenDbgInstr = true;

        if (TII->isVGPRSpill(MI)) {
          // Try to eliminate stack used by VGPR spills before frame
          // finalization.
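          // On subtargets with MAI instructions (e.g. gfx908), rewriting the
          // frame index here turns the spill into an AGPR<->VGPR move (roughly
          // a V_ACCVGPR_WRITE_B32/V_ACCVGPR_READ_B32 pair) instead of a
          // scratch memory access; see buildSpillLoadStore for the lowering.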
          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::vaddr);
          int FI = MI.getOperand(FIOp).getIndex();
          Register VReg =
              TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                TRI->isAGPR(MRI, VReg))) {
            assert(RS != nullptr);
            RS->enterBasicBlockEnd(MBB);
            RS->backward(std::next(MI.getIterator()));
            TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
            SpillFIs.set(FI);
            continue;
          }
        } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
                   TII->isLoadFromStackSlot(MI, FrameIndex))
          if (!MFI.isFixedObjectIndex(FrameIndex))
            NonVGPRSpillFIs.set(FrameIndex);
      }
    }

    // Stack slot coloring may assign different objects to the same stack slot.
    // If not, then the VGPR to AGPR spill slot is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead, update it with the
        // correct register value. But not sure the register value alone is
        // adequate to lower the DIExpression. It should be worked out later.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue()) {
            uint32_t StackOperandIdx = MI.isDebugValueList() ? 2 : 0;
            if (MI.getOperand(StackOperandIdx).isFI() &&
                !MFI.isFixedObjectIndex(
                    MI.getOperand(StackOperandIdx).getIndex()) &&
                SpillFIs[MI.getOperand(StackOperandIdx).getIndex()]) {
              MI.getOperand(StackOperandIdx)
                  .ChangeToRegister(Register(), false /*isDef*/);
            }
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateSpillStackObject(4, Align(4)));
    }
  }
}

void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we initially reserved the highest available VGPR for the
    // AGPR copy. Now that RA is done, check whether an unused VGPR exists
    // that is lower than the one reserved earlier. If so, use it for the
    // AGPR copy instead.
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Reserve this newly identified VGPR for the AGPR copy. Reserved
      // registers should already be frozen at this point, so we can avoid
      // calling MRI.freezeReservedRegs and just use MRI.reserveReg.
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.reserveReg(UnusedLowVGPR, TRI);
    }
  }
  // We initially reserved the highest available SGPR pair for long branches;
  // now, after RA, shift down to a lower unused pair if one exists.
  Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
  Register UnusedLowSGPR =
      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
  // If LongBranchReservedReg is null, we didn't find a long branch and never
  // reserved a register to begin with, so there is nothing to shift down.
  // If UnusedLowSGPR is null, there is no lower register available, so just
  // keep the one originally set.
  if (LongBranchReservedReg && UnusedLowSGPR) {
    FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
    MRI.reserveReg(UnusedLowSGPR, TRI);
  }
}

// The special SGPR spills, like the ones needed for FP, BP, or any reserved
// registers, are delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs,
    bool NeedExecCopyReservedReg) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  LiveRegUnits LiveUnits;
  LiveUnits.init(*TRI);
  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
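  // LiveUnits is updated as scratch SGPRs are handed out below, so the EXEC
  // copy and the FP/BP saves can never be assigned the same register.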
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveUnits.addReg(CSRegs[I]);

  const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();

  Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
  if (NeedExecCopyReservedReg ||
      (ReservedRegForExecCopy &&
       MRI.isPhysRegUsed(ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
    MRI.reserveReg(ReservedRegForExecCopy, TRI);
    Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
    if (UnusedScratchReg) {
      // If we found an unused scratch SGPR, use the register itself for the
      // EXEC copy; there is no need for any spill in that case.
      MFI->setSGPRForEXECCopy(UnusedScratchReg);
      MRI.replaceRegWith(ReservedRegForExecCopy, UnusedScratchReg);
      LiveUnits.addReg(UnusedScratchReg);
    } else {
      // Needs spill.
      assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
             "Re-reserving spill slot for EXEC copy register");
      getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedRegForExecCopy, RC,
                                     /*IncludeScratchCopy=*/false);
    }
  } else if (ReservedRegForExecCopy) {
    // Reset it at this point. No whole-wave copies or spills were encountered.
    MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
  }

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
  }

  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
  }
}

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If this is a function with the amdgpu_cs_chain[_preserve] calling
  // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
  // we don't need to save and restore anything.
  if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
    return;

  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool NeedExecCopyReservedReg = false;

  MachineInstr *ReturnMI = nullptr;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // TODO: Walking through all MBBs here would be a bad heuristic. Better
      // handle them elsewhere.
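      // For now we scan for two things: WWM spill pseudos, which will need a
      // spare SGPR to save and restore EXEC around them, and return-like
      // instructions, whose register operands must be kept out of the CSR set
      // so the restores cannot clobber return values.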
      if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
        NeedExecCopyReservedReg = true;
      else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
               MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
               (MFI->isChainFunction() &&
                TII->isChainCallOpcode(MI.getOpcode()))) {
        // We expect all returns to be the same size.
        assert(!ReturnMI ||
               (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
                count_if(ReturnMI->operands(),
                         [](auto Op) { return Op.isReg(); })));
        ReturnMI = &MI;
      }
    }
  }

  SmallVector<Register> SortedWWMVGPRs;
  for (Register Reg : MFI->getWWMReservedRegs()) {
    // The shift-back is needed only for the VGPRs used for SGPR spills, and
    // those are 32 bits wide. The SIPreAllocateWWMRegs pass can add register
    // tuples to the WWM reserved set.
    const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
    if (TRI->getRegSizeInBits(*RC) != 32)
      continue;
    SortedWWMVGPRs.push_back(Reg);
  }

  sort(SortedWWMVGPRs, std::greater<Register>());
  MFI->shiftWwmVGPRsToLowestRange(MF, SortedWWMVGPRs, SavedVGPRs);

  if (MFI->isEntryFunction())
    return;

  // Remove any VGPRs used in the return value because these do not need to be
  // saved. This prevents CSR restore from clobbering return VGPRs.
  if (ReturnMI) {
    for (auto &Op : ReturnMI->operands()) {
      if (Op.isReg())
        SavedVGPRs.reset(Op.getReg());
    }
  }

  // Create the stack objects for WWM registers now.
  for (Register Reg : MFI->getWWMReservedRegs()) {
    const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
    MFI->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
                          TRI->getSpillAlign(*RC));
  }

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
  // On gfx908 there are no AGPR loads and stores, so spilling AGPRs also
  // requires a temporary VGPR.
  if (!ST.hasGFX90AInsts())
    SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());

  determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);

  // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
  // allow the default insertion to handle them.
  for (auto &Reg : MFI->getWWMSpills())
    SavedVGPRs.reset(Reg.first);
}

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());

  const BitVector AllSavedRegs = SavedRegs;
  SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

  // We have to anticipate introducing CSR VGPR spills, or spills of the
  // caller-saved VGPR reserved for SGPR spills, as we now always create a
  // stack entry for them even if we don't have any stack objects yet, since
  // we require an FP whenever there is a call and a stack. We will allocate a
  // VGPR for SGPR spills if there are any SGPR spills, whether they are CSR
  // spills or otherwise.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const bool WillHaveFP =
      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

  // FP will be specially managed like SP.
  if (WillHaveFP || hasFP(MF))
    SavedRegs.reset(MFI->getFrameOffsetReg());

  // Return address use with a return instruction is hidden through the
  // SI_RETURN pseudo. Given that, and since IPRA computes actual register
  // usage and does not use the CSR list, the clobbering of the return address
  // by function calls (D117243) or otherwise (D120922) is not seen by IPRA's
  // register usage collection. This ensures save/restore of the return
  // address happens in those scenarios.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  Register RetAddrReg = TRI->getReturnAddressReg(MF);
  if (!MFI->isEntryFunction() &&
      (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
  }
}

static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
                                       const GCNSubtarget &ST,
                                       std::vector<CalleeSavedInfo> &CSI,
                                       unsigned &MinCSFrameIndex,
                                       unsigned &MaxCSFrameIndex) {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  assert(
      llvm::is_sorted(CSI,
                      [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
                        return A.getReg() < B.getReg();
                      }) &&
      "Callee saved registers not sorted");

  auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
    return !CSI.isSpilledToReg() &&
           TRI->getPhysRegBaseClass(CSI.getReg()) ==
               &AMDGPU::VGPR_32RegClass &&
           !FuncInfo->isWWMReservedRegister(CSI.getReg());
  };

  auto CSEnd = CSI.end();
  for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) {
    Register Reg = CSIt->getReg();
    if (!CanUseBlockOps(*CSIt))
      continue;

    // Find all the regs that will fit in a 32-bit mask starting at the current
    // reg and build said mask. It should have 1 for every register that's
    // included, with the current register as the least significant bit.
    uint32_t Mask = 1;
    CSEnd = std::remove_if(
        CSIt + 1, CSEnd, [&](const CalleeSavedInfo &CSI) -> bool {
          if (CanUseBlockOps(CSI) && CSI.getReg() < Reg + 32) {
            Mask |= 1 << (CSI.getReg() - Reg);
            return true;
          } else {
            return false;
          }
        });

    const TargetRegisterClass *BlockRegClass = TRI->getRegClassForBlockOp(MF);
    Register RegBlock =
        TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, BlockRegClass);
    if (!RegBlock) {
      // We couldn't find a super register for the block. This can happen if
      // the register we started with is too high (e.g. v232 if the maximum is
      // v255). We therefore try to get the last register block and figure out
      // the mask from there.
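      // For example, if Reg is v232 and blocks are 32 registers wide, the
      // enclosing block starts at v224, so RegDelta is 8 and the mask built
      // relative to v232 must be shifted left by 8 to become relative to v224.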
      Register LastBlockStart =
          AMDGPU::VGPR0 + alignDown(Reg - AMDGPU::VGPR0, 32);
      RegBlock =
          TRI->getMatchingSuperReg(LastBlockStart, AMDGPU::sub0, BlockRegClass);
      assert(RegBlock && TRI->isSubRegister(RegBlock, Reg) &&
             "Couldn't find super register");
      int RegDelta = Reg - LastBlockStart;
      assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta &&
             "Bad shift amount");
      Mask <<= RegDelta;
    }

    FuncInfo->setMaskForVGPRBlockOps(RegBlock, Mask);

    // The stack objects can be a bit smaller than the register block if we
    // know some of the high bits of Mask are 0. This may happen often with
    // calling conventions where the caller-saved and callee-saved VGPRs are
    // interleaved at a small boundary (e.g. 8 or 16).
    int UnusedBits = llvm::countl_zero(Mask);
    unsigned BlockSize = TRI->getSpillSize(*BlockRegClass) - UnusedBits * 4;
    int FrameIdx =
        MFI.CreateStackObject(BlockSize, TRI->getSpillAlign(*BlockRegClass),
                              /*isSpillSlot=*/true);
    if ((unsigned)FrameIdx < MinCSFrameIndex)
      MinCSFrameIndex = FrameIdx;
    if ((unsigned)FrameIdx > MaxCSFrameIndex)
      MaxCSFrameIndex = FrameIdx;

    CSIt->setFrameIdx(FrameIdx);
    CSIt->setReg(RegBlock);
  }
  CSI.erase(CSEnd, CSI.end());
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
    unsigned &MaxCSFrameIndex) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR();

  if (UseVGPRBlocks)
    assignSlotsUsingVGPRBlocks(MF, ST, CSI, MinCSFrameIndex, MaxCSFrameIndex);

  return assignCalleeSavedSpillSlots(MF, TRI, CSI) || UseVGPRBlocks;
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!
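
  // This overload only redirects the FP/BP entries whose saves were lowered
  // to copies into scratch SGPRs; every other callee save keeps the stack
  // slot it was assigned.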
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg = RI->getBaseRegister();
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  Register SGPRForBPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
  if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
    return false;

  unsigned NumModifiedRegs = 0;

  if (SGPRForFPSaveRestoreCopy)
    NumModifiedRegs++;
  if (SGPRForBPSaveRestoreCopy)
    NumModifiedRegs++;

  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg.asMCReg() && SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(SGPRForFPSaveRestoreCopy);
      // Stop once every scratch-copied register has been rewritten.
      if (--NumModifiedRegs == 0)
        break;
    } else if (CS.getReg() == BasePtrReg.asMCReg() &&
               SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs == 0)
        break;
    }
  }

  return false;
}

bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
    const MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  uint64_t EstStackSize = MFI.estimateStackSize(MF);
  uint64_t MaxOffset = EstStackSize - 1;

  // We need the emergency stack slots to be allocated in range of the
  // MUBUF/flat scratch immediate offset from the base register, so assign
  // these first at the incoming SP position.
  //
  // TODO: We could try sorting the objects to find a hole in the first bytes
  // rather than allocating as close as possible. This could save a lot of
  // space on frames with alignment requirements.
  if (ST.enableFlatScratch()) {
    if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                               SIInstrFlags::FlatScratch))
      return false;
  } else {
    if (TII->isLegalMUBUFImmOffset(MaxOffset))
      return false;
  }

  return true;
}

bool SIFrameLowering::spillCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  if (!ST.useVGPRBlockOpsForCSR())
    return false;

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const TargetRegisterClass *BlockRegClass =
      static_cast<const SIRegisterInfo *>(TRI)->getRegClassForBlockOp(*MF);
  for (const CalleeSavedInfo &CS : CSI) {
    Register Reg = CS.getReg();
    if (!BlockRegClass->contains(Reg) ||
        !FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
      spillCalleeSavedRegister(MBB, MI, CS, TII, TRI);
      continue;
    }

    // Build a scratch block store.
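    // The immediate mask tells the pseudo which of the block's 32 registers
    // actually have to be written; the unset high bits are what allowed
    // assignSlotsUsingVGPRBlocks to shrink the stack object.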
    uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg);
    int FrameIndex = CS.getFrameIdx();
    MachinePointerInfo PtrInfo =
        MachinePointerInfo::getFixedStack(*MF, FrameIndex);
    MachineMemOperand *MMO =
        MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                 FrameInfo.getObjectSize(FrameIndex),
                                 FrameInfo.getObjectAlign(FrameIndex));

    BuildMI(MBB, MI, MI->getDebugLoc(),
            TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
        .addReg(Reg, getKillRegState(false))
        .addFrameIndex(FrameIndex)
        .addReg(MFI->getStackPtrOffsetReg())
        .addImm(0)
        .addImm(Mask)
        .addMemOperand(MMO);

    FuncInfo->setHasSpilledVGPRs();

    // Add the register to the liveins. This is necessary because if any of the
    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
    // then the whole block will be marked as reserved and `updateLiveness`
    // will skip it.
    MBB.addLiveIn(Reg);
  }
  MBB.sortUniqueLiveIns();

  return true;
}

bool SIFrameLowering::restoreCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  if (!ST.useVGPRBlockOpsForCSR())
    return false;

  SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *SITRI = static_cast<const SIRegisterInfo *>(TRI);
  const TargetRegisterClass *BlockRegClass = SITRI->getRegClassForBlockOp(*MF);
  for (const CalleeSavedInfo &CS : reverse(CSI)) {
    Register Reg = CS.getReg();
    if (!BlockRegClass->contains(Reg) ||
        !FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
      restoreCalleeSavedRegister(MBB, MI, CS, TII, TRI);
      continue;
    }

    // Build a scratch block load.
    uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg);
    int FrameIndex = CS.getFrameIdx();
    MachinePointerInfo PtrInfo =
        MachinePointerInfo::getFixedStack(*MF, FrameIndex);
    MachineMemOperand *MMO = MF->getMachineMemOperand(
        PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
        MFI.getObjectAlign(FrameIndex));

    auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
                       TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), Reg)
                   .addFrameIndex(FrameIndex)
                   .addReg(FuncInfo->getStackPtrOffsetReg())
                   .addImm(0)
                   .addImm(Mask)
                   .addMemOperand(MMO);
    SITRI->addImplicitUsesForBlockCSRLoad(MIB, Reg);

    // Add the register to the liveins. This is necessary because if any of the
    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
    // then the whole block will be marked as reserved and `updateLiveness`
    // will skip it.
    MBB.addLiveIn(Reg);
  }

  MBB.sortUniqueLiveIns();
  return true;
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF,
    MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

    Amount *= getScratchScaleFactor(ST);
    if (IsDestroy)
      Amount = -Amount;
    auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
                   .addReg(SPReg)
                   .addImm(Amount);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

/// Returns true if the frame will require a reference to the stack pointer.
///
/// This is the set of conditions common to setting up the stack pointer in a
/// kernel, and for using a frame pointer in a callable function.
///
/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
/// references SP.
static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
  return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
}

// The FP for kernels is always known 0, so we never really need to set up an
// explicit register for it. However, DisableFramePointerElim will force us to
// use a register for it.
bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry & chain functions we can use an immediate offset in most cases,
  // so the presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
    // All offsets are unsigned, so need to be addressed in the same direction
    // as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
         MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
             MF) ||
         mayReserveScratchForCWSR(MF) ||
         MF.getTarget().Options.DisableFramePointerElim(MF);
}

bool SIFrameLowering::mayReserveScratchForCWSR(
    const MachineFunction &MF) const {
  return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
         AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) &&
         AMDGPU::isCompute(MF.getFunction().getCallingConv());
}

// This is essentially a reduced version of hasFP for entry functions.
// Since the stack pointer is known 0 on entry to kernels, we never really
// need an FP register. We may need to initialize the stack pointer depending
// on the frame properties, which logically overlaps many of the cases where
// an ordinary function would require an FP.
// Also used for chain functions. While not technically entry functions, chain
// functions may need to set up a stack pointer in some situations.
bool SIFrameLowering::requiresStackPointerReference(
    const MachineFunction &MF) const {
  // Callable functions always require a stack pointer reference.
  assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
          MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
         "only expected to call this for entry points and chain functions");

  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Entry points ordinarily don't need to initialize SP. We have to set it up
  // for callees if there are any. Also note tail calls are impossible/don't
  // make any sense for kernels.
  if (MFI.hasCalls())
    return true;

  // We still need to initialize the SP if we're doing anything weird that
  // references the SP, like variable sized stack objects.
  return frameTriviallyRequiresSP(MFI);
}