1 //===----------------------- SIFrameLowering.cpp --------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 9 #include "SIFrameLowering.h" 10 #include "AMDGPU.h" 11 #include "GCNSubtarget.h" 12 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 13 #include "SIMachineFunctionInfo.h" 14 #include "llvm/CodeGen/LivePhysRegs.h" 15 #include "llvm/CodeGen/MachineFrameInfo.h" 16 #include "llvm/CodeGen/RegisterScavenging.h" 17 #include "llvm/Target/TargetMachine.h" 18 19 using namespace llvm; 20 21 #define DEBUG_TYPE "frame-info" 22 23 static cl::opt<bool> EnableSpillVGPRToAGPR( 24 "amdgpu-spill-vgpr-to-agpr", 25 cl::desc("Enable spilling VGPRs to AGPRs"), 26 cl::ReallyHidden, 27 cl::init(true)); 28 29 // Find a register matching \p RC from \p LiveRegs which is unused and available 30 // throughout the function. On failure, returns AMDGPU::NoRegister. 31 static MCRegister findUnusedRegister(MachineRegisterInfo &MRI, 32 const LivePhysRegs &LiveRegs, 33 const TargetRegisterClass &RC) { 34 for (MCRegister Reg : RC) { 35 if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg)) 36 return Reg; 37 } 38 return MCRegister(); 39 } 40 41 // Find a scratch register that we can use in the prologue. We avoid using 42 // callee-save registers since they may appear to be free when this is called 43 // from canUseAsPrologue (during shrink wrapping), but then no longer be free 44 // when this is called from emitPrologue. 45 static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, 46 LivePhysRegs &LiveRegs, 47 const TargetRegisterClass &RC, 48 bool Unused = false) { 49 // Mark callee saved registers as used so we will not choose them. 50 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); 51 for (unsigned i = 0; CSRegs[i]; ++i) 52 LiveRegs.addReg(CSRegs[i]); 53 54 // We are looking for a register that can be used throughout the entire 55 // function, so any use is unacceptable. 56 if (Unused) 57 return findUnusedRegister(MRI, LiveRegs, RC); 58 59 for (MCRegister Reg : RC) { 60 if (LiveRegs.available(MRI, Reg)) 61 return Reg; 62 } 63 64 return MCRegister(); 65 } 66 67 static void getVGPRSpillLaneOrTempRegister( 68 MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR, 69 const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass) { 70 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 71 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 72 73 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 74 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 75 unsigned Size = TRI->getSpillSize(RC); 76 Align Alignment = TRI->getSpillAlign(RC); 77 78 // We need to save and restore the given SGPR. 79 80 // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs 81 // should have all the callee saved registers marked as used. 82 Register ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); 83 84 if (!ScratchSGPR) { 85 int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr, 86 TargetStackID::SGPRSpill); 87 88 if (TRI->spillSGPRToVGPR() && 89 MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) { 90 // 2: There's no free lane to spill, and no free register to save the 91 // SGPR, so we're forced to take another VGPR to use for the spill. 
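      // (Illustrative sketch, not code emitted at this point.) A
      // SPILL_TO_VGPR_LANE entry recorded below is materialized later by
      // PrologEpilogSGPRSpillBuilder as, roughly:
      //   v_writelane_b32 <Spill.VGPR>, <SGPR>, <Spill.Lane>   ; prologue save
      //   v_readlane_b32  <SGPR>, <Spill.VGPR>, <Spill.Lane>   ; epilogue restore
      // so the SGPR is parked in a lane of a VGPR instead of in scratch memory.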
92 MFI->addToPrologEpilogSGPRSpills( 93 SGPR, PrologEpilogSGPRSaveRestoreInfo( 94 SGPRSaveKind::SPILL_TO_VGPR_LANE, FI)); 95 96 LLVM_DEBUG( 97 auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front(); 98 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " 99 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';); 100 } else { 101 // Remove dead <FI> index 102 MF.getFrameInfo().RemoveStackObject(FI); 103 // 3: If all else fails, spill the register to memory. 104 FI = FrameInfo.CreateSpillStackObject(Size, Alignment); 105 MFI->addToPrologEpilogSGPRSpills( 106 SGPR, 107 PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI)); 108 LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling " 109 << printReg(SGPR, TRI) << '\n'); 110 } 111 } else { 112 MFI->addToPrologEpilogSGPRSpills( 113 SGPR, PrologEpilogSGPRSaveRestoreInfo( 114 SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR)); 115 LiveRegs.addReg(ScratchSGPR); 116 LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to " 117 << printReg(ScratchSGPR, TRI) << '\n'); 118 } 119 } 120 121 // We need to specially emit stack operations here because a different frame 122 // register is used than in the rest of the function, as getFrameRegister would 123 // use. 124 static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, 125 const SIMachineFunctionInfo &FuncInfo, 126 LivePhysRegs &LiveRegs, MachineFunction &MF, 127 MachineBasicBlock &MBB, 128 MachineBasicBlock::iterator I, const DebugLoc &DL, 129 Register SpillReg, int FI, Register FrameReg, 130 int64_t DwordOff = 0) { 131 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 132 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 133 134 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 135 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 136 MachineMemOperand *MMO = MF.getMachineMemOperand( 137 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI), 138 FrameInfo.getObjectAlign(FI)); 139 LiveRegs.addReg(SpillReg); 140 bool IsKill = !MBB.isLiveIn(SpillReg); 141 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg, 142 DwordOff, MMO, nullptr, &LiveRegs); 143 if (IsKill) 144 LiveRegs.removeReg(SpillReg); 145 } 146 147 static void buildEpilogRestore(const GCNSubtarget &ST, 148 const SIRegisterInfo &TRI, 149 const SIMachineFunctionInfo &FuncInfo, 150 LivePhysRegs &LiveRegs, MachineFunction &MF, 151 MachineBasicBlock &MBB, 152 MachineBasicBlock::iterator I, 153 const DebugLoc &DL, Register SpillReg, int FI, 154 Register FrameReg, int64_t DwordOff = 0) { 155 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR 156 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 157 158 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 159 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 160 MachineMemOperand *MMO = MF.getMachineMemOperand( 161 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI), 162 FrameInfo.getObjectAlign(FI)); 163 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg, 164 DwordOff, MMO, nullptr, &LiveRegs); 165 } 166 167 static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 168 const DebugLoc &DL, const SIInstrInfo *TII, 169 Register TargetReg) { 170 MachineFunction *MF = MBB.getParent(); 171 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 172 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 173 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); 174 Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0); 175 Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1); 176 177 if (MFI->getGITPtrHigh() != 0xffffffff) { 178 BuildMI(MBB, I, DL, SMovB32, TargetHi) 179 .addImm(MFI->getGITPtrHigh()) 180 .addReg(TargetReg, RegState::ImplicitDefine); 181 } else { 182 const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); 183 BuildMI(MBB, I, DL, GetPC64, TargetReg); 184 } 185 Register GitPtrLo = MFI->getGITPtrLoReg(*MF); 186 MF->getRegInfo().addLiveIn(GitPtrLo); 187 MBB.addLiveIn(GitPtrLo); 188 BuildMI(MBB, I, DL, SMovB32, TargetLo) 189 .addReg(GitPtrLo); 190 } 191 192 static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI, 193 const SIMachineFunctionInfo *FuncInfo, 194 MachineFunction &MF, MachineBasicBlock &MBB, 195 MachineBasicBlock::iterator MBBI, bool IsProlog) { 196 if (LiveRegs.empty()) { 197 LiveRegs.init(TRI); 198 if (IsProlog) { 199 LiveRegs.addLiveIns(MBB); 200 } else { 201 // In epilog. 202 LiveRegs.addLiveOuts(MBB); 203 LiveRegs.stepBackward(*MBBI); 204 } 205 } 206 } 207 208 namespace llvm { 209 210 // SpillBuilder to save/restore special SGPR spills like the one needed for FP, 211 // BP, etc. These spills are delayed until the current function's frame is 212 // finalized. For a given register, the builder uses the 213 // PrologEpilogSGPRSaveRestoreInfo to decide the spill method. 214 class PrologEpilogSGPRSpillBuilder { 215 MachineBasicBlock::iterator MI; 216 MachineBasicBlock &MBB; 217 MachineFunction &MF; 218 const GCNSubtarget &ST; 219 MachineFrameInfo &MFI; 220 SIMachineFunctionInfo *FuncInfo; 221 const SIInstrInfo *TII; 222 const SIRegisterInfo &TRI; 223 Register SuperReg; 224 const PrologEpilogSGPRSaveRestoreInfo SI; 225 LivePhysRegs &LiveRegs; 226 const DebugLoc &DL; 227 Register FrameReg; 228 ArrayRef<int16_t> SplitParts; 229 unsigned NumSubRegs; 230 unsigned EltSize = 4; 231 232 void saveToMemory(const int FI) const { 233 MachineRegisterInfo &MRI = MF.getRegInfo(); 234 assert(!MFI.isDeadObjectIndex(FI)); 235 236 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true); 237 238 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 239 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 240 if (!TmpVGPR) 241 report_fatal_error("failed to find free scratch register"); 242 243 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { 244 Register SubReg = NumSubRegs == 1 245 ? 
SuperReg 246 : Register(TRI.getSubReg(SuperReg, SplitParts[I])); 247 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 248 .addReg(SubReg); 249 250 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR, 251 FI, FrameReg, DwordOff); 252 DwordOff += 4; 253 } 254 } 255 256 void saveToVGPRLane(const int FI) const { 257 assert(!MFI.isDeadObjectIndex(FI)); 258 259 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); 260 ArrayRef<SIRegisterInfo::SpilledReg> Spill = 261 FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); 262 assert(Spill.size() == NumSubRegs); 263 264 for (unsigned I = 0; I < NumSubRegs; ++I) { 265 Register SubReg = NumSubRegs == 1 266 ? SuperReg 267 : Register(TRI.getSubReg(SuperReg, SplitParts[I])); 268 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[I].VGPR) 269 .addReg(SubReg) 270 .addImm(Spill[I].Lane) 271 .addReg(Spill[I].VGPR, RegState::Undef); 272 } 273 } 274 275 void copyToScratchSGPR(Register DstReg) const { 276 BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg) 277 .addReg(SuperReg) 278 .setMIFlag(MachineInstr::FrameSetup); 279 } 280 281 void restoreFromMemory(const int FI) { 282 MachineRegisterInfo &MRI = MF.getRegInfo(); 283 284 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false); 285 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 286 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 287 if (!TmpVGPR) 288 report_fatal_error("failed to find free scratch register"); 289 290 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { 291 Register SubReg = NumSubRegs == 1 292 ? SuperReg 293 : Register(TRI.getSubReg(SuperReg, SplitParts[I])); 294 295 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR, 296 FI, FrameReg, DwordOff); 297 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) 298 .addReg(TmpVGPR, RegState::Kill); 299 DwordOff += 4; 300 } 301 } 302 303 void restoreFromVGPRLane(const int FI) { 304 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); 305 ArrayRef<SIRegisterInfo::SpilledReg> Spill = 306 FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); 307 assert(Spill.size() == NumSubRegs); 308 309 for (unsigned I = 0; I < NumSubRegs; ++I) { 310 Register SubReg = NumSubRegs == 1 311 ? SuperReg 312 : Register(TRI.getSubReg(SuperReg, SplitParts[I])); 313 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) 314 .addReg(Spill[I].VGPR) 315 .addImm(Spill[I].Lane); 316 } 317 } 318 319 void copyFromScratchSGPR(Register SrcReg) const { 320 BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg) 321 .addReg(SrcReg) 322 .setMIFlag(MachineInstr::FrameDestroy); 323 } 324 325 public: 326 PrologEpilogSGPRSpillBuilder(Register Reg, 327 const PrologEpilogSGPRSaveRestoreInfo SI, 328 MachineBasicBlock &MBB, 329 MachineBasicBlock::iterator MI, 330 const DebugLoc &DL, const SIInstrInfo *TII, 331 const SIRegisterInfo &TRI, 332 LivePhysRegs &LiveRegs, Register FrameReg) 333 : MI(MI), MBB(MBB), MF(*MBB.getParent()), 334 ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()), 335 FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), 336 SuperReg(Reg), SI(SI), LiveRegs(LiveRegs), DL(DL), FrameReg(FrameReg) { 337 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg); 338 SplitParts = TRI.getRegSplitParts(RC, EltSize); 339 NumSubRegs = SplitParts.empty() ? 
1 : SplitParts.size(); 340 341 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 342 } 343 344 void save() { 345 switch (SI.getKind()) { 346 case SGPRSaveKind::SPILL_TO_MEM: 347 return saveToMemory(SI.getIndex()); 348 case SGPRSaveKind::SPILL_TO_VGPR_LANE: 349 return saveToVGPRLane(SI.getIndex()); 350 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: 351 return copyToScratchSGPR(SI.getReg()); 352 } 353 } 354 355 void restore() { 356 switch (SI.getKind()) { 357 case SGPRSaveKind::SPILL_TO_MEM: 358 return restoreFromMemory(SI.getIndex()); 359 case SGPRSaveKind::SPILL_TO_VGPR_LANE: 360 return restoreFromVGPRLane(SI.getIndex()); 361 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: 362 return copyFromScratchSGPR(SI.getReg()); 363 } 364 } 365 }; 366 367 } // namespace llvm 368 369 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` 370 void SIFrameLowering::emitEntryFunctionFlatScratchInit( 371 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 372 const DebugLoc &DL, Register ScratchWaveOffsetReg) const { 373 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 374 const SIInstrInfo *TII = ST.getInstrInfo(); 375 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 376 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 377 378 // We don't need this if we only have spills since there is no user facing 379 // scratch. 380 381 // TODO: If we know we don't have flat instructions earlier, we can omit 382 // this from the input registers. 383 // 384 // TODO: We only need to know if we access scratch space through a flat 385 // pointer. Because we only detect if flat instructions are used at all, 386 // this will be used more often than necessary on VI. 387 388 Register FlatScrInitLo; 389 Register FlatScrInitHi; 390 391 if (ST.isAmdPalOS()) { 392 // Extract the scratch offset from the descriptor in the GIT 393 LivePhysRegs LiveRegs; 394 LiveRegs.init(*TRI); 395 LiveRegs.addLiveIns(MBB); 396 397 // Find unused reg to load flat scratch init into 398 MachineRegisterInfo &MRI = MF.getRegInfo(); 399 Register FlatScrInit = AMDGPU::NoRegister; 400 ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF); 401 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2; 402 AllSGPR64s = AllSGPR64s.slice( 403 std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded)); 404 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 405 for (MCPhysReg Reg : AllSGPR64s) { 406 if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) && 407 !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { 408 FlatScrInit = Reg; 409 break; 410 } 411 } 412 assert(FlatScrInit && "Failed to find free register for scratch init"); 413 414 FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0); 415 FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1); 416 417 buildGitPtr(MBB, I, DL, TII, FlatScrInit); 418 419 // We now have the GIT ptr - now get the scratch descriptor from the entry 420 // at offset 0 (or offset 16 for a compute shader). 421 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 422 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); 423 auto *MMO = MF.getMachineMemOperand( 424 PtrInfo, 425 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | 426 MachineMemOperand::MODereferenceable, 427 8, Align(4)); 428 unsigned Offset = 429 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 
16 : 0; 430 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); 431 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); 432 BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit) 433 .addReg(FlatScrInit) 434 .addImm(EncodedOffset) // offset 435 .addImm(0) // cpol 436 .addMemOperand(MMO); 437 438 // Mask the offset in [47:0] of the descriptor 439 const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32); 440 auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi) 441 .addReg(FlatScrInitHi) 442 .addImm(0xffff); 443 And->getOperand(3).setIsDead(); // Mark SCC as dead. 444 } else { 445 Register FlatScratchInitReg = 446 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); 447 assert(FlatScratchInitReg); 448 449 MachineRegisterInfo &MRI = MF.getRegInfo(); 450 MRI.addLiveIn(FlatScratchInitReg); 451 MBB.addLiveIn(FlatScratchInitReg); 452 453 FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); 454 FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); 455 } 456 457 // Do a 64-bit pointer add. 458 if (ST.flatScratchIsPointer()) { 459 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 460 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) 461 .addReg(FlatScrInitLo) 462 .addReg(ScratchWaveOffsetReg); 463 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), 464 FlatScrInitHi) 465 .addReg(FlatScrInitHi) 466 .addImm(0); 467 Addc->getOperand(3).setIsDead(); // Mark SCC as dead. 468 469 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). 470 addReg(FlatScrInitLo). 471 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO | 472 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); 473 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). 474 addReg(FlatScrInitHi). 475 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | 476 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); 477 return; 478 } 479 480 // For GFX9. 481 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) 482 .addReg(FlatScrInitLo) 483 .addReg(ScratchWaveOffsetReg); 484 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), 485 AMDGPU::FLAT_SCR_HI) 486 .addReg(FlatScrInitHi) 487 .addImm(0); 488 Addc->getOperand(3).setIsDead(); // Mark SCC as dead. 489 490 return; 491 } 492 493 assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); 494 495 // Copy the size in bytes. 496 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) 497 .addReg(FlatScrInitHi, RegState::Kill); 498 499 // Add wave offset in bytes to private base offset. 500 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. 501 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo) 502 .addReg(FlatScrInitLo) 503 .addReg(ScratchWaveOffsetReg); 504 505 // Convert offset to 256-byte units. 506 auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), 507 AMDGPU::FLAT_SCR_HI) 508 .addReg(FlatScrInitLo, RegState::Kill) 509 .addImm(8); 510 LShr->getOperand(3).setIsDead(); // Mark SCC as dead. 511 } 512 513 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not 514 // memory. They should have been removed by now. 515 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { 516 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 517 I != E; ++I) { 518 if (!MFI.isDeadObjectIndex(I)) 519 return false; 520 } 521 522 return true; 523 } 524 525 // Shift down registers reserved for the scratch RSRC. 
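// For example (illustrative; the exact registers depend on how many SGPRs the
// kernel preloads): with 6 preloaded user/system SGPRs, the SRSRC that was
// reserved at the top of the SGPR file can be shifted down to the first free
// aligned quad after them, e.g. s[8:11], keeping the high SGPRs allocatable.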
526 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( 527 MachineFunction &MF) const { 528 529 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 530 const SIInstrInfo *TII = ST.getInstrInfo(); 531 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 532 MachineRegisterInfo &MRI = MF.getRegInfo(); 533 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 534 535 assert(MFI->isEntryFunction()); 536 537 Register ScratchRsrcReg = MFI->getScratchRSrcReg(); 538 539 if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) && 540 allStackObjectsAreDead(MF.getFrameInfo()))) 541 return Register(); 542 543 if (ST.hasSGPRInitBug() || 544 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF)) 545 return ScratchRsrcReg; 546 547 // We reserved the last registers for this. Shift it down to the end of those 548 // which were actually used. 549 // 550 // FIXME: It might be safer to use a pseudoregister before replacement. 551 552 // FIXME: We should be able to eliminate unused input registers. We only 553 // cannot do this for the resources required for scratch access. For now we 554 // skip over user SGPRs and may leave unused holes. 555 556 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; 557 ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF); 558 AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded)); 559 560 // Skip the last N reserved elements because they should have already been 561 // reserved for VCC etc. 562 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 563 for (MCPhysReg Reg : AllSGPR128s) { 564 // Pick the first unallocated one. Make sure we don't clobber the other 565 // reserved input we needed. Also for PAL, make sure we don't clobber 566 // the GIT pointer passed in SGPR0 or SGPR8. 567 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && 568 !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { 569 MRI.replaceRegWith(ScratchRsrcReg, Reg); 570 MFI->setScratchRSrcReg(Reg); 571 return Reg; 572 } 573 } 574 575 return ScratchRsrcReg; 576 } 577 578 static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { 579 return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize(); 580 } 581 582 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, 583 MachineBasicBlock &MBB) const { 584 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); 585 586 // FIXME: If we only have SGPR spills, we won't actually be using scratch 587 // memory since these spill to VGPRs. We should be cleaning up these unused 588 // SGPR spill frame indices somewhere. 589 590 // FIXME: We still have implicit uses on SGPR spill instructions in case they 591 // need to spill to vector memory. It's likely that will not happen, but at 592 // this point it appears we need the setup. This part of the prolog should be 593 // emitted after frame indices are eliminated. 
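  // At a high level, the code below emits the following entry prologue (a
  // sketch; every piece is conditional on how the function uses scratch):
  //   s_mov_b32 s32, <stack size scaled by getScratchScaleFactor()>  ; SP, if needed
  //   s_mov_b32 <FP, normally s33>, 0                                ; if hasFP()
  //   <flat scratch initialization>                                  ; if FLAT_SCR is needed
  //   <scratch RSRC setup + wave offset add>                         ; if an SRSRC is in use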
594 595 // FIXME: Remove all of the isPhysRegUsed checks 596 597 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 598 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 599 const SIInstrInfo *TII = ST.getInstrInfo(); 600 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 601 MachineRegisterInfo &MRI = MF.getRegInfo(); 602 const Function &F = MF.getFunction(); 603 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 604 605 assert(MFI->isEntryFunction()); 606 607 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( 608 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); 609 610 // We need to do the replacement of the private segment buffer register even 611 // if there are no stack objects. There could be stores to undef or a 612 // constant without an associated object. 613 // 614 // This will return `Register()` in cases where there are no actual 615 // uses of the SRSRC. 616 Register ScratchRsrcReg; 617 if (!ST.enableFlatScratch()) 618 ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); 619 620 // Make the selected register live throughout the function. 621 if (ScratchRsrcReg) { 622 for (MachineBasicBlock &OtherBB : MF) { 623 if (&OtherBB != &MBB) { 624 OtherBB.addLiveIn(ScratchRsrcReg); 625 } 626 } 627 } 628 629 // Now that we have fixed the reserved SRSRC we need to locate the 630 // (potentially) preloaded SRSRC. 631 Register PreloadedScratchRsrcReg; 632 if (ST.isAmdHsaOrMesa(F)) { 633 PreloadedScratchRsrcReg = 634 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); 635 if (ScratchRsrcReg && PreloadedScratchRsrcReg) { 636 // We added live-ins during argument lowering, but since they were not 637 // used they were deleted. We're adding the uses now, so add them back. 638 MRI.addLiveIn(PreloadedScratchRsrcReg); 639 MBB.addLiveIn(PreloadedScratchRsrcReg); 640 } 641 } 642 643 // Debug location must be unknown since the first debug location is used to 644 // determine the end of the prologue. 645 DebugLoc DL; 646 MachineBasicBlock::iterator I = MBB.begin(); 647 648 // We found the SRSRC first because it needs four registers and has an 649 // alignment requirement. If the SRSRC that we found is clobbering with 650 // the scratch wave offset, which may be in a fixed SGPR or a free SGPR 651 // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch 652 // wave offset to a free SGPR. 
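  // (Hypothetical example.) If the shifted-down SRSRC ended up as s[4:7] while
  // the scratch wave offset had been preloaded into s6, the loop below copies
  // s6 into the first free SGPR past the preloaded ones before s[4:7] is
  // overwritten by the RSRC setup.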
653 Register ScratchWaveOffsetReg; 654 if (PreloadedScratchWaveOffsetReg && 655 TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) { 656 ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF); 657 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); 658 AllSGPRs = AllSGPRs.slice( 659 std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded)); 660 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 661 for (MCPhysReg Reg : AllSGPRs) { 662 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && 663 !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) { 664 ScratchWaveOffsetReg = Reg; 665 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) 666 .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); 667 break; 668 } 669 } 670 } else { 671 ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg; 672 } 673 assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg); 674 675 if (requiresStackPointerReference(MF)) { 676 Register SPReg = MFI->getStackPtrOffsetReg(); 677 assert(SPReg != AMDGPU::SP_REG); 678 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) 679 .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST)); 680 } 681 682 if (hasFP(MF)) { 683 Register FPReg = MFI->getFrameOffsetReg(); 684 assert(FPReg != AMDGPU::FP_REG); 685 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); 686 } 687 688 bool NeedsFlatScratchInit = 689 MFI->hasFlatScratchInit() && 690 (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || 691 (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); 692 693 if ((NeedsFlatScratchInit || ScratchRsrcReg) && 694 PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) { 695 MRI.addLiveIn(PreloadedScratchWaveOffsetReg); 696 MBB.addLiveIn(PreloadedScratchWaveOffsetReg); 697 } 698 699 if (NeedsFlatScratchInit) { 700 emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); 701 } 702 703 if (ScratchRsrcReg) { 704 emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, 705 PreloadedScratchRsrcReg, 706 ScratchRsrcReg, ScratchWaveOffsetReg); 707 } 708 } 709 710 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` 711 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( 712 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 713 const DebugLoc &DL, Register PreloadedScratchRsrcReg, 714 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { 715 716 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 717 const SIInstrInfo *TII = ST.getInstrInfo(); 718 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 719 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 720 const Function &Fn = MF.getFunction(); 721 722 if (ST.isAmdPalOS()) { 723 // The pointer to the GIT is formed from the offset passed in and either 724 // the amdgpu-git-ptr-high function attribute or the top part of the PC 725 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); 726 Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); 727 728 buildGitPtr(MBB, I, DL, TII, Rsrc01); 729 730 // We now have the GIT ptr - now get the scratch descriptor from the entry 731 // at offset 0 (or offset 16 for a compute shader). 
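    // (Sketch of the sequence emitted below; register numbers are illustrative.)
    //   s_getpc_b64     s[rsrc0:rsrc1]   ; or s_mov_b32 of the amdgpu-git-ptr-high value
    //   s_mov_b32       s[rsrc0], <GIT pointer low half>
    //   s_load_dwordx4  s[rsrc0:rsrc3], s[rsrc0:rsrc1], <0 or 16 bytes, encoded per subtarget>
    //   s_bitset0_b32   s[rsrc3], 21     ; wave32 only: const_index_stride 64 -> 32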
732 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 733 const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM); 734 auto MMO = MF.getMachineMemOperand(PtrInfo, 735 MachineMemOperand::MOLoad | 736 MachineMemOperand::MOInvariant | 737 MachineMemOperand::MODereferenceable, 738 16, Align(4)); 739 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; 740 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); 741 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); 742 BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) 743 .addReg(Rsrc01) 744 .addImm(EncodedOffset) // offset 745 .addImm(0) // cpol 746 .addReg(ScratchRsrcReg, RegState::ImplicitDefine) 747 .addMemOperand(MMO); 748 749 // The driver will always set the SRD for wave 64 (bits 118:117 of 750 // descriptor / bits 22:21 of third sub-reg will be 0b11) 751 // If the shader is actually wave32 we have to modify the const_index_stride 752 // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The 753 // reason the driver does this is that there can be cases where it presents 754 // 2 shaders with different wave size (e.g. VsFs). 755 // TODO: convert to using SCRATCH instructions or multiple SRD buffers 756 if (ST.isWave32()) { 757 const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32); 758 BuildMI(MBB, I, DL, SBitsetB32, Rsrc03) 759 .addImm(21) 760 .addReg(Rsrc03); 761 } 762 } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) { 763 assert(!ST.isAmdHsaOrMesa(Fn)); 764 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); 765 766 Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); 767 Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); 768 769 // Use relocations to get the pointer, and setup the other bits manually. 
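    // (Sketch of this branch; rsrc0..rsrc3 stand for the sub-registers of
    // ScratchRsrcReg.)
    //   s_mov_b32 rsrc0, SCRATCH_RSRC_DWORD0   ; resolved by the loader via relocation
    //   s_mov_b32 rsrc1, SCRATCH_RSRC_DWORD1
    //   s_mov_b32 rsrc2, <Rsrc23 low word>     ; NUM_RECORDS
    //   s_mov_b32 rsrc3, <Rsrc23 high word>    ; descriptor flags
    // When an implicit buffer pointer is available, rsrc0/rsrc1 are instead
    // copied from it (compute) or loaded through it (otherwise), as below.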
770 uint64_t Rsrc23 = TII->getScratchRsrcWords23(); 771 772 if (MFI->hasImplicitBufferPtr()) { 773 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); 774 775 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { 776 const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); 777 778 BuildMI(MBB, I, DL, Mov64, Rsrc01) 779 .addReg(MFI->getImplicitBufferPtrUserSGPR()) 780 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 781 } else { 782 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); 783 784 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 785 auto MMO = MF.getMachineMemOperand( 786 PtrInfo, 787 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | 788 MachineMemOperand::MODereferenceable, 789 8, Align(4)); 790 BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) 791 .addReg(MFI->getImplicitBufferPtrUserSGPR()) 792 .addImm(0) // offset 793 .addImm(0) // cpol 794 .addMemOperand(MMO) 795 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 796 797 MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); 798 MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); 799 } 800 } else { 801 Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); 802 Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); 803 804 BuildMI(MBB, I, DL, SMovB32, Rsrc0) 805 .addExternalSymbol("SCRATCH_RSRC_DWORD0") 806 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 807 808 BuildMI(MBB, I, DL, SMovB32, Rsrc1) 809 .addExternalSymbol("SCRATCH_RSRC_DWORD1") 810 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 811 812 } 813 814 BuildMI(MBB, I, DL, SMovB32, Rsrc2) 815 .addImm(Rsrc23 & 0xffffffff) 816 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 817 818 BuildMI(MBB, I, DL, SMovB32, Rsrc3) 819 .addImm(Rsrc23 >> 32) 820 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 821 } else if (ST.isAmdHsaOrMesa(Fn)) { 822 assert(PreloadedScratchRsrcReg); 823 824 if (ScratchRsrcReg != PreloadedScratchRsrcReg) { 825 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) 826 .addReg(PreloadedScratchRsrcReg, RegState::Kill); 827 } 828 } 829 830 // Add the scratch wave offset into the scratch RSRC. 831 // 832 // We only want to update the first 48 bits, which is the base address 833 // pointer, without touching the adjacent 16 bits of flags. We know this add 834 // cannot carry-out from bit 47, otherwise the scratch allocation would be 835 // impossible to fit in the 48-bit global address space. 836 // 837 // TODO: Evaluate if it is better to just construct an SRD using the flat 838 // scratch init and some constants rather than update the one we are passed. 839 Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); 840 Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); 841 842 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in 843 // the kernel body via inreg arguments. 844 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0) 845 .addReg(ScratchRsrcSub0) 846 .addReg(ScratchWaveOffsetReg) 847 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 848 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1) 849 .addReg(ScratchRsrcSub1) 850 .addImm(0) 851 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 852 Addc->getOperand(3).setIsDead(); // Mark SCC as dead. 
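  // (Illustrative.) The add pair above only changes the 48-bit base address
  // held in rsrc0/rsrc1:
  //   s_add_u32  rsrc0, rsrc0, <wave offset>   ; low 32 bits, carry-out goes to SCC
  //   s_addc_u32 rsrc1, rsrc1, 0               ; fold the carry into bits 47:32
  // The stride/flag bits above bit 47 are untouched because a valid scratch
  // allocation cannot carry out of the 48-bit address.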
853 } 854 855 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { 856 switch (ID) { 857 case TargetStackID::Default: 858 case TargetStackID::NoAlloc: 859 case TargetStackID::SGPRSpill: 860 return true; 861 case TargetStackID::ScalableVector: 862 case TargetStackID::WasmLocal: 863 return false; 864 } 865 llvm_unreachable("Invalid TargetStackID::Value"); 866 } 867 868 // Activate only the inactive lanes when \p EnableInactiveLanes is true. 869 // Otherwise, activate all lanes. It returns the saved exec. 870 static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, 871 MachineFunction &MF, 872 MachineBasicBlock &MBB, 873 MachineBasicBlock::iterator MBBI, 874 const DebugLoc &DL, bool IsProlog, 875 bool EnableInactiveLanes) { 876 Register ScratchExecCopy; 877 MachineRegisterInfo &MRI = MF.getRegInfo(); 878 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 879 const SIInstrInfo *TII = ST.getInstrInfo(); 880 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 881 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 882 883 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); 884 885 ScratchExecCopy = findScratchNonCalleeSaveRegister( 886 MRI, LiveRegs, *TRI.getWaveMaskRegClass()); 887 if (!ScratchExecCopy) 888 report_fatal_error("failed to find free scratch register"); 889 890 LiveRegs.addReg(ScratchExecCopy); 891 892 const unsigned SaveExecOpc = 893 ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32 894 : AMDGPU::S_OR_SAVEEXEC_B32) 895 : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64 896 : AMDGPU::S_OR_SAVEEXEC_B64); 897 auto SaveExec = 898 BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1); 899 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. 900 901 return ScratchExecCopy; 902 } 903 904 void SIFrameLowering::emitCSRSpillStores( 905 MachineFunction &MF, MachineBasicBlock &MBB, 906 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, 907 Register FrameReg, Register FramePtrRegScratchCopy) const { 908 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 909 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 910 const SIInstrInfo *TII = ST.getInstrInfo(); 911 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 912 913 // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch 914 // registers. However, save all lanes of callee-saved VGPRs. Due to this, we 915 // might end up flipping the EXEC bits twice. 916 Register ScratchExecCopy; 917 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; 918 FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); 919 if (!WWMScratchRegs.empty()) 920 ScratchExecCopy = 921 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, 922 /*IsProlog*/ true, /*EnableInactiveLanes*/ true); 923 924 auto StoreWWMRegisters = 925 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { 926 for (const auto &Reg : WWMRegs) { 927 Register VGPR = Reg.first; 928 int FI = Reg.second; 929 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, 930 VGPR, FI, FrameReg); 931 } 932 }; 933 934 StoreWWMRegisters(WWMScratchRegs); 935 if (!WWMCalleeSavedRegs.empty()) { 936 if (ScratchExecCopy) { 937 unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 938 MCRegister Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; 939 BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); 940 } else { 941 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, 942 /*IsProlog*/ true, 943 /*EnableInactiveLanes*/ false); 944 } 945 } 946 947 StoreWWMRegisters(WWMCalleeSavedRegs); 948 if (ScratchExecCopy) { 949 // FIXME: Split block and make terminator. 950 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 951 MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 952 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) 953 .addReg(ScratchExecCopy, RegState::Kill); 954 LiveRegs.addReg(ScratchExecCopy); 955 } 956 957 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 958 959 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { 960 // Special handle FP spill: 961 // Skip if FP is saved to a scratch SGPR, the save has already been emitted. 962 // Otherwise, FP has been moved to a temporary register and spill it 963 // instead. 964 Register Reg = 965 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first; 966 if (!Reg) 967 continue; 968 969 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, 970 LiveRegs, FrameReg); 971 SB.save(); 972 } 973 974 // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make 975 // such scratch registers live throughout the function. 976 SmallVector<Register, 1> ScratchSGPRs; 977 FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs); 978 if (!ScratchSGPRs.empty()) { 979 for (MachineBasicBlock &MBB : MF) { 980 for (MCPhysReg Reg : ScratchSGPRs) 981 MBB.addLiveIn(Reg); 982 983 MBB.sortUniqueLiveIns(); 984 } 985 if (!LiveRegs.empty()) { 986 for (MCPhysReg Reg : ScratchSGPRs) 987 LiveRegs.addReg(Reg); 988 } 989 } 990 } 991 992 void SIFrameLowering::emitCSRSpillRestores( 993 MachineFunction &MF, MachineBasicBlock &MBB, 994 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, 995 Register FrameReg, Register FramePtrRegScratchCopy) const { 996 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 997 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 998 const SIInstrInfo *TII = ST.getInstrInfo(); 999 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 1000 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1001 1002 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { 1003 // Special handle FP restore: 1004 // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore 1005 // the FP value to a temporary register. The frame pointer should be 1006 // overwritten only at the end when all other spills are restored from 1007 // current frame. 1008 Register Reg = 1009 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first; 1010 if (!Reg) 1011 continue; 1012 1013 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, 1014 LiveRegs, FrameReg); 1015 SB.restore(); 1016 } 1017 1018 // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the 1019 // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to 1020 // this, we might end up flipping the EXEC bits twice. 
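  // (Sketch, wave64 encodings; wave32 uses the *_B32 forms.) The exec juggling
  // produced below looks roughly like:
  //   s_xor_saveexec_b64 s[t:t+1], -1   ; run only on the inactive lanes
  //   <reload WWM scratch VGPRs>
  //   s_mov_b64 exec, -1                ; enable all lanes for callee-saved VGPRs
  //   <reload WWM callee-saved VGPRs>
  //   s_mov_b64 exec, s[t:t+1]          ; restore the original exec mask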
1021 Register ScratchExecCopy; 1022 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; 1023 FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); 1024 if (!WWMScratchRegs.empty()) 1025 ScratchExecCopy = 1026 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, 1027 /*IsProlog*/ false, /*EnableInactiveLanes*/ true); 1028 1029 auto RestoreWWMRegisters = 1030 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { 1031 for (const auto &Reg : WWMRegs) { 1032 Register VGPR = Reg.first; 1033 int FI = Reg.second; 1034 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, 1035 VGPR, FI, FrameReg); 1036 } 1037 }; 1038 1039 RestoreWWMRegisters(WWMScratchRegs); 1040 if (!WWMCalleeSavedRegs.empty()) { 1041 if (ScratchExecCopy) { 1042 unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 1043 MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 1044 BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); 1045 } else { 1046 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, 1047 /*IsProlog*/ false, 1048 /*EnableInactiveLanes*/ false); 1049 } 1050 } 1051 1052 RestoreWWMRegisters(WWMCalleeSavedRegs); 1053 if (ScratchExecCopy) { 1054 // FIXME: Split block and make terminator. 1055 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 1056 MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 1057 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) 1058 .addReg(ScratchExecCopy, RegState::Kill); 1059 } 1060 } 1061 1062 void SIFrameLowering::emitPrologue(MachineFunction &MF, 1063 MachineBasicBlock &MBB) const { 1064 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1065 if (FuncInfo->isEntryFunction()) { 1066 emitEntryFunctionPrologue(MF, MBB); 1067 return; 1068 } 1069 1070 MachineFrameInfo &MFI = MF.getFrameInfo(); 1071 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1072 const SIInstrInfo *TII = ST.getInstrInfo(); 1073 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 1074 MachineRegisterInfo &MRI = MF.getRegInfo(); 1075 1076 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 1077 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1078 Register BasePtrReg = 1079 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); 1080 LivePhysRegs LiveRegs; 1081 1082 MachineBasicBlock::iterator MBBI = MBB.begin(); 1083 // DebugLoc must be unknown since the first instruction with DebugLoc is used 1084 // to determine the end of the prologue. 1085 DebugLoc DL; 1086 1087 bool HasFP = false; 1088 bool HasBP = false; 1089 uint32_t NumBytes = MFI.getStackSize(); 1090 uint32_t RoundedSize = NumBytes; 1091 1092 if (TRI.hasStackRealignment(MF)) 1093 HasFP = true; 1094 1095 Register FramePtrRegScratchCopy; 1096 if (!HasFP && !hasFP(MF)) { 1097 // Emit the CSR spill stores with SP base register. 1098 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg, 1099 FramePtrRegScratchCopy); 1100 } else { 1101 // CSR spill stores will use FP as base register. 1102 Register SGPRForFPSaveRestoreCopy = 1103 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); 1104 1105 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); 1106 if (SGPRForFPSaveRestoreCopy) { 1107 // Copy FP to the scratch register now and emit the CFI entry. It avoids 1108 // the extra FP copy needed in the other two cases when FP is spilled to 1109 // memory or to a VGPR lane. 
1110 PrologEpilogSGPRSpillBuilder SB( 1111 FramePtrReg, 1112 FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI, 1113 DL, TII, TRI, LiveRegs, FramePtrReg); 1114 SB.save(); 1115 LiveRegs.addReg(SGPRForFPSaveRestoreCopy); 1116 } else { 1117 // Copy FP into a new scratch register so that its previous value can be 1118 // spilled after setting up the new frame. 1119 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( 1120 MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass); 1121 if (!FramePtrRegScratchCopy) 1122 report_fatal_error("failed to find free scratch register"); 1123 1124 LiveRegs.addReg(FramePtrRegScratchCopy); 1125 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy) 1126 .addReg(FramePtrReg); 1127 } 1128 } 1129 1130 if (HasFP) { 1131 const unsigned Alignment = MFI.getMaxAlign().value(); 1132 1133 RoundedSize += Alignment; 1134 if (LiveRegs.empty()) { 1135 LiveRegs.init(TRI); 1136 LiveRegs.addLiveIns(MBB); 1137 } 1138 1139 // s_add_i32 s33, s32, NumBytes 1140 // s_and_b32 s33, s33, 0b111...0000 1141 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg) 1142 .addReg(StackPtrReg) 1143 .addImm((Alignment - 1) * getScratchScaleFactor(ST)) 1144 .setMIFlag(MachineInstr::FrameSetup); 1145 auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) 1146 .addReg(FramePtrReg, RegState::Kill) 1147 .addImm(-Alignment * getScratchScaleFactor(ST)) 1148 .setMIFlag(MachineInstr::FrameSetup); 1149 And->getOperand(3).setIsDead(); // Mark SCC as dead. 1150 FuncInfo->setIsStackRealigned(true); 1151 } else if ((HasFP = hasFP(MF))) { 1152 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 1153 .addReg(StackPtrReg) 1154 .setMIFlag(MachineInstr::FrameSetup); 1155 } 1156 1157 // If FP is used, emit the CSR spills with FP base register. 1158 if (HasFP) { 1159 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg, 1160 FramePtrRegScratchCopy); 1161 if (FramePtrRegScratchCopy) 1162 LiveRegs.removeReg(FramePtrRegScratchCopy); 1163 } 1164 1165 // If we need a base pointer, set it up here. It's whatever the value of 1166 // the stack pointer is at this point. Any variable size objects will be 1167 // allocated after this, so we can still use the base pointer to reference 1168 // the incoming arguments. 1169 if ((HasBP = TRI.hasBasePointer(MF))) { 1170 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) 1171 .addReg(StackPtrReg) 1172 .setMIFlag(MachineInstr::FrameSetup); 1173 } 1174 1175 if (HasFP && RoundedSize != 0) { 1176 auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) 1177 .addReg(StackPtrReg) 1178 .addImm(RoundedSize * getScratchScaleFactor(ST)) 1179 .setMIFlag(MachineInstr::FrameSetup); 1180 Add->getOperand(3).setIsDead(); // Mark SCC as dead. 1181 } 1182 1183 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg); 1184 (void)FPSaved; 1185 assert((!HasFP || FPSaved) && 1186 "Needed to save FP but didn't save it anywhere"); 1187 1188 // If we allow spilling to AGPRs we may have saved FP but then spill 1189 // everything into AGPRs instead of the stack. 
1190 assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) && 1191 "Saved FP but didn't need it"); 1192 1193 bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg); 1194 (void)BPSaved; 1195 assert((!HasBP || BPSaved) && 1196 "Needed to save BP but didn't save it anywhere"); 1197 1198 assert((HasBP || !BPSaved) && "Saved BP but didn't need it"); 1199 } 1200 1201 void SIFrameLowering::emitEpilogue(MachineFunction &MF, 1202 MachineBasicBlock &MBB) const { 1203 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1204 if (FuncInfo->isEntryFunction()) 1205 return; 1206 1207 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1208 const SIInstrInfo *TII = ST.getInstrInfo(); 1209 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 1210 MachineRegisterInfo &MRI = MF.getRegInfo(); 1211 LivePhysRegs LiveRegs; 1212 // Get the insert location for the epilogue. If there were no terminators in 1213 // the block, get the last instruction. 1214 MachineBasicBlock::iterator MBBI = MBB.end(); 1215 DebugLoc DL; 1216 if (!MBB.empty()) { 1217 MBBI = MBB.getLastNonDebugInstr(); 1218 if (MBBI != MBB.end()) 1219 DL = MBBI->getDebugLoc(); 1220 1221 MBBI = MBB.getFirstTerminator(); 1222 } 1223 1224 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1225 uint32_t NumBytes = MFI.getStackSize(); 1226 uint32_t RoundedSize = FuncInfo->isStackRealigned() 1227 ? NumBytes + MFI.getMaxAlign().value() 1228 : NumBytes; 1229 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 1230 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1231 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg); 1232 1233 Register FramePtrRegScratchCopy; 1234 Register SGPRForFPSaveRestoreCopy = 1235 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); 1236 if (FPSaved) { 1237 // CSR spill restores should use FP as base register. If 1238 // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP 1239 // into a new scratch register and copy to FP later when other registers are 1240 // restored from the current stack frame. 1241 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); 1242 if (SGPRForFPSaveRestoreCopy) { 1243 LiveRegs.addReg(SGPRForFPSaveRestoreCopy); 1244 } else { 1245 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( 1246 MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass); 1247 if (!FramePtrRegScratchCopy) 1248 report_fatal_error("failed to find free scratch register"); 1249 1250 LiveRegs.addReg(FramePtrRegScratchCopy); 1251 } 1252 1253 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg, 1254 FramePtrRegScratchCopy); 1255 } 1256 1257 if (RoundedSize != 0 && hasFP(MF)) { 1258 auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) 1259 .addReg(StackPtrReg) 1260 .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST))) 1261 .setMIFlag(MachineInstr::FrameDestroy); 1262 Add->getOperand(3).setIsDead(); // Mark SCC as dead. 1263 } 1264 1265 if (FPSaved) { 1266 // Insert the copy to restore FP. 1267 Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy 1268 : FramePtrRegScratchCopy; 1269 MachineInstrBuilder MIB = 1270 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 1271 .addReg(SrcReg); 1272 if (SGPRForFPSaveRestoreCopy) 1273 MIB.setMIFlag(MachineInstr::FrameDestroy); 1274 } else { 1275 // Insert the CSR spill restores with SP as the base register. 
1276 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg, 1277 FramePtrRegScratchCopy); 1278 } 1279 } 1280 1281 #ifndef NDEBUG 1282 static bool allSGPRSpillsAreDead(const MachineFunction &MF) { 1283 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1284 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1285 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 1286 I != E; ++I) { 1287 if (!MFI.isDeadObjectIndex(I) && 1288 MFI.getStackID(I) == TargetStackID::SGPRSpill && 1289 !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) { 1290 return false; 1291 } 1292 } 1293 1294 return true; 1295 } 1296 #endif 1297 1298 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, 1299 int FI, 1300 Register &FrameReg) const { 1301 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); 1302 1303 FrameReg = RI->getFrameRegister(MF); 1304 return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI)); 1305 } 1306 1307 void SIFrameLowering::processFunctionBeforeFrameFinalized( 1308 MachineFunction &MF, 1309 RegScavenger *RS) const { 1310 MachineFrameInfo &MFI = MF.getFrameInfo(); 1311 1312 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1313 const SIInstrInfo *TII = ST.getInstrInfo(); 1314 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1315 MachineRegisterInfo &MRI = MF.getRegInfo(); 1316 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1317 1318 // Allocate spill slots for WWM reserved VGPRs. 1319 if (!FuncInfo->isEntryFunction()) { 1320 for (Register Reg : FuncInfo->getWWMReservedRegs()) { 1321 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); 1322 FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC), 1323 TRI->getSpillAlign(*RC)); 1324 } 1325 } 1326 1327 const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() 1328 && EnableSpillVGPRToAGPR; 1329 1330 if (SpillVGPRToAGPR) { 1331 // To track the spill frame indices handled in this pass. 1332 BitVector SpillFIs(MFI.getObjectIndexEnd(), false); 1333 BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false); 1334 1335 bool SeenDbgInstr = false; 1336 1337 for (MachineBasicBlock &MBB : MF) { 1338 for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { 1339 int FrameIndex; 1340 if (MI.isDebugInstr()) 1341 SeenDbgInstr = true; 1342 1343 if (TII->isVGPRSpill(MI)) { 1344 // Try to eliminate stack used by VGPR spills before frame 1345 // finalization. 1346 unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 1347 AMDGPU::OpName::vaddr); 1348 int FI = MI.getOperand(FIOp).getIndex(); 1349 Register VReg = 1350 TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); 1351 if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, 1352 TRI->isAGPR(MRI, VReg))) { 1353 // FIXME: change to enterBasicBlockEnd() 1354 RS->enterBasicBlock(MBB); 1355 TRI->eliminateFrameIndex(MI, 0, FIOp, RS); 1356 SpillFIs.set(FI); 1357 continue; 1358 } 1359 } else if (TII->isStoreToStackSlot(MI, FrameIndex) || 1360 TII->isLoadFromStackSlot(MI, FrameIndex)) 1361 if (!MFI.isFixedObjectIndex(FrameIndex)) 1362 NonVGPRSpillFIs.set(FrameIndex); 1363 } 1364 } 1365 1366 // Stack slot coloring may assign different objects to the same stack slot. 1367 // If not, then the VGPR to AGPR spill slot is dead. 
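    // (Illustrative example.) A frame index that only ever appeared in VGPR
    // spill instructions rewritten to AGPRs above sits in SpillFIs but not in
    // NonVGPRSpillFIs, so it is marked dead below and needs no scratch memory.
    // If stack slot coloring also placed an unrelated spill into the same
    // slot, the NonVGPRSpillFIs bit is set and the slot is kept.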
1368     for (unsigned FI : SpillFIs.set_bits())
1369       if (!NonVGPRSpillFIs.test(FI))
1370         FuncInfo->setVGPRToAGPRSpillDead(FI);
1371 
1372     for (MachineBasicBlock &MBB : MF) {
1373       for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
1374         MBB.addLiveIn(Reg);
1375 
1376       for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
1377         MBB.addLiveIn(Reg);
1378 
1379       MBB.sortUniqueLiveIns();
1380 
1381       if (!SpillFIs.empty() && SeenDbgInstr) {
1382         // FIXME: The dead frame indices are replaced with a null register in
1383         // the debug value instructions. We should instead update them with the
1384         // correct register value. But it is not clear that the register value alone would be sufficient.
1385         for (MachineInstr &MI : MBB) {
1386           if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
1387               !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
1388               SpillFIs[MI.getOperand(0).getIndex()]) {
1389             MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
1390           }
1391         }
1392       }
1393     }
1394   }
1395 
1396   // At this point we've already allocated all spilled SGPRs to VGPRs if we
1397   // can. Any remaining SGPR spills will go to memory, so move them back to the
1398   // default stack.
1399   bool HaveSGPRToVMemSpill =
1400       FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
1401   assert(allSGPRSpillsAreDead(MF) &&
1402          "SGPR spill should have been removed in SILowerSGPRSpills");
1403 
1404   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1405   // but currently hasNonSpillStackObjects is set only from source
1406   // allocas. Stack temps produced from legalization are not counted currently.
1407   if (!allStackObjectsAreDead(MFI)) {
1408     assert(RS && "RegScavenger required if spilling");
1409 
1410     // Add an emergency spill slot
1411     RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));
1412 
1413     // If we are spilling SGPRs to memory with a large frame, we may need a
1414     // second VGPR emergency frame index.
1415     if (HaveSGPRToVMemSpill &&
1416         allocateScavengingFrameIndexesNearIncomingSP(MF)) {
1417       RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
1418     }
1419   }
1420 }
1421 
1422 void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
1423     MachineFunction &MF, RegScavenger *RS) const {
1424   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1425   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1426   MachineRegisterInfo &MRI = MF.getRegInfo();
1427   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1428 
1429   if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
1430     // On gfx908, we initially reserved the highest available VGPR for the
1431     // AGPR copy. Now that register allocation is done, check whether there is
1432     // an unused VGPR that is lower than the one reserved earlier. If one
1433     // exists, use it for the AGPR copy instead of the register reserved before RA.
1434     Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
1435     Register UnusedLowVGPR =
1436         TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
1437     if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
1438                           TRI->getHWRegIndex(VGPRForAGPRCopy))) {
1439       // The call to setVGPRForAGPRCopy() must happen before
1440       // freezeReservedRegs() so that getReservedRegs() can reserve this newly
1441       // identified VGPR (for the AGPR copy).
1442       FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
1443       MRI.freezeReservedRegs(MF);
1444     }
1445   }
1446 }
1447 
1448 // The special SGPR spills, like the ones needed for FP, BP, or any reserved
1449 // registers, are delayed until frame lowering.
1450 void SIFrameLowering::determinePrologEpilogSGPRSaves(
1451     MachineFunction &MF, BitVector &SavedVGPRs) const {
1452   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1453   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1454   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1455   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1456   LivePhysRegs LiveRegs;
1457   LiveRegs.init(*TRI);
1458   // Initially mark callee saved registers as used so we will not choose them
1459   // while looking for scratch SGPRs.
1460   const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
1461   for (unsigned I = 0; CSRegs[I]; ++I)
1462     LiveRegs.addReg(CSRegs[I]);
1463 
1464   // hasFP only knows about stack objects that already exist. We're now
1465   // determining the stack slots that will be created, so we have to predict
1466   // them. Stack objects force FP usage with calls.
1467   //
1468   // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1469   // don't want to report it here.
1470   //
1471   // FIXME: Is this really hasReservedCallFrame?
1472   const bool WillHaveFP =
1473       FrameInfo.hasCalls() &&
1474       (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
1475 
1476   if (WillHaveFP || hasFP(MF)) {
1477     Register FramePtrReg = MFI->getFrameOffsetReg();
1478     assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
1479            "Re-reserving spill slot for FP");
1480     getVGPRSpillLaneOrTempRegister(MF, LiveRegs, FramePtrReg);
1481   }
1482 
1483   if (TRI->hasBasePointer(MF)) {
1484     Register BasePtrReg = TRI->getBaseRegister();
1485     assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
1486            "Re-reserving spill slot for BP");
1487     getVGPRSpillLaneOrTempRegister(MF, LiveRegs, BasePtrReg);
1488   }
1489 }
1490 
1491 // Only report VGPRs to generic code.
1492 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1493                                            BitVector &SavedVGPRs,
1494                                            RegScavenger *RS) const {
1495   TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
1496   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1497   if (MFI->isEntryFunction())
1498     return;
1499 
1500   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1501   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1502 
1503   for (MachineBasicBlock &MBB : MF) {
1504     for (MachineInstr &MI : MBB) {
1505       // WRITELANE instructions used for SGPR spills can overwrite the inactive
1506       // lanes of VGPRs, and the callee must spill and restore them even if
1507       // they are marked caller-saved.
1508 
1509       // TODO: Handle this elsewhere at an early point. Walking through all
1510       // MBBs here would be a bad heuristic. A better way would be to call
1511       // allocateWWMSpill during the regalloc pipeline whenever a physical
1512       // register is allocated for the intended virtual registers. That would
1513       // also help exclude the general use of WRITELANE/READLANE intrinsics
1514       // that do not really need any such special handling.
1515       if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32)
1516         MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
1517       else if (MI.getOpcode() == AMDGPU::V_READLANE_B32)
1518         MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
1519     }
1520   }
1521 
1522   // Ignore the SGPRs the default implementation found.
1523   SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
1524 
1525   // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
1526   // On gfx908 there are no AGPR loads and stores, so spilling an AGPR also
1527   // requires a temporary VGPR.
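  // (Illustrative, not emitted here.) On gfx908 an AGPR can only reach memory
  // through a VGPR, so a one-dword AGPR spill expands to roughly:
  //   v_accvgpr_read_b32  v<tmp>, a<n>              ; AGPR -> temporary VGPR
  //   buffer_store_dword  v<tmp>, off, s[rsrc], s32 ; VGPR -> scratch
  // gfx90a and later can load/store AGPRs directly, so they may remain in the
  // callee-saved set computed above.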
  if (!ST.hasGFX90AInsts())
    SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());

  determinePrologEpilogSGPRSaves(MF, SavedVGPRs);

  // The whole-wave VGPRs need to be specially inserted in the prolog, so don't
  // allow the default insertion to handle them.
  for (auto &Reg : MFI->getWWMSpills())
    SavedVGPRs.reset(Reg.first);

  // Mark all lane VGPRs as BB LiveIns.
  for (MachineBasicBlock &MBB : MF) {
    for (auto &Reg : MFI->getWWMSpills())
      MBB.addLiveIn(Reg.first);

    MBB.sortUniqueLiveIns();
  }
}

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specially managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());

  const BitVector AllSavedRegs = SavedRegs;
  SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

  // We have to anticipate introducing CSR VGPR spills, or spills of the
  // caller-saved VGPR reserved for SGPR spills, because we now always create a
  // stack entry for them even when there are no other stack objects: an FP is
  // required whenever there is both a call and a stack. A VGPR for SGPR spills
  // is allocated whenever there are any SGPR spills, whether they are CSR
  // spills or not.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const bool WillHaveFP =
      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

  // FP will be specially managed like SP.
  if (WillHaveFP || hasFP(MF))
    SavedRegs.reset(MFI->getFrameOffsetReg());

  // Return address use with the return instruction is hidden through the
  // SI_RETURN pseudo. Given that, and since IPRA computes actual register
  // usage and does not use the CSR list, the clobbering of the return address
  // by function calls (D117243) or otherwise (D120922) is not seen by IPRA's
  // register usage collection. Marking it saved here ensures the return
  // address is saved and restored in those scenarios.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  Register RetAddrReg = TRI->getReturnAddressReg(MF);
  if (!MFI->isEntryFunction() &&
      (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
  }
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!
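
  // When the FP and/or BP save was lowered to a copy into a scratch SGPR, the
  // loop below redirects the corresponding CalleeSavedInfo entry to that
  // register so the generic spill insertion does not also store it to a stack
  // slot.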
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg = RI->getBaseRegister();
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  Register SGPRForBPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
  if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
    return false;

  unsigned NumModifiedRegs = 0;

  if (SGPRForFPSaveRestoreCopy)
    NumModifiedRegs++;
  if (SGPRForBPSaveRestoreCopy)
    NumModifiedRegs++;

  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(SGPRForFPSaveRestoreCopy);
      if (--NumModifiedRegs == 0)
        break;
    } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs == 0)
        break;
    }
  }

  return false;
}

bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
    const MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint64_t EstStackSize = MFI.estimateStackSize(MF);
  uint64_t MaxOffset = EstStackSize - 1;

  // We need the emergency stack slots to be allocated in range of the
  // MUBUF/flat scratch immediate offset from the base register, so assign
  // these first at the incoming SP position.
  //
  // TODO: We could try sorting the objects to find a hole in the first bytes
  // rather than allocating as close as possible. This could save a lot of
  // space on frames with alignment requirements.
  if (ST.enableFlatScratch()) {
    const SIInstrInfo *TII = ST.getInstrInfo();
    if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                               SIInstrFlags::FlatScratch))
      return false;
  } else {
    if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset))
      return false;
  }

  return true;
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF,
    MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

    Amount *= getScratchScaleFactor(ST);
    if (IsDestroy)
      Amount = -Amount;
    auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
                   .addReg(SPReg)
                   .addImm(Amount);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
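    // Note the SP value is in swizzled units here: assuming
    // getScratchScaleFactor() returns the wavefront size when flat scratch is
    // disabled (and 1 otherwise), a 16-byte call frame adjustment on a wave64
    // target would become "s_add_i32 s32, s32, 0x400" above.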
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

/// Returns true if the frame will require a reference to the stack pointer.
///
/// This is the set of conditions common to setting up the stack pointer in a
/// kernel, and for using a frame pointer in a callable function.
///
/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
/// references SP.
static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
  return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
}

// The FP for kernels is always known to be 0, so we never really need to set
// up an explicit register for it. However, DisableFramePointerElim will force
// us to use a register for it.
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry functions we can use an immediate offset in most cases, so the
  // presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    // All offsets are unsigned, so need to be addressed in the same direction
    // as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
         MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
             MF) ||
         MF.getTarget().Options.DisableFramePointerElim(MF);
}

// This is essentially a reduced version of hasFP for entry functions. Since
// the stack pointer is known to be 0 on entry to kernels, we never really need
// an FP register. We may need to initialize the stack pointer depending on the
// frame properties, which logically overlaps many of the cases where an
// ordinary function would require an FP.
bool SIFrameLowering::requiresStackPointerReference(
    const MachineFunction &MF) const {
  // Callable functions always require a stack pointer reference.
  assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
         "only expected to call this for entry points");

  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Entry points ordinarily don't need to initialize SP. We have to set it up
  // for callees if there are any. Also note tail calls are impossible/don't
  // make any sense for kernels.
  if (MFI.hasCalls())
    return true;

  // We still need to initialize the SP if we're doing anything weird that
  // references the SP, like variable sized stack objects.
  return frameTriviallyRequiresSP(MFI);
}