1 //===----------------------- SIFrameLowering.cpp --------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 9 #include "SIFrameLowering.h" 10 #include "AMDGPU.h" 11 #include "GCNSubtarget.h" 12 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 13 #include "SIMachineFunctionInfo.h" 14 #include "llvm/CodeGen/LivePhysRegs.h" 15 #include "llvm/CodeGen/MachineFrameInfo.h" 16 #include "llvm/CodeGen/RegisterScavenging.h" 17 #include "llvm/Target/TargetMachine.h" 18 19 using namespace llvm; 20 21 #define DEBUG_TYPE "frame-info" 22 23 static cl::opt<bool> EnableSpillVGPRToAGPR( 24 "amdgpu-spill-vgpr-to-agpr", 25 cl::desc("Enable spilling VGPRs to AGPRs"), 26 cl::ReallyHidden, 27 cl::init(true)); 28 29 // Find a register matching \p RC from \p LiveRegs which is unused and available 30 // throughout the function. On failure, returns AMDGPU::NoRegister. 31 static MCRegister findUnusedRegister(MachineRegisterInfo &MRI, 32 const LivePhysRegs &LiveRegs, 33 const TargetRegisterClass &RC) { 34 for (MCRegister Reg : RC) { 35 if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg)) 36 return Reg; 37 } 38 return MCRegister(); 39 } 40 41 // Find a scratch register that we can use in the prologue. We avoid using 42 // callee-save registers since they may appear to be free when this is called 43 // from canUseAsPrologue (during shrink wrapping), but then no longer be free 44 // when this is called from emitPrologue. 45 static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, 46 LivePhysRegs &LiveRegs, 47 const TargetRegisterClass &RC, 48 bool Unused = false) { 49 // Mark callee saved registers as used so we will not choose them. 50 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); 51 for (unsigned i = 0; CSRegs[i]; ++i) 52 LiveRegs.addReg(CSRegs[i]); 53 54 // We are looking for a register that can be used throughout the entire 55 // function, so any use is unacceptable. 56 if (Unused) 57 return findUnusedRegister(MRI, LiveRegs, RC); 58 59 for (MCRegister Reg : RC) { 60 if (LiveRegs.available(MRI, Reg)) 61 return Reg; 62 } 63 64 return MCRegister(); 65 } 66 67 /// Query target location for spilling SGPRs 68 /// \p IncludeScratchCopy : Also look for free scratch SGPRs 69 static void getVGPRSpillLaneOrTempRegister( 70 MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR, 71 const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass, 72 bool IncludeScratchCopy = true) { 73 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 74 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 75 76 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 77 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 78 unsigned Size = TRI->getSpillSize(RC); 79 Align Alignment = TRI->getSpillAlign(RC); 80 81 // We need to save and restore the given SGPR. 82 83 Register ScratchSGPR; 84 // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs 85 // should have all the callee saved registers marked as used. For certain 86 // cases we skip copy to scratch SGPR. 
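  // Whichever of the three strategies below succeeds (copy to a scratch SGPR,
  // spill to a VGPR lane, or spill to memory) is only recorded here as a
  // PrologEpilogSGPRSaveRestoreInfo entry; PrologEpilogSGPRSpillBuilder
  // expands it into actual instructions once the frame is finalized.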
87 if (IncludeScratchCopy) 88 ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); 89 90 if (!ScratchSGPR) { 91 int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr, 92 TargetStackID::SGPRSpill); 93 94 if (TRI->spillSGPRToVGPR() && 95 MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) { 96 // 2: There's no free lane to spill, and no free register to save the 97 // SGPR, so we're forced to take another VGPR to use for the spill. 98 MFI->addToPrologEpilogSGPRSpills( 99 SGPR, PrologEpilogSGPRSaveRestoreInfo( 100 SGPRSaveKind::SPILL_TO_VGPR_LANE, FI)); 101 102 LLVM_DEBUG( 103 auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front(); 104 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " 105 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';); 106 } else { 107 // Remove dead <FI> index 108 MF.getFrameInfo().RemoveStackObject(FI); 109 // 3: If all else fails, spill the register to memory. 110 FI = FrameInfo.CreateSpillStackObject(Size, Alignment); 111 MFI->addToPrologEpilogSGPRSpills( 112 SGPR, 113 PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI)); 114 LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling " 115 << printReg(SGPR, TRI) << '\n'); 116 } 117 } else { 118 MFI->addToPrologEpilogSGPRSpills( 119 SGPR, PrologEpilogSGPRSaveRestoreInfo( 120 SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR)); 121 LiveRegs.addReg(ScratchSGPR); 122 LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to " 123 << printReg(ScratchSGPR, TRI) << '\n'); 124 } 125 } 126 127 // We need to specially emit stack operations here because a different frame 128 // register is used than in the rest of the function, as getFrameRegister would 129 // use. 130 static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, 131 const SIMachineFunctionInfo &FuncInfo, 132 LivePhysRegs &LiveRegs, MachineFunction &MF, 133 MachineBasicBlock &MBB, 134 MachineBasicBlock::iterator I, const DebugLoc &DL, 135 Register SpillReg, int FI, Register FrameReg, 136 int64_t DwordOff = 0) { 137 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 138 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 139 140 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 141 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 142 MachineMemOperand *MMO = MF.getMachineMemOperand( 143 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI), 144 FrameInfo.getObjectAlign(FI)); 145 LiveRegs.addReg(SpillReg); 146 bool IsKill = !MBB.isLiveIn(SpillReg); 147 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg, 148 DwordOff, MMO, nullptr, &LiveRegs); 149 if (IsKill) 150 LiveRegs.removeReg(SpillReg); 151 } 152 153 static void buildEpilogRestore(const GCNSubtarget &ST, 154 const SIRegisterInfo &TRI, 155 const SIMachineFunctionInfo &FuncInfo, 156 LivePhysRegs &LiveRegs, MachineFunction &MF, 157 MachineBasicBlock &MBB, 158 MachineBasicBlock::iterator I, 159 const DebugLoc &DL, Register SpillReg, int FI, 160 Register FrameReg, int64_t DwordOff = 0) { 161 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR 162 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 163 164 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 165 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 166 MachineMemOperand *MMO = MF.getMachineMemOperand( 167 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI), 168 FrameInfo.getObjectAlign(FI)); 169 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg, 170 DwordOff, MMO, nullptr, &LiveRegs); 171 } 172 173 static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 174 const DebugLoc &DL, const SIInstrInfo *TII, 175 Register TargetReg) { 176 MachineFunction *MF = MBB.getParent(); 177 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 178 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 179 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); 180 Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0); 181 Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1); 182 183 if (MFI->getGITPtrHigh() != 0xffffffff) { 184 BuildMI(MBB, I, DL, SMovB32, TargetHi) 185 .addImm(MFI->getGITPtrHigh()) 186 .addReg(TargetReg, RegState::ImplicitDefine); 187 } else { 188 const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); 189 BuildMI(MBB, I, DL, GetPC64, TargetReg); 190 } 191 Register GitPtrLo = MFI->getGITPtrLoReg(*MF); 192 MF->getRegInfo().addLiveIn(GitPtrLo); 193 MBB.addLiveIn(GitPtrLo); 194 BuildMI(MBB, I, DL, SMovB32, TargetLo) 195 .addReg(GitPtrLo); 196 } 197 198 static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI, 199 const SIMachineFunctionInfo *FuncInfo, 200 MachineFunction &MF, MachineBasicBlock &MBB, 201 MachineBasicBlock::iterator MBBI, bool IsProlog) { 202 if (LiveRegs.empty()) { 203 LiveRegs.init(TRI); 204 if (IsProlog) { 205 LiveRegs.addLiveIns(MBB); 206 } else { 207 // In epilog. 208 LiveRegs.addLiveOuts(MBB); 209 LiveRegs.stepBackward(*MBBI); 210 } 211 } 212 } 213 214 namespace llvm { 215 216 // SpillBuilder to save/restore special SGPR spills like the one needed for FP, 217 // BP, etc. These spills are delayed until the current function's frame is 218 // finalized. For a given register, the builder uses the 219 // PrologEpilogSGPRSaveRestoreInfo to decide the spill method. 220 class PrologEpilogSGPRSpillBuilder { 221 MachineBasicBlock::iterator MI; 222 MachineBasicBlock &MBB; 223 MachineFunction &MF; 224 const GCNSubtarget &ST; 225 MachineFrameInfo &MFI; 226 SIMachineFunctionInfo *FuncInfo; 227 const SIInstrInfo *TII; 228 const SIRegisterInfo &TRI; 229 Register SuperReg; 230 const PrologEpilogSGPRSaveRestoreInfo SI; 231 LivePhysRegs &LiveRegs; 232 const DebugLoc &DL; 233 Register FrameReg; 234 ArrayRef<int16_t> SplitParts; 235 unsigned NumSubRegs; 236 unsigned EltSize = 4; 237 238 void saveToMemory(const int FI) const { 239 MachineRegisterInfo &MRI = MF.getRegInfo(); 240 assert(!MFI.isDeadObjectIndex(FI)); 241 242 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true); 243 244 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 245 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 246 if (!TmpVGPR) 247 report_fatal_error("failed to find free scratch register"); 248 249 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { 250 Register SubReg = NumSubRegs == 1 251 ? 
SuperReg 252 : Register(TRI.getSubReg(SuperReg, SplitParts[I])); 253 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 254 .addReg(SubReg); 255 256 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR, 257 FI, FrameReg, DwordOff); 258 DwordOff += 4; 259 } 260 } 261 262 void saveToVGPRLane(const int FI) const { 263 assert(!MFI.isDeadObjectIndex(FI)); 264 265 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); 266 ArrayRef<SIRegisterInfo::SpilledReg> Spill = 267 FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); 268 assert(Spill.size() == NumSubRegs); 269 270 for (unsigned I = 0; I < NumSubRegs; ++I) { 271 Register SubReg = NumSubRegs == 1 272 ? SuperReg 273 : Register(TRI.getSubReg(SuperReg, SplitParts[I])); 274 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[I].VGPR) 275 .addReg(SubReg) 276 .addImm(Spill[I].Lane) 277 .addReg(Spill[I].VGPR, RegState::Undef); 278 } 279 } 280 281 void copyToScratchSGPR(Register DstReg) const { 282 BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg) 283 .addReg(SuperReg) 284 .setMIFlag(MachineInstr::FrameSetup); 285 } 286 287 void restoreFromMemory(const int FI) { 288 MachineRegisterInfo &MRI = MF.getRegInfo(); 289 290 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false); 291 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 292 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 293 if (!TmpVGPR) 294 report_fatal_error("failed to find free scratch register"); 295 296 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { 297 Register SubReg = NumSubRegs == 1 298 ? SuperReg 299 : Register(TRI.getSubReg(SuperReg, SplitParts[I])); 300 301 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR, 302 FI, FrameReg, DwordOff); 303 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) 304 .addReg(TmpVGPR, RegState::Kill); 305 DwordOff += 4; 306 } 307 } 308 309 void restoreFromVGPRLane(const int FI) { 310 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); 311 ArrayRef<SIRegisterInfo::SpilledReg> Spill = 312 FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); 313 assert(Spill.size() == NumSubRegs); 314 315 for (unsigned I = 0; I < NumSubRegs; ++I) { 316 Register SubReg = NumSubRegs == 1 317 ? SuperReg 318 : Register(TRI.getSubReg(SuperReg, SplitParts[I])); 319 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) 320 .addReg(Spill[I].VGPR) 321 .addImm(Spill[I].Lane); 322 } 323 } 324 325 void copyFromScratchSGPR(Register SrcReg) const { 326 BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg) 327 .addReg(SrcReg) 328 .setMIFlag(MachineInstr::FrameDestroy); 329 } 330 331 public: 332 PrologEpilogSGPRSpillBuilder(Register Reg, 333 const PrologEpilogSGPRSaveRestoreInfo SI, 334 MachineBasicBlock &MBB, 335 MachineBasicBlock::iterator MI, 336 const DebugLoc &DL, const SIInstrInfo *TII, 337 const SIRegisterInfo &TRI, 338 LivePhysRegs &LiveRegs, Register FrameReg) 339 : MI(MI), MBB(MBB), MF(*MBB.getParent()), 340 ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()), 341 FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), 342 SuperReg(Reg), SI(SI), LiveRegs(LiveRegs), DL(DL), FrameReg(FrameReg) { 343 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg); 344 SplitParts = TRI.getRegSplitParts(RC, EltSize); 345 NumSubRegs = SplitParts.empty() ? 
1 : SplitParts.size(); 346 347 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 348 } 349 350 void save() { 351 switch (SI.getKind()) { 352 case SGPRSaveKind::SPILL_TO_MEM: 353 return saveToMemory(SI.getIndex()); 354 case SGPRSaveKind::SPILL_TO_VGPR_LANE: 355 return saveToVGPRLane(SI.getIndex()); 356 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: 357 return copyToScratchSGPR(SI.getReg()); 358 } 359 } 360 361 void restore() { 362 switch (SI.getKind()) { 363 case SGPRSaveKind::SPILL_TO_MEM: 364 return restoreFromMemory(SI.getIndex()); 365 case SGPRSaveKind::SPILL_TO_VGPR_LANE: 366 return restoreFromVGPRLane(SI.getIndex()); 367 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: 368 return copyFromScratchSGPR(SI.getReg()); 369 } 370 } 371 }; 372 373 } // namespace llvm 374 375 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` 376 void SIFrameLowering::emitEntryFunctionFlatScratchInit( 377 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 378 const DebugLoc &DL, Register ScratchWaveOffsetReg) const { 379 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 380 const SIInstrInfo *TII = ST.getInstrInfo(); 381 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 382 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 383 384 // We don't need this if we only have spills since there is no user facing 385 // scratch. 386 387 // TODO: If we know we don't have flat instructions earlier, we can omit 388 // this from the input registers. 389 // 390 // TODO: We only need to know if we access scratch space through a flat 391 // pointer. Because we only detect if flat instructions are used at all, 392 // this will be used more often than necessary on VI. 393 394 Register FlatScrInitLo; 395 Register FlatScrInitHi; 396 397 if (ST.isAmdPalOS()) { 398 // Extract the scratch offset from the descriptor in the GIT 399 LivePhysRegs LiveRegs; 400 LiveRegs.init(*TRI); 401 LiveRegs.addLiveIns(MBB); 402 403 // Find unused reg to load flat scratch init into 404 MachineRegisterInfo &MRI = MF.getRegInfo(); 405 Register FlatScrInit = AMDGPU::NoRegister; 406 ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF); 407 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2; 408 AllSGPR64s = AllSGPR64s.slice( 409 std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded)); 410 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 411 for (MCPhysReg Reg : AllSGPR64s) { 412 if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) && 413 !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { 414 FlatScrInit = Reg; 415 break; 416 } 417 } 418 assert(FlatScrInit && "Failed to find free register for scratch init"); 419 420 FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0); 421 FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1); 422 423 buildGitPtr(MBB, I, DL, TII, FlatScrInit); 424 425 // We now have the GIT ptr - now get the scratch descriptor from the entry 426 // at offset 0 (or offset 16 for a compute shader). 427 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 428 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); 429 auto *MMO = MF.getMachineMemOperand( 430 PtrInfo, 431 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | 432 MachineMemOperand::MODereferenceable, 433 8, Align(4)); 434 unsigned Offset = 435 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 
16 : 0; 436 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); 437 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); 438 BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit) 439 .addReg(FlatScrInit) 440 .addImm(EncodedOffset) // offset 441 .addImm(0) // cpol 442 .addMemOperand(MMO); 443 444 // Mask the offset in [47:0] of the descriptor 445 const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32); 446 auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi) 447 .addReg(FlatScrInitHi) 448 .addImm(0xffff); 449 And->getOperand(3).setIsDead(); // Mark SCC as dead. 450 } else { 451 Register FlatScratchInitReg = 452 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); 453 assert(FlatScratchInitReg); 454 455 MachineRegisterInfo &MRI = MF.getRegInfo(); 456 MRI.addLiveIn(FlatScratchInitReg); 457 MBB.addLiveIn(FlatScratchInitReg); 458 459 FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); 460 FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); 461 } 462 463 // Do a 64-bit pointer add. 464 if (ST.flatScratchIsPointer()) { 465 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 466 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) 467 .addReg(FlatScrInitLo) 468 .addReg(ScratchWaveOffsetReg); 469 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), 470 FlatScrInitHi) 471 .addReg(FlatScrInitHi) 472 .addImm(0); 473 Addc->getOperand(3).setIsDead(); // Mark SCC as dead. 474 475 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). 476 addReg(FlatScrInitLo). 477 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO | 478 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); 479 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). 480 addReg(FlatScrInitHi). 481 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | 482 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); 483 return; 484 } 485 486 // For GFX9. 487 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) 488 .addReg(FlatScrInitLo) 489 .addReg(ScratchWaveOffsetReg); 490 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), 491 AMDGPU::FLAT_SCR_HI) 492 .addReg(FlatScrInitHi) 493 .addImm(0); 494 Addc->getOperand(3).setIsDead(); // Mark SCC as dead. 495 496 return; 497 } 498 499 assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); 500 501 // Copy the size in bytes. 502 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) 503 .addReg(FlatScrInitHi, RegState::Kill); 504 505 // Add wave offset in bytes to private base offset. 506 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. 507 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo) 508 .addReg(FlatScrInitLo) 509 .addReg(ScratchWaveOffsetReg); 510 511 // Convert offset to 256-byte units. 512 auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), 513 AMDGPU::FLAT_SCR_HI) 514 .addReg(FlatScrInitLo, RegState::Kill) 515 .addImm(8); 516 LShr->getOperand(3).setIsDead(); // Mark SCC as dead. 517 } 518 519 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not 520 // memory. They should have been removed by now. 521 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { 522 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 523 I != E; ++I) { 524 if (!MFI.isDeadObjectIndex(I)) 525 return false; 526 } 527 528 return true; 529 } 530 531 // Shift down registers reserved for the scratch RSRC. 
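// It returns an invalid register when the reserved SRSRC is not actually
// needed, and otherwise tries to shift the reservation down to the first
// unused, allocatable SGPR quad after the preloaded user SGPRs (skipping
// anything that aliases the PAL GIT pointer register).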
532 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( 533 MachineFunction &MF) const { 534 535 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 536 const SIInstrInfo *TII = ST.getInstrInfo(); 537 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 538 MachineRegisterInfo &MRI = MF.getRegInfo(); 539 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 540 541 assert(MFI->isEntryFunction()); 542 543 Register ScratchRsrcReg = MFI->getScratchRSrcReg(); 544 545 if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) && 546 allStackObjectsAreDead(MF.getFrameInfo()))) 547 return Register(); 548 549 if (ST.hasSGPRInitBug() || 550 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF)) 551 return ScratchRsrcReg; 552 553 // We reserved the last registers for this. Shift it down to the end of those 554 // which were actually used. 555 // 556 // FIXME: It might be safer to use a pseudoregister before replacement. 557 558 // FIXME: We should be able to eliminate unused input registers. We only 559 // cannot do this for the resources required for scratch access. For now we 560 // skip over user SGPRs and may leave unused holes. 561 562 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; 563 ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF); 564 AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded)); 565 566 // Skip the last N reserved elements because they should have already been 567 // reserved for VCC etc. 568 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 569 for (MCPhysReg Reg : AllSGPR128s) { 570 // Pick the first unallocated one. Make sure we don't clobber the other 571 // reserved input we needed. Also for PAL, make sure we don't clobber 572 // the GIT pointer passed in SGPR0 or SGPR8. 573 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && 574 (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) { 575 MRI.replaceRegWith(ScratchRsrcReg, Reg); 576 MFI->setScratchRSrcReg(Reg); 577 return Reg; 578 } 579 } 580 581 return ScratchRsrcReg; 582 } 583 584 static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { 585 return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize(); 586 } 587 588 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, 589 MachineBasicBlock &MBB) const { 590 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); 591 592 // FIXME: If we only have SGPR spills, we won't actually be using scratch 593 // memory since these spill to VGPRs. We should be cleaning up these unused 594 // SGPR spill frame indices somewhere. 595 596 // FIXME: We still have implicit uses on SGPR spill instructions in case they 597 // need to spill to vector memory. It's likely that will not happen, but at 598 // this point it appears we need the setup. This part of the prolog should be 599 // emitted after frame indices are eliminated. 
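  // Roughly, the steps below are: pick (and possibly shift down) the scratch
  // RSRC register, make it live everywhere, locate the preloaded scratch wave
  // offset, materialize SP/FP when required, initialize flat scratch, and
  // finally build the scratch RSRC descriptor itself.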
600 601 // FIXME: Remove all of the isPhysRegUsed checks 602 603 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 604 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 605 const SIInstrInfo *TII = ST.getInstrInfo(); 606 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 607 MachineRegisterInfo &MRI = MF.getRegInfo(); 608 const Function &F = MF.getFunction(); 609 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 610 611 assert(MFI->isEntryFunction()); 612 613 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( 614 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); 615 616 // We need to do the replacement of the private segment buffer register even 617 // if there are no stack objects. There could be stores to undef or a 618 // constant without an associated object. 619 // 620 // This will return `Register()` in cases where there are no actual 621 // uses of the SRSRC. 622 Register ScratchRsrcReg; 623 if (!ST.enableFlatScratch()) 624 ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); 625 626 // Make the selected register live throughout the function. 627 if (ScratchRsrcReg) { 628 for (MachineBasicBlock &OtherBB : MF) { 629 if (&OtherBB != &MBB) { 630 OtherBB.addLiveIn(ScratchRsrcReg); 631 } 632 } 633 } 634 635 // Now that we have fixed the reserved SRSRC we need to locate the 636 // (potentially) preloaded SRSRC. 637 Register PreloadedScratchRsrcReg; 638 if (ST.isAmdHsaOrMesa(F)) { 639 PreloadedScratchRsrcReg = 640 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); 641 if (ScratchRsrcReg && PreloadedScratchRsrcReg) { 642 // We added live-ins during argument lowering, but since they were not 643 // used they were deleted. We're adding the uses now, so add them back. 644 MRI.addLiveIn(PreloadedScratchRsrcReg); 645 MBB.addLiveIn(PreloadedScratchRsrcReg); 646 } 647 } 648 649 // Debug location must be unknown since the first debug location is used to 650 // determine the end of the prologue. 651 DebugLoc DL; 652 MachineBasicBlock::iterator I = MBB.begin(); 653 654 // We found the SRSRC first because it needs four registers and has an 655 // alignment requirement. If the SRSRC that we found is clobbering with 656 // the scratch wave offset, which may be in a fixed SGPR or a free SGPR 657 // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch 658 // wave offset to a free SGPR. 
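  // The search below skips the preloaded user SGPRs and anything aliasing the
  // chosen SRSRC or, on PAL, the GIT pointer register.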
659 Register ScratchWaveOffsetReg; 660 if (PreloadedScratchWaveOffsetReg && 661 TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) { 662 ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF); 663 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); 664 AllSGPRs = AllSGPRs.slice( 665 std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded)); 666 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 667 for (MCPhysReg Reg : AllSGPRs) { 668 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && 669 !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) { 670 ScratchWaveOffsetReg = Reg; 671 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) 672 .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); 673 break; 674 } 675 } 676 } else { 677 ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg; 678 } 679 assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg); 680 681 if (requiresStackPointerReference(MF)) { 682 Register SPReg = MFI->getStackPtrOffsetReg(); 683 assert(SPReg != AMDGPU::SP_REG); 684 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) 685 .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST)); 686 } 687 688 if (hasFP(MF)) { 689 Register FPReg = MFI->getFrameOffsetReg(); 690 assert(FPReg != AMDGPU::FP_REG); 691 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); 692 } 693 694 bool NeedsFlatScratchInit = 695 MFI->hasFlatScratchInit() && 696 (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || 697 (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); 698 699 if ((NeedsFlatScratchInit || ScratchRsrcReg) && 700 PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) { 701 MRI.addLiveIn(PreloadedScratchWaveOffsetReg); 702 MBB.addLiveIn(PreloadedScratchWaveOffsetReg); 703 } 704 705 if (NeedsFlatScratchInit) { 706 emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); 707 } 708 709 if (ScratchRsrcReg) { 710 emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, 711 PreloadedScratchRsrcReg, 712 ScratchRsrcReg, ScratchWaveOffsetReg); 713 } 714 } 715 716 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` 717 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( 718 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 719 const DebugLoc &DL, Register PreloadedScratchRsrcReg, 720 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { 721 722 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 723 const SIInstrInfo *TII = ST.getInstrInfo(); 724 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 725 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 726 const Function &Fn = MF.getFunction(); 727 728 if (ST.isAmdPalOS()) { 729 // The pointer to the GIT is formed from the offset passed in and either 730 // the amdgpu-git-ptr-high function attribute or the top part of the PC 731 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); 732 Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); 733 734 buildGitPtr(MBB, I, DL, TII, Rsrc01); 735 736 // We now have the GIT ptr - now get the scratch descriptor from the entry 737 // at offset 0 (or offset 16 for a compute shader). 
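    // The descriptor is a full 128-bit SRD, loaded with a single
    // S_LOAD_DWORDX4; the byte offset is converted to the target's SMRD
    // immediate encoding via convertSMRDOffsetUnits first.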
738 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 739 const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM); 740 auto MMO = MF.getMachineMemOperand(PtrInfo, 741 MachineMemOperand::MOLoad | 742 MachineMemOperand::MOInvariant | 743 MachineMemOperand::MODereferenceable, 744 16, Align(4)); 745 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; 746 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); 747 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); 748 BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) 749 .addReg(Rsrc01) 750 .addImm(EncodedOffset) // offset 751 .addImm(0) // cpol 752 .addReg(ScratchRsrcReg, RegState::ImplicitDefine) 753 .addMemOperand(MMO); 754 755 // The driver will always set the SRD for wave 64 (bits 118:117 of 756 // descriptor / bits 22:21 of third sub-reg will be 0b11) 757 // If the shader is actually wave32 we have to modify the const_index_stride 758 // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The 759 // reason the driver does this is that there can be cases where it presents 760 // 2 shaders with different wave size (e.g. VsFs). 761 // TODO: convert to using SCRATCH instructions or multiple SRD buffers 762 if (ST.isWave32()) { 763 const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32); 764 BuildMI(MBB, I, DL, SBitsetB32, Rsrc03) 765 .addImm(21) 766 .addReg(Rsrc03); 767 } 768 } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) { 769 assert(!ST.isAmdHsaOrMesa(Fn)); 770 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); 771 772 Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); 773 Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); 774 775 // Use relocations to get the pointer, and setup the other bits manually. 
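    // If an implicit buffer pointer is available, the base address is taken
    // from it (copied directly for compute callers, loaded via S_LOAD_DWORDX2
    // otherwise); if not, the SCRATCH_RSRC_DWORD0/1 external symbols supply
    // the base. Rsrc23 fills in the remaining two words of the descriptor.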
776 uint64_t Rsrc23 = TII->getScratchRsrcWords23(); 777 778 if (MFI->hasImplicitBufferPtr()) { 779 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); 780 781 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { 782 const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); 783 784 BuildMI(MBB, I, DL, Mov64, Rsrc01) 785 .addReg(MFI->getImplicitBufferPtrUserSGPR()) 786 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 787 } else { 788 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); 789 790 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 791 auto MMO = MF.getMachineMemOperand( 792 PtrInfo, 793 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | 794 MachineMemOperand::MODereferenceable, 795 8, Align(4)); 796 BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) 797 .addReg(MFI->getImplicitBufferPtrUserSGPR()) 798 .addImm(0) // offset 799 .addImm(0) // cpol 800 .addMemOperand(MMO) 801 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 802 803 MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); 804 MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); 805 } 806 } else { 807 Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); 808 Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); 809 810 BuildMI(MBB, I, DL, SMovB32, Rsrc0) 811 .addExternalSymbol("SCRATCH_RSRC_DWORD0") 812 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 813 814 BuildMI(MBB, I, DL, SMovB32, Rsrc1) 815 .addExternalSymbol("SCRATCH_RSRC_DWORD1") 816 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 817 818 } 819 820 BuildMI(MBB, I, DL, SMovB32, Rsrc2) 821 .addImm(Rsrc23 & 0xffffffff) 822 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 823 824 BuildMI(MBB, I, DL, SMovB32, Rsrc3) 825 .addImm(Rsrc23 >> 32) 826 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 827 } else if (ST.isAmdHsaOrMesa(Fn)) { 828 assert(PreloadedScratchRsrcReg); 829 830 if (ScratchRsrcReg != PreloadedScratchRsrcReg) { 831 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) 832 .addReg(PreloadedScratchRsrcReg, RegState::Kill); 833 } 834 } 835 836 // Add the scratch wave offset into the scratch RSRC. 837 // 838 // We only want to update the first 48 bits, which is the base address 839 // pointer, without touching the adjacent 16 bits of flags. We know this add 840 // cannot carry-out from bit 47, otherwise the scratch allocation would be 841 // impossible to fit in the 48-bit global address space. 842 // 843 // TODO: Evaluate if it is better to just construct an SRD using the flat 844 // scratch init and some constants rather than update the one we are passed. 845 Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); 846 Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); 847 848 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in 849 // the kernel body via inreg arguments. 850 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0) 851 .addReg(ScratchRsrcSub0) 852 .addReg(ScratchWaveOffsetReg) 853 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 854 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1) 855 .addReg(ScratchRsrcSub1) 856 .addImm(0) 857 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 858 Addc->getOperand(3).setIsDead(); // Mark SCC as dead. 
859 } 860 861 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { 862 switch (ID) { 863 case TargetStackID::Default: 864 case TargetStackID::NoAlloc: 865 case TargetStackID::SGPRSpill: 866 return true; 867 case TargetStackID::ScalableVector: 868 case TargetStackID::WasmLocal: 869 return false; 870 } 871 llvm_unreachable("Invalid TargetStackID::Value"); 872 } 873 874 // Activate only the inactive lanes when \p EnableInactiveLanes is true. 875 // Otherwise, activate all lanes. It returns the saved exec. 876 static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, 877 MachineFunction &MF, 878 MachineBasicBlock &MBB, 879 MachineBasicBlock::iterator MBBI, 880 const DebugLoc &DL, bool IsProlog, 881 bool EnableInactiveLanes) { 882 Register ScratchExecCopy; 883 MachineRegisterInfo &MRI = MF.getRegInfo(); 884 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 885 const SIInstrInfo *TII = ST.getInstrInfo(); 886 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 887 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 888 889 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); 890 891 ScratchExecCopy = findScratchNonCalleeSaveRegister( 892 MRI, LiveRegs, *TRI.getWaveMaskRegClass()); 893 if (!ScratchExecCopy) 894 report_fatal_error("failed to find free scratch register"); 895 896 LiveRegs.addReg(ScratchExecCopy); 897 898 const unsigned SaveExecOpc = 899 ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32 900 : AMDGPU::S_OR_SAVEEXEC_B32) 901 : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64 902 : AMDGPU::S_OR_SAVEEXEC_B64); 903 auto SaveExec = 904 BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1); 905 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. 906 907 return ScratchExecCopy; 908 } 909 910 void SIFrameLowering::emitCSRSpillStores( 911 MachineFunction &MF, MachineBasicBlock &MBB, 912 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, 913 Register FrameReg, Register FramePtrRegScratchCopy) const { 914 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 915 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 916 const SIInstrInfo *TII = ST.getInstrInfo(); 917 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 918 919 // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch 920 // registers. However, save all lanes of callee-saved VGPRs. Due to this, we 921 // might end up flipping the EXEC bits twice. 922 Register ScratchExecCopy; 923 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; 924 FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); 925 if (!WWMScratchRegs.empty()) 926 ScratchExecCopy = 927 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, 928 /*IsProlog*/ true, /*EnableInactiveLanes*/ true); 929 930 auto StoreWWMRegisters = 931 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { 932 for (const auto &Reg : WWMRegs) { 933 Register VGPR = Reg.first; 934 int FI = Reg.second; 935 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, 936 VGPR, FI, FrameReg); 937 } 938 }; 939 940 StoreWWMRegisters(WWMScratchRegs); 941 if (!WWMCalleeSavedRegs.empty()) { 942 if (ScratchExecCopy) { 943 unsigned MovOpc = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 944 BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); 945 } else { 946 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, 947 /*IsProlog*/ true, 948 /*EnableInactiveLanes*/ false); 949 } 950 } 951 952 StoreWWMRegisters(WWMCalleeSavedRegs); 953 if (ScratchExecCopy) { 954 // FIXME: Split block and make terminator. 955 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 956 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) 957 .addReg(ScratchExecCopy, RegState::Kill); 958 LiveRegs.addReg(ScratchExecCopy); 959 } 960 961 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 962 963 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { 964 // Special handle FP spill: 965 // Skip if FP is saved to a scratch SGPR, the save has already been emitted. 966 // Otherwise, FP has been moved to a temporary register and spill it 967 // instead. 968 Register Reg = 969 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first; 970 if (!Reg) 971 continue; 972 973 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, 974 LiveRegs, FrameReg); 975 SB.save(); 976 } 977 978 // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make 979 // such scratch registers live throughout the function. 980 SmallVector<Register, 1> ScratchSGPRs; 981 FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs); 982 if (!ScratchSGPRs.empty()) { 983 for (MachineBasicBlock &MBB : MF) { 984 for (MCPhysReg Reg : ScratchSGPRs) 985 MBB.addLiveIn(Reg); 986 987 MBB.sortUniqueLiveIns(); 988 } 989 if (!LiveRegs.empty()) { 990 for (MCPhysReg Reg : ScratchSGPRs) 991 LiveRegs.addReg(Reg); 992 } 993 } 994 } 995 996 void SIFrameLowering::emitCSRSpillRestores( 997 MachineFunction &MF, MachineBasicBlock &MBB, 998 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, 999 Register FrameReg, Register FramePtrRegScratchCopy) const { 1000 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1001 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1002 const SIInstrInfo *TII = ST.getInstrInfo(); 1003 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 1004 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1005 1006 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { 1007 // Special handle FP restore: 1008 // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore 1009 // the FP value to a temporary register. The frame pointer should be 1010 // overwritten only at the end when all other spills are restored from 1011 // current frame. 1012 Register Reg = 1013 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first; 1014 if (!Reg) 1015 continue; 1016 1017 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, 1018 LiveRegs, FrameReg); 1019 SB.restore(); 1020 } 1021 1022 // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the 1023 // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to 1024 // this, we might end up flipping the EXEC bits twice. 
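  // This mirrors emitCSRSpillStores above: EXEC is first set to only the
  // inactive lanes for the WWM scratch registers, then to all lanes for the
  // callee-saved WWM VGPRs, and finally restored from the saved copy.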
1025 Register ScratchExecCopy; 1026 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; 1027 FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); 1028 if (!WWMScratchRegs.empty()) 1029 ScratchExecCopy = 1030 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, 1031 /*IsProlog*/ false, /*EnableInactiveLanes*/ true); 1032 1033 auto RestoreWWMRegisters = 1034 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { 1035 for (const auto &Reg : WWMRegs) { 1036 Register VGPR = Reg.first; 1037 int FI = Reg.second; 1038 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, 1039 VGPR, FI, FrameReg); 1040 } 1041 }; 1042 1043 RestoreWWMRegisters(WWMScratchRegs); 1044 if (!WWMCalleeSavedRegs.empty()) { 1045 if (ScratchExecCopy) { 1046 unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 1047 BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); 1048 } else { 1049 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, 1050 /*IsProlog*/ false, 1051 /*EnableInactiveLanes*/ false); 1052 } 1053 } 1054 1055 RestoreWWMRegisters(WWMCalleeSavedRegs); 1056 if (ScratchExecCopy) { 1057 // FIXME: Split block and make terminator. 1058 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 1059 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) 1060 .addReg(ScratchExecCopy, RegState::Kill); 1061 } 1062 } 1063 1064 void SIFrameLowering::emitPrologue(MachineFunction &MF, 1065 MachineBasicBlock &MBB) const { 1066 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1067 if (FuncInfo->isEntryFunction()) { 1068 emitEntryFunctionPrologue(MF, MBB); 1069 return; 1070 } 1071 1072 MachineFrameInfo &MFI = MF.getFrameInfo(); 1073 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1074 const SIInstrInfo *TII = ST.getInstrInfo(); 1075 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 1076 MachineRegisterInfo &MRI = MF.getRegInfo(); 1077 1078 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 1079 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1080 Register BasePtrReg = 1081 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); 1082 LivePhysRegs LiveRegs; 1083 1084 MachineBasicBlock::iterator MBBI = MBB.begin(); 1085 // DebugLoc must be unknown since the first instruction with DebugLoc is used 1086 // to determine the end of the prologue. 1087 DebugLoc DL; 1088 1089 bool HasFP = false; 1090 bool HasBP = false; 1091 uint32_t NumBytes = MFI.getStackSize(); 1092 uint32_t RoundedSize = NumBytes; 1093 1094 if (TRI.hasStackRealignment(MF)) 1095 HasFP = true; 1096 1097 Register FramePtrRegScratchCopy; 1098 if (!HasFP && !hasFP(MF)) { 1099 // Emit the CSR spill stores with SP base register. 1100 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg, 1101 FramePtrRegScratchCopy); 1102 } else { 1103 // CSR spill stores will use FP as base register. 1104 Register SGPRForFPSaveRestoreCopy = 1105 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); 1106 1107 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); 1108 if (SGPRForFPSaveRestoreCopy) { 1109 // Copy FP to the scratch register now and emit the CFI entry. It avoids 1110 // the extra FP copy needed in the other two cases when FP is spilled to 1111 // memory or to a VGPR lane. 
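      // (This is the COPY_TO_SCRATCH_SGPR case recorded by
      // getVGPRSpillLaneOrTempRegister when a free scratch SGPR was found.)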
1112 PrologEpilogSGPRSpillBuilder SB( 1113 FramePtrReg, 1114 FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI, 1115 DL, TII, TRI, LiveRegs, FramePtrReg); 1116 SB.save(); 1117 LiveRegs.addReg(SGPRForFPSaveRestoreCopy); 1118 } else { 1119 // Copy FP into a new scratch register so that its previous value can be 1120 // spilled after setting up the new frame. 1121 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( 1122 MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass); 1123 if (!FramePtrRegScratchCopy) 1124 report_fatal_error("failed to find free scratch register"); 1125 1126 LiveRegs.addReg(FramePtrRegScratchCopy); 1127 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy) 1128 .addReg(FramePtrReg); 1129 } 1130 } 1131 1132 if (HasFP) { 1133 const unsigned Alignment = MFI.getMaxAlign().value(); 1134 1135 RoundedSize += Alignment; 1136 if (LiveRegs.empty()) { 1137 LiveRegs.init(TRI); 1138 LiveRegs.addLiveIns(MBB); 1139 } 1140 1141 // s_add_i32 s33, s32, NumBytes 1142 // s_and_b32 s33, s33, 0b111...0000 1143 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg) 1144 .addReg(StackPtrReg) 1145 .addImm((Alignment - 1) * getScratchScaleFactor(ST)) 1146 .setMIFlag(MachineInstr::FrameSetup); 1147 auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) 1148 .addReg(FramePtrReg, RegState::Kill) 1149 .addImm(-Alignment * getScratchScaleFactor(ST)) 1150 .setMIFlag(MachineInstr::FrameSetup); 1151 And->getOperand(3).setIsDead(); // Mark SCC as dead. 1152 FuncInfo->setIsStackRealigned(true); 1153 } else if ((HasFP = hasFP(MF))) { 1154 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 1155 .addReg(StackPtrReg) 1156 .setMIFlag(MachineInstr::FrameSetup); 1157 } 1158 1159 // If FP is used, emit the CSR spills with FP base register. 1160 if (HasFP) { 1161 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg, 1162 FramePtrRegScratchCopy); 1163 if (FramePtrRegScratchCopy) 1164 LiveRegs.removeReg(FramePtrRegScratchCopy); 1165 } 1166 1167 // If we need a base pointer, set it up here. It's whatever the value of 1168 // the stack pointer is at this point. Any variable size objects will be 1169 // allocated after this, so we can still use the base pointer to reference 1170 // the incoming arguments. 1171 if ((HasBP = TRI.hasBasePointer(MF))) { 1172 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) 1173 .addReg(StackPtrReg) 1174 .setMIFlag(MachineInstr::FrameSetup); 1175 } 1176 1177 if (HasFP && RoundedSize != 0) { 1178 auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) 1179 .addReg(StackPtrReg) 1180 .addImm(RoundedSize * getScratchScaleFactor(ST)) 1181 .setMIFlag(MachineInstr::FrameSetup); 1182 Add->getOperand(3).setIsDead(); // Mark SCC as dead. 1183 } 1184 1185 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg); 1186 (void)FPSaved; 1187 assert((!HasFP || FPSaved) && 1188 "Needed to save FP but didn't save it anywhere"); 1189 1190 // If we allow spilling to AGPRs we may have saved FP but then spill 1191 // everything into AGPRs instead of the stack. 
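  // (EnableSpillVGPRToAGPR is the cl::opt defined at the top of this file and
  // is re-checked in the assertion below for exactly that reason.)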
1192 assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) && 1193 "Saved FP but didn't need it"); 1194 1195 bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg); 1196 (void)BPSaved; 1197 assert((!HasBP || BPSaved) && 1198 "Needed to save BP but didn't save it anywhere"); 1199 1200 assert((HasBP || !BPSaved) && "Saved BP but didn't need it"); 1201 } 1202 1203 void SIFrameLowering::emitEpilogue(MachineFunction &MF, 1204 MachineBasicBlock &MBB) const { 1205 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1206 if (FuncInfo->isEntryFunction()) 1207 return; 1208 1209 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1210 const SIInstrInfo *TII = ST.getInstrInfo(); 1211 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 1212 MachineRegisterInfo &MRI = MF.getRegInfo(); 1213 LivePhysRegs LiveRegs; 1214 // Get the insert location for the epilogue. If there were no terminators in 1215 // the block, get the last instruction. 1216 MachineBasicBlock::iterator MBBI = MBB.end(); 1217 DebugLoc DL; 1218 if (!MBB.empty()) { 1219 MBBI = MBB.getLastNonDebugInstr(); 1220 if (MBBI != MBB.end()) 1221 DL = MBBI->getDebugLoc(); 1222 1223 MBBI = MBB.getFirstTerminator(); 1224 } 1225 1226 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1227 uint32_t NumBytes = MFI.getStackSize(); 1228 uint32_t RoundedSize = FuncInfo->isStackRealigned() 1229 ? NumBytes + MFI.getMaxAlign().value() 1230 : NumBytes; 1231 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 1232 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1233 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg); 1234 1235 Register FramePtrRegScratchCopy; 1236 Register SGPRForFPSaveRestoreCopy = 1237 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); 1238 if (FPSaved) { 1239 // CSR spill restores should use FP as base register. If 1240 // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP 1241 // into a new scratch register and copy to FP later when other registers are 1242 // restored from the current stack frame. 1243 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); 1244 if (SGPRForFPSaveRestoreCopy) { 1245 LiveRegs.addReg(SGPRForFPSaveRestoreCopy); 1246 } else { 1247 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( 1248 MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass); 1249 if (!FramePtrRegScratchCopy) 1250 report_fatal_error("failed to find free scratch register"); 1251 1252 LiveRegs.addReg(FramePtrRegScratchCopy); 1253 } 1254 1255 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg, 1256 FramePtrRegScratchCopy); 1257 } 1258 1259 if (RoundedSize != 0 && hasFP(MF)) { 1260 auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) 1261 .addReg(StackPtrReg) 1262 .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST))) 1263 .setMIFlag(MachineInstr::FrameDestroy); 1264 Add->getOperand(3).setIsDead(); // Mark SCC as dead. 1265 } 1266 1267 if (FPSaved) { 1268 // Insert the copy to restore FP. 1269 Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy 1270 : FramePtrRegScratchCopy; 1271 MachineInstrBuilder MIB = 1272 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 1273 .addReg(SrcReg); 1274 if (SGPRForFPSaveRestoreCopy) 1275 MIB.setMIFlag(MachineInstr::FrameDestroy); 1276 } else { 1277 // Insert the CSR spill restores with SP as the base register. 
1278 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg, 1279 FramePtrRegScratchCopy); 1280 } 1281 } 1282 1283 #ifndef NDEBUG 1284 static bool allSGPRSpillsAreDead(const MachineFunction &MF) { 1285 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1286 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1287 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 1288 I != E; ++I) { 1289 if (!MFI.isDeadObjectIndex(I) && 1290 MFI.getStackID(I) == TargetStackID::SGPRSpill && 1291 !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) { 1292 return false; 1293 } 1294 } 1295 1296 return true; 1297 } 1298 #endif 1299 1300 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, 1301 int FI, 1302 Register &FrameReg) const { 1303 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); 1304 1305 FrameReg = RI->getFrameRegister(MF); 1306 return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI)); 1307 } 1308 1309 void SIFrameLowering::processFunctionBeforeFrameFinalized( 1310 MachineFunction &MF, 1311 RegScavenger *RS) const { 1312 MachineFrameInfo &MFI = MF.getFrameInfo(); 1313 1314 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1315 const SIInstrInfo *TII = ST.getInstrInfo(); 1316 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1317 MachineRegisterInfo &MRI = MF.getRegInfo(); 1318 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1319 1320 // Allocate spill slots for WWM reserved VGPRs. 1321 if (!FuncInfo->isEntryFunction()) { 1322 for (Register Reg : FuncInfo->getWWMReservedRegs()) { 1323 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); 1324 FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC), 1325 TRI->getSpillAlign(*RC)); 1326 } 1327 } 1328 1329 const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() 1330 && EnableSpillVGPRToAGPR; 1331 1332 if (SpillVGPRToAGPR) { 1333 // To track the spill frame indices handled in this pass. 1334 BitVector SpillFIs(MFI.getObjectIndexEnd(), false); 1335 BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false); 1336 1337 bool SeenDbgInstr = false; 1338 1339 for (MachineBasicBlock &MBB : MF) { 1340 for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { 1341 int FrameIndex; 1342 if (MI.isDebugInstr()) 1343 SeenDbgInstr = true; 1344 1345 if (TII->isVGPRSpill(MI)) { 1346 // Try to eliminate stack used by VGPR spills before frame 1347 // finalization. 1348 unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 1349 AMDGPU::OpName::vaddr); 1350 int FI = MI.getOperand(FIOp).getIndex(); 1351 Register VReg = 1352 TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); 1353 if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, 1354 TRI->isAGPR(MRI, VReg))) { 1355 assert(RS != nullptr); 1356 // FIXME: change to enterBasicBlockEnd() 1357 RS->enterBasicBlock(MBB); 1358 TRI->eliminateFrameIndex(MI, 0, FIOp, RS); 1359 SpillFIs.set(FI); 1360 continue; 1361 } 1362 } else if (TII->isStoreToStackSlot(MI, FrameIndex) || 1363 TII->isLoadFromStackSlot(MI, FrameIndex)) 1364 if (!MFI.isFixedObjectIndex(FrameIndex)) 1365 NonVGPRSpillFIs.set(FrameIndex); 1366 } 1367 } 1368 1369 // Stack slot coloring may assign different objects to the same stack slot. 1370 // If not, then the VGPR to AGPR spill slot is dead. 
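    // In other words, a spill frame index stays live only if some non-VGPR
    // spill access (recorded in NonVGPRSpillFIs above) still refers to it.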
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead update them with the
        // correct register value, but it is not clear that the register value
        // alone is sufficient.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
              !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
              SpillFIs[MI.getOperand(0).getIndex()]) {
            MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
    }
  }
}

void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we had initially reserved the highest available VGPR for the
    // AGPR copy. Now that RA is done, check if there exists an unused VGPR
    // that is lower than the one reserved earlier. If one exists, use it for
    // the AGPR copy instead of the one reserved before RA.
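    // (Shifting to a lower, already-unused VGPR can reduce the highest VGPR
    // index the function touches, which may help occupancy.)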
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Reserve this newly identified VGPR for the AGPR copy. Reserved
      // registers should already be frozen at this point, so we can avoid
      // calling MRI.freezeReservedRegs and just use MRI.reserveReg.
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.reserveReg(UnusedLowVGPR, TRI);
    }
  }

  // We initially reserved the highest available SGPR pair for long branches;
  // now, after RA, we shift down to a lower unused pair if one exists.
  Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
  Register UnusedLowSGPR =
      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
  // If LongBranchReservedReg is null then we didn't find a long branch and
  // never reserved a register to begin with, so there is nothing to shift
  // down. If UnusedLowSGPR is null, there isn't an available lower register
  // to use, so just keep the original one we set.
  if (LongBranchReservedReg && UnusedLowSGPR) {
    FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
    MRI.reserveReg(UnusedLowSGPR, TRI);
  }
}

// The special SGPR spills, like the ones needed for FP, BP, or any other
// reserved registers, are delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs,
    bool NeedExecCopyReservedReg) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  LivePhysRegs LiveRegs;
  LiveRegs.init(*TRI);
  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveRegs.addReg(CSRegs[I]);

  const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();

  if (NeedExecCopyReservedReg) {
    Register ReservedReg = MFI->getSGPRForEXECCopy();
    assert(ReservedReg && "Should have reserved an SGPR for EXEC copy.");
    Register UnusedScratchReg = findUnusedRegister(MRI, LiveRegs, RC);
    if (UnusedScratchReg) {
      // If an unused scratch SGPR was found, use that register itself for the
      // EXEC copy; no spill is needed in that case.
      MFI->setSGPRForEXECCopy(UnusedScratchReg);
      LiveRegs.addReg(UnusedScratchReg);
    } else {
      // Needs spill.
      assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) &&
             "Re-reserving spill slot for EXEC copy register");
      getVGPRSpillLaneOrTempRegister(MF, LiveRegs, ReservedReg, RC,
                                     /*IncludeScratchCopy=*/false);
    }
  }

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
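  // (determineCalleeSavesSGPR below makes a similar WillHaveFP prediction when
  // deciding whether to keep FP out of the generic callee-saved set.)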
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, FramePtrReg);
  }

  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, BasePtrReg);
  }
}

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool NeedExecCopyReservedReg = false;

  MachineInstr *ReturnMI = nullptr;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // WRITELANE instructions used for SGPR spills can overwrite the inactive
      // lanes of VGPRs, and the callee must spill and restore them even if
      // they are marked caller-saved.

      // TODO: Handle this elsewhere at an early point. Walking through all
      // MBBs here would be a bad heuristic. A better way would be to call
      // allocateWWMSpill during the regalloc pipeline whenever a physical
      // register is allocated for the intended virtual registers. That would
      // also help exclude the general uses of WRITELANE/READLANE intrinsics
      // that don't really need any such special handling.
      if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32)
        MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
      else if (MI.getOpcode() == AMDGPU::V_READLANE_B32)
        MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
      else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
        NeedExecCopyReservedReg = true;
      else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
               MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
        // We expect all returns to be the same size.
        assert(!ReturnMI ||
               (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
                count_if(ReturnMI->operands(),
                         [](auto Op) { return Op.isReg(); })));
        ReturnMI = &MI;
      }
    }
  }

  // Remove any VGPRs used in the return value because these do not need to be
  // saved. This prevents CSR restore from clobbering return VGPRs.
  if (ReturnMI) {
    for (auto &Op : ReturnMI->operands()) {
      if (Op.isReg())
        SavedVGPRs.reset(Op.getReg());
    }
  }

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
  // On gfx908 there are no AGPR loads and stores, so spilling an AGPR also
  // requires a temporary VGPR.
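  // (Sketch of the gfx908 path, for context only: an AGPR spill there would
  // expand to a V_ACCVGPR_READ_B32 into a temporary VGPR followed by an
  // ordinary VGPR store, with the restore being the reverse, so AGPRs are
  // simply dropped from the saved set below rather than modeling that extra
  // cost here.)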
  if (!ST.hasGFX90AInsts())
    SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());

  determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);

  // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
  // allow the default insertion to handle them.
  for (auto &Reg : MFI->getWWMSpills())
    SavedVGPRs.reset(Reg.first);

  // Mark all lane VGPRs as BB LiveIns.
  for (MachineBasicBlock &MBB : MF) {
    for (auto &Reg : MFI->getWWMSpills())
      MBB.addLiveIn(Reg.first);

    MBB.sortUniqueLiveIns();
  }
}

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());

  const BitVector AllSavedRegs = SavedRegs;
  SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

  // We have to anticipate introducing CSR VGPR spills, or a spill of the
  // caller-saved VGPR reserved for SGPR spills, because we now always create a
  // stack entry for it even if there are no other stack objects, since an FP
  // is required when there is both a call and a stack. A VGPR is allocated for
  // SGPR spills whenever there are any, whether they are CSR spills or
  // otherwise.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const bool WillHaveFP =
      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

  // FP will be specially managed like SP.
  if (WillHaveFP || hasFP(MF))
    SavedRegs.reset(MFI->getFrameOffsetReg());

  // The return address use in the return instruction is hidden by the
  // SI_RETURN pseudo. Given that, and since IPRA computes actual register
  // usage and does not use the CSR list, the clobbering of the return address
  // by function calls (D117243) or otherwise (D120922) is ignored/not seen by
  // IPRA's register usage collection. This ensures the return address is saved
  // and restored in those scenarios.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  Register RetAddrReg = TRI->getReturnAddressReg(MF);
  if (!MFI->isEntryFunction() &&
      (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
  }
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!
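
  // What follows only redirects the FP/BP entries in CSI to their scratch SGPR
  // copy destinations, when such copies were chosen earlier; every other
  // callee-saved register keeps the default spill slot assignment, and we
  // return false so the generic code still lays out those slots.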

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg = RI->getBaseRegister();
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  Register SGPRForBPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
  if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
    return false;

  unsigned NumModifiedRegs = 0;

  if (SGPRForFPSaveRestoreCopy)
    NumModifiedRegs++;
  if (SGPRForBPSaveRestoreCopy)
    NumModifiedRegs++;

  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(SGPRForFPSaveRestoreCopy);
      if (--NumModifiedRegs)
        break;
    } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs)
        break;
    }
  }

  return false;
}

bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
    const MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint64_t EstStackSize = MFI.estimateStackSize(MF);
  uint64_t MaxOffset = EstStackSize - 1;

  // We need the emergency stack slots to be allocated in range of the
  // MUBUF/flat scratch immediate offset from the base register, so assign
  // these first at the incoming SP position.
  //
  // TODO: We could try sorting the objects to find a hole in the first bytes
  // rather than allocating as close as possible. This could save a lot of
  // space on frames with alignment requirements.
  if (ST.enableFlatScratch()) {
    const SIInstrInfo *TII = ST.getInstrInfo();
    if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                               SIInstrFlags::FlatScratch))
      return false;
  } else {
    if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset))
      return false;
  }

  return true;
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

    Amount *= getScratchScaleFactor(ST);
    if (IsDestroy)
      Amount = -Amount;
    auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
                   .addReg(SPReg)
                   .addImm(Amount);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
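    // Note (an assumption based on getScratchScaleFactor): the adjustment is
    // pre-scaled so the SP stays in the units the target's scratch addressing
    // expects, e.g. scaled by the wave size for MUBUF scratch.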
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

/// Returns true if the frame will require a reference to the stack pointer.
///
/// This is the set of conditions common to setting up the stack pointer in a
/// kernel, and for using a frame pointer in a callable function.
///
/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
/// references SP.
static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
  return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
}

// The FP for kernels is always known to be 0, so we never really need to set
// up an explicit register for it. However, DisableFramePointerElim will force
// us to use a register for it.
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry functions we can use an immediate offset in most cases, so the
  // presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    // All offsets are unsigned, so they need to be addressed in the same
    // direction as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
         MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
             MF) ||
         MF.getTarget().Options.DisableFramePointerElim(MF);
}

// This is essentially a reduced version of hasFP for entry functions. Since
// the stack pointer is known to be 0 on entry to kernels, we never really need
// an FP register. We may need to initialize the stack pointer depending on the
// frame properties, which logically overlaps many of the cases where an
// ordinary function would require an FP.
bool SIFrameLowering::requiresStackPointerReference(
    const MachineFunction &MF) const {
  // Callable functions always require a stack pointer reference.
  assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
         "only expected to call this for entry points");

  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Entry points ordinarily don't need to initialize SP. We have to set it up
  // for callees if there are any. Also note tail calls are impossible/don't
  // make any sense for kernels.
  if (MFI.hasCalls())
    return true;

  // We still need to initialize the SP if we're doing anything weird that
  // references the SP, like variable sized stack objects.
  return frameTriviallyRequiresSP(MFI);
}