//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableSpillVGPRToAGPR(
    "amdgpu-spill-vgpr-to-agpr",
    cl::desc("Enable spilling VGPRs to AGPRs"),
    cl::ReallyHidden,
    cl::init(true));

// Find a register matching \p RC from \p LiveUnits which is unused and
// available throughout the function. On failure, returns AMDGPU::NoRegister.
// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
// MCRegisters. This should reduce the number of iterations and avoid redundant
// checking.
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
                                     const LiveRegUnits &LiveUnits,
                                     const TargetRegisterClass &RC) {
  for (MCRegister Reg : RC) {
    if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
        !MRI.isReserved(Reg))
      return Reg;
  }
  return MCRegister();
}

// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(
    MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
    const TargetRegisterClass &RC, bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveUnits.addReg(CSRegs[i]);

  // We are looking for a register that can be used throughout the entire
  // function, so any use is unacceptable.
  if (Unused)
    return findUnusedRegister(MRI, LiveUnits, RC);

  for (MCRegister Reg : RC) {
    if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
      return Reg;
  }

  return MCRegister();
}

/// Query target location for spilling SGPRs
/// \p IncludeScratchCopy : Also look for free scratch SGPRs
static void getVGPRSpillLaneOrTempRegister(
    MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
    const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
    bool IncludeScratchCopy = true) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);

  // We need to save and restore the given SGPR.

  Register ScratchSGPR;
  // 1: Try to save the given register into an unused scratch SGPR. The
  // LiveUnits should have all the callee saved registers marked as used. In
  // certain cases we skip the copy to a scratch SGPR.
  if (IncludeScratchCopy)
    ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);

  if (!ScratchSGPR) {
    int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
                                         TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() &&
        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
                                         /*IsPrologEpilog=*/true)) {
      // 2: There's no free lane to spill, and no free register to save the
      // SGPR, so we're forced to take another VGPR to use for the spill.
      MFI->addToPrologEpilogSGPRSpills(
          SGPR, PrologEpilogSGPRSaveRestoreInfo(
                    SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

      LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
                 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                        << '\n';);
    } else {
      // Remove dead <FI> index
      MF.getFrameInfo().RemoveStackObject(FI);
      // 3: If all else fails, spill the register to memory.
      FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
      MFI->addToPrologEpilogSGPRSpills(
          SGPR,
          PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
                        << printReg(SGPR, TRI) << '\n');
    }
  } else {
    MFI->addToPrologEpilogSGPRSpills(
        SGPR, PrologEpilogSGPRSaveRestoreInfo(
                  SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
    LiveUnits.addReg(ScratchSGPR);
    LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
                      << printReg(ScratchSGPR, TRI) << '\n');
  }
}

// We need to specially emit stack operations here because a different frame
// register is used here than the one getFrameRegister would return for the
// rest of the function.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LiveRegUnits &LiveUnits, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             Register SpillReg, int FI, Register FrameReg,
                             int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  LiveUnits.addReg(SpillReg);
  bool IsKill = !MBB.isLiveIn(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
  if (IsKill)
    LiveUnits.removeReg(SpillReg);
}

static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LiveRegUnits &LiveUnits, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, Register SpillReg, int FI,
                               Register FrameReg, int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ?
                     AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                     : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
}

static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
      .addReg(GitPtrLo);
}

static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
                          const SIMachineFunctionInfo *FuncInfo,
                          MachineFunction &MF, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveUnits.empty()) {
    LiveUnits.init(TRI);
    if (IsProlog) {
      LiveUnits.addLiveIns(MBB);
    } else {
      // In epilog.
      LiveUnits.addLiveOuts(MBB);
      LiveUnits.stepBackward(*MBBI);
    }
  }
}

namespace llvm {

// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
// BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
class PrologEpilogSGPRSpillBuilder {
  MachineBasicBlock::iterator MI;
  MachineBasicBlock &MBB;
  MachineFunction &MF;
  const GCNSubtarget &ST;
  MachineFrameInfo &MFI;
  SIMachineFunctionInfo *FuncInfo;
  const SIInstrInfo *TII;
  const SIRegisterInfo &TRI;
  Register SuperReg;
  const PrologEpilogSGPRSaveRestoreInfo SI;
  LiveRegUnits &LiveUnits;
  const DebugLoc &DL;
  Register FrameReg;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  unsigned EltSize = 4;

  void saveToMemory(const int FI) const {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ?
                                SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SubReg);

      buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
                       FI, FrameReg, DwordOff);
      DwordOff += 4;
    }
  }

  void saveToVGPRLane(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
              Spill[I].VGPR)
          .addReg(SubReg)
          .addImm(Spill[I].Lane)
          .addReg(Spill[I].VGPR, RegState::Undef);
    }
  }

  void copyToScratchSGPR(Register DstReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(SuperReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  void restoreFromMemory(const int FI) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));

      buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
                         TmpVGPR, FI, FrameReg, DwordOff);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
          .addReg(TmpVGPR, RegState::Kill);
      DwordOff += 4;
    }
  }

  void restoreFromVGPRLane(const int FI) {
    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
          .addReg(Spill[I].VGPR)
          .addImm(Spill[I].Lane);
    }
  }

  void copyFromScratchSGPR(Register SrcReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
        .addReg(SrcReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

public:
  PrologEpilogSGPRSpillBuilder(Register Reg,
                               const PrologEpilogSGPRSaveRestoreInfo SI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, const SIInstrInfo *TII,
                               const SIRegisterInfo &TRI,
                               LiveRegUnits &LiveUnits, Register FrameReg)
      : MI(MI), MBB(MBB), MF(*MBB.getParent()),
        ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
        FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
        FrameReg(FrameReg) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ?
                     1 : SplitParts.size();

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  }

  void save() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return saveToMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return saveToVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyToScratchSGPR(SI.getReg());
    }
  }

  void restore() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return restoreFromMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return restoreFromVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyFromScratchSGPR(SI.getReg());
    }
  }
};

} // namespace llvm

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LiveRegUnits LiveUnits;
    LiveUnits.init(*TRI);
    LiveUnits.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
          MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ?
            16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
                   .addReg(FlatScrInitHi)
                   .addImm(0xffff);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else {
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
          .addReg(FlatScrInitLo)
          .addReg(ScratchWaveOffsetReg);
      auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                          FlatScrInitHi)
                      .addReg(FlatScrInitHi)
                      .addImm(0);
      Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
          addReg(FlatScrInitLo).
          addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                         (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
          addReg(FlatScrInitHi).
          addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                         (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
    auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                        AMDGPU::FLAT_SCR_HI)
                    .addReg(FlatScrInitHi)
                    .addImm(0);
    Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
                      AMDGPU::FLAT_SCR_HI)
                  .addReg(FlatScrInitLo, RegState::Kill)
                  .addImm(8);
  LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

// Shift down registers reserved for the scratch RSRC.
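// Returns the SGPR quad that ends up holding the scratch resource descriptor,
// or a null Register when the function has no remaining scratch accesses.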
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found clobbers the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  if (requiresStackPointerReference(MF)) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  bool NeedsFlatScratchInit =
      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
        .addReg(Rsrc01)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
        .addMemOperand(MMO);

    // The driver will always set the SRD for wave 64 (bits 118:117 of the
    // descriptor / bits 22:21 of the third sub-reg will be 0b11).
    // If the shader is actually wave32 we have to modify the const_index_stride
    // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
    // reason the driver does this is that there can be cases where it presents
    // 2 shaders with different wave size (e.g. VsFs).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
    if (ST.isWave32()) {
      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
          .addImm(21)
          .addReg(Rsrc03);
    }
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
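    // Rsrc23 packs the constant dwords 2-3 of the scratch descriptor
    // (num_records and the format/stride flags); only dwords 0-1 below carry
    // the base pointer.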
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addImm(0) // offset
            .addImm(0) // cpol
            .addMemOperand(MMO)
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
          .addExternalSymbol("SCRATCH_RSRC_DWORD0")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
          .addExternalSymbol("SCRATCH_RSRC_DWORD1")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
        .addImm(Rsrc23 & 0xffffffff)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
        .addImm(Rsrc23 >> 32)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
                  .addReg(ScratchRsrcSub1)
                  .addImm(0)
                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
}

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::ScalableVector:
  case TargetStackID::WasmLocal:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. It returns the saved exec.
static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const DebugLoc &DL, bool IsProlog,
                                     bool EnableInactiveLanes) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveUnits, *TRI.getWaveMaskRegClass());
  if (!ScratchExecCopy)
    report_fatal_error("failed to find free scratch register");

  LiveUnits.addReg(ScratchExecCopy);

  const unsigned SaveExecOpc =
      ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
                                           : AMDGPU::S_OR_SAVEEXEC_B32)
                    : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
                                           : AMDGPU::S_OR_SAVEEXEC_B64);
  auto SaveExec =
      BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
  SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.

  return ScratchExecCopy;
}

void SIFrameLowering::emitCSRSpillStores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
  // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
  // might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ true, /*EnableInactiveLanes*/ true);

  auto StoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                           VGPR, FI, FrameReg);
        }
      };

  StoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ?
          AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ true,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  StoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveUnits.addReg(ScratchExecCopy);
  }

  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP spill:
    // If FP is saved to a scratch SGPR, skip it; that save has already been
    // emitted. Otherwise, FP has been moved into a temporary register, so
    // spill that register instead.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.save();
  }

  // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
  // such scratch registers live throughout the function.
  SmallVector<Register, 1> ScratchSGPRs;
  FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
  if (!ScratchSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : ScratchSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
    if (!LiveUnits.empty()) {
      for (MCPhysReg Reg : ScratchSGPRs)
        LiveUnits.addReg(Reg);
    }
  }
}

void SIFrameLowering::emitCSRSpillRestores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP restore:
    // Skip it if FP is restored from the scratch SGPR. Otherwise, restore the
    // FP value into a temporary register. The frame pointer should be
    // overwritten only at the end, once all other spills have been restored
    // from the current frame.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.restore();
  }

  // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
  // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
  // this, we might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ false, /*EnableInactiveLanes*/ true);

  auto RestoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                             VGPR, FI, FrameReg);
        }
      };

  RestoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ false,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  RestoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LiveRegUnits LiveUnits;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  // DebugLoc must be unknown since the first instruction with DebugLoc is used
  // to determine the end of the prologue.
  DebugLoc DL;

  if (FuncInfo->isChainFunction()) {
    // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
    // are free to set one up if they need it.
    bool UseSP = requiresStackPointerReference(MF);
    if (UseSP) {
      assert(StackPtrReg != AMDGPU::SP_REG);

      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
          .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
    }
  }

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;

  if (TRI.hasStackRealignment(MF))
    HasFP = true;

  Register FramePtrRegScratchCopy;
  if (!HasFP && !hasFP(MF)) {
    // Emit the CSR spill stores with SP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
                       FuncInfo->isChainFunction() ? Register() : StackPtrReg,
                       FramePtrRegScratchCopy);
  } else {
    // CSR spill stores will use FP as base register.
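    // If a free scratch SGPR was reserved for the FP save, emit that copy now;
    // otherwise stash FP in a temporary register so it can be spilled once the
    // new frame has been set up.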
    Register SGPRForFPSaveRestoreCopy =
        FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
    if (SGPRForFPSaveRestoreCopy) {
      // Copy FP to the scratch register now and emit the CFI entry. It avoids
      // the extra FP copy needed in the other two cases when FP is spilled to
      // memory or to a VGPR lane.
      PrologEpilogSGPRSpillBuilder SB(
          FramePtrReg,
          FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
          DL, TII, TRI, LiveUnits, FramePtrReg);
      SB.save();
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      // Copy FP into a new scratch register so that its previous value can be
      // spilled after setting up the new frame.
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
          .addReg(FramePtrReg);
    }
  }

  if (HasFP) {
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveUnits.empty()) {
      LiveUnits.init(TRI);
      LiveUnits.addLiveIns(MBB);
    }

    // s_add_i32 s33, s32, NumBytes
    // s_and_b32 s33, s33, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
                   .addReg(FramePtrReg, RegState::Kill)
                   .addImm(-Alignment * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If FP is used, emit the CSR spills with FP base register.
  if (HasFP) {
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                       FramePtrRegScratchCopy);
    if (FramePtrRegScratchCopy)
      LiveUnits.removeReg(FramePtrRegScratchCopy);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
                   .addReg(StackPtrReg)
                   .addImm(RoundedSize * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
  (void)FPSaved;
  assert((!HasFP || FPSaved) &&
         "Needed to save FP but didn't save it anywhere");

  // If we allow spilling to AGPRs we may have saved FP but then spill
  // everything into AGPRs instead of the stack.
  assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
         "Saved FP but didn't need it");

  bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
  (void)BPSaved;
  assert((!HasBP || BPSaved) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LiveRegUnits LiveUnits;
  // Get the insert location for the epilogue. If there were no terminators in
  // the block, get the last instruction.
  MachineBasicBlock::iterator MBBI = MBB.end();
  DebugLoc DL;
  if (!MBB.empty()) {
    MBBI = MBB.getLastNonDebugInstr();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    MBBI = MBB.getFirstTerminator();
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);

  Register FramePtrRegScratchCopy;
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  if (FPSaved) {
    // CSR spill restores should use FP as base register. If
    // SGPRForFPSaveRestoreCopy is not set, restore the previous value of FP
    // into a new scratch register and copy it to FP later, when the other
    // registers are restored from the current stack frame.
    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
    if (SGPRForFPSaveRestoreCopy) {
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
    }

    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                         FramePtrRegScratchCopy);
  }

  if (RoundedSize != 0 && hasFP(MF)) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
                   .addReg(StackPtrReg)
                   .addImm(-static_cast<int64_t>(RoundedSize *
                                                 getScratchScaleFactor(ST)))
                   .setMIFlag(MachineInstr::FrameDestroy);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  if (FPSaved) {
    // Insert the copy to restore FP.
    Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
                                               : FramePtrRegScratchCopy;
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
            .addReg(SrcReg);
    if (SGPRForFPSaveRestoreCopy)
      MIB.setMIFlag(MachineInstr::FrameDestroy);
  } else {
    // Insert the CSR spill restores with SP as the base register.
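    // No FP save entry exists for this function, so the restores can address
    // the frame directly through SP.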
    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
                         FramePtrRegScratchCopy);
  }
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
      return false;
    }
  }

  return true;
}
#endif

StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
                                                    int FI,
                                                    Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF,
    RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  // Allocate spill slots for WWM reserved VGPRs.
  // For chain functions, we only need to do this if we have calls to
  // llvm.amdgcn.cs.chain.
  bool IsChainWithoutCalls =
      FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
  if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
    for (Register Reg : FuncInfo->getWWMReservedRegs()) {
      const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
      FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
                                 TRI->getSpillAlign(*RC));
    }
  }

  const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
                               && EnableSpillVGPRToAGPR;

  if (SpillVGPRToAGPR) {
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
    BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);

    bool SeenDbgInstr = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
        int FrameIndex;
        if (MI.isDebugInstr())
          SeenDbgInstr = true;

        if (TII->isVGPRSpill(MI)) {
          // Try to eliminate stack used by VGPR spills before frame
          // finalization.
          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::vaddr);
          int FI = MI.getOperand(FIOp).getIndex();
          Register VReg =
              TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                TRI->isAGPR(MRI, VReg))) {
            assert(RS != nullptr);
            RS->enterBasicBlockEnd(MBB);
            RS->backward(std::next(MI.getIterator()));
            TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
            SpillFIs.set(FI);
            continue;
          }
        } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
                   TII->isLoadFromStackSlot(MI, FrameIndex))
          if (!MFI.isFixedObjectIndex(FrameIndex))
            NonVGPRSpillFIs.set(FrameIndex);
      }
    }

    // Stack slot coloring may assign different objects to the same stack slot.
    // If not, then the VGPR to AGPR spill slot is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead update it with the
        // correct register value. But not sure the register value alone is
        // adequate to lower the DIExpression. It should be worked out later.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
              !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
              SpillFIs[MI.getOperand(0).getIndex()]) {
            MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
    }
  }
}

void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we had initially reserved the highest available VGPR for AGPR
    // copies. Now that RA is done, check whether there is an unused VGPR that
    // is lower than the one reserved before RA. If one exists, use it for the
    // AGPR copy instead of the one reserved before RA.
void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we initially reserved the highest available VGPR for the
    // AGPR copy. Now that RA is done, check whether an unused VGPR with a
    // lower index than the one reserved before RA exists. If so, use it for
    // the AGPR copy instead.
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Reserve this newly identified VGPR for the AGPR copy. Reserved
      // registers should already be frozen at this point, so we can avoid
      // calling MRI.freezeReservedRegs and just use MRI.reserveReg.
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.reserveReg(UnusedLowVGPR, TRI);
    }
  }
  // We initially reserved the highest available SGPR pair for long branches;
  // now that RA is done, shift down to a lower unused pair if one exists.
  Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
  Register UnusedLowSGPR =
      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
  // If LongBranchReservedReg is null, we never found a long branch and never
  // reserved a register to begin with, so there is nothing to shift down. If
  // UnusedLowSGPR is null, no lower register is available, so keep the one
  // originally reserved.
  if (LongBranchReservedReg && UnusedLowSGPR) {
    FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
    MRI.reserveReg(UnusedLowSGPR, TRI);
  }
}

// Handle the special SGPR spills, such as those needed for the FP, the BP, or
// any reserved registers whose saving is delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs,
    bool NeedExecCopyReservedReg) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  LiveRegUnits LiveUnits;
  LiveUnits.init(*TRI);
  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveUnits.addReg(CSRegs[I]);

  const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();

  if (NeedExecCopyReservedReg) {
    Register ReservedReg = MFI->getSGPRForEXECCopy();
    assert(ReservedReg && "Should have reserved an SGPR for EXEC copy.");
    Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
    if (UnusedScratchReg) {
      // If we found an unused scratch SGPR, use it for the EXEC copy directly;
      // no spill is needed in that case.
      MFI->setSGPRForEXECCopy(UnusedScratchReg);
      LiveUnits.addReg(UnusedScratchReg);
    } else {
      // Needs spill.
      assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) &&
             "Re-reserving spill slot for EXEC copy register");
      getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedReg, RC,
                                     /*IncludeScratchCopy=*/false);
    }
  }

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
  }

  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
  }
}

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If this is a function with the amdgpu_cs_chain[_preserve] calling
  // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
  // we don't need to save and restore anything.
  if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
    return;

  MFI->shiftSpillPhysVGPRsToLowestRange(MF);

  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool NeedExecCopyReservedReg = false;

  MachineInstr *ReturnMI = nullptr;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // WRITELANE instructions used for SGPR spills can overwrite the inactive
      // lanes of VGPRs, so the callee must spill and restore those VGPRs even
      // if they are marked caller-saved.

      // TODO: Handle this elsewhere at an early point. Walking through all
      // MBBs here would be a bad heuristic. A better approach would be to call
      // allocateWWMSpill during the regalloc pipeline whenever a physical
      // register is allocated for the intended virtual registers.
      if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
        MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
      else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
        MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
      else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
        NeedExecCopyReservedReg = true;
      else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
               MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
               (MFI->isChainFunction() &&
                TII->isChainCallOpcode(MI.getOpcode()))) {
        // We expect all returns to use the same number of register operands.
        assert(!ReturnMI ||
               (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
                count_if(ReturnMI->operands(),
                         [](auto Op) { return Op.isReg(); })));
        ReturnMI = &MI;
      }
    }
  }

  // Remove any VGPRs used in the return value, because these do not need to
  // be saved. This prevents the CSR restore from clobbering return VGPRs.
  if (ReturnMI) {
    for (auto &Op : ReturnMI->operands()) {
      if (Op.isReg())
        SavedVGPRs.reset(Op.getReg());
    }
  }

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
  // gfx908 has no direct AGPR loads and stores, so spilling an AGPR would also
  // require a temporary VGPR.
  if (!ST.hasGFX90AInsts())
    SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());

  determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);

  // The whole-wave VGPRs need to be specially inserted in the prolog, so don't
  // allow the default insertion to handle them.
  for (auto &Reg : MFI->getWWMSpills())
    SavedVGPRs.reset(Reg.first);

  // Mark all WWM spill VGPRs as live-in for every basic block.
  for (MachineBasicBlock &MBB : MF) {
    for (auto &Reg : MFI->getWWMSpills())
      MBB.addLiveIn(Reg.first);

    MBB.sortUniqueLiveIns();
  }
}

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specially managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());

  const BitVector AllSavedRegs = SavedRegs;
  SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

  // We have to anticipate introducing CSR VGPR spills, or a spill of the
  // caller-saved VGPR reserved for SGPR spills, since we now always create a
  // stack entry for it even if there are no other stack objects, and an FP is
  // required whenever there is both a call and a stack. A VGPR is allocated
  // for SGPR spills whenever there are any SGPR spills, whether they are CSR
  // spills or not.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const bool WillHaveFP =
      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

  // FP will be specially managed like SP.
  if (WillHaveFP || hasFP(MF))
    SavedRegs.reset(MFI->getFrameOffsetReg());

  // The return address's use by the return instruction is hidden behind the
  // SI_RETURN pseudo. Since IPRA computes actual register usage and does not
  // consult the CSR list, clobbering of the return address by function calls
  // (D117243) or otherwise (D120922) is not visible to IPRA's register usage
  // collection. Saving it explicitly here ensures the return address is saved
  // and restored in those scenarios.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  Register RetAddrReg = TRI->getReturnAddressReg(MF);
  if (!MFI->isEntryFunction() &&
      (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
  }
}

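// Redirect the FP and/or BP saves to their scratch SGPR copy destinations, if
// such copies were reserved. Returning false lets the generic code assign
// stack slots for the remaining callee-saved registers.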
bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg = RI->getBaseRegister();
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  Register SGPRForBPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
  if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
    return false;

  unsigned NumModifiedRegs = 0;

  if (SGPRForFPSaveRestoreCopy)
    NumModifiedRegs++;
  if (SGPRForBPSaveRestoreCopy)
    NumModifiedRegs++;

  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(SGPRForFPSaveRestoreCopy);
      // Stop once every register with a scratch copy has been handled.
      if (--NumModifiedRegs == 0)
        break;
    } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs == 0)
        break;
    }
  }

  return false;
}

bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
    const MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  uint64_t EstStackSize = MFI.estimateStackSize(MF);
  uint64_t MaxOffset = EstStackSize - 1;

  // We need the emergency stack slots to be allocated in range of the
  // MUBUF/flat scratch immediate offset from the base register, so assign
  // these first at the incoming SP position.
  //
  // TODO: We could try sorting the objects to find a hole in the first bytes
  // rather than allocating as close as possible. This could save a lot of
  // space on frames with alignment requirements.
  if (ST.enableFlatScratch()) {
    if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                               SIInstrFlags::FlatScratch))
      return false;
  } else {
    if (TII->isLegalMUBUFImmOffset(MaxOffset))
      return false;
  }

  return true;
}

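// Lower the call frame setup/destroy pseudo instructions. When the call frame
// is not reserved, the stack pointer is adjusted in place by the
// (scratch-scaled) call frame size.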
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

    Amount *= getScratchScaleFactor(ST);
    if (IsDestroy)
      Amount = -Amount;
    auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
                   .addReg(SPReg)
                   .addImm(Amount);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

/// Returns true if the frame will require a reference to the stack pointer.
///
/// This is the set of conditions common to setting up the stack pointer in a
/// kernel, and for using a frame pointer in a callable function.
///
/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
/// references SP.
static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
  return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
}

// The FP for kernels is always known to be 0, so we never really need to set
// up an explicit register for it. However, DisableFramePointerElim will force
// us to use a register for it.
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry & chain functions we can use an immediate offset in most cases,
  // so the presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
    // All offsets are unsigned, so they need to be addressed in the same
    // direction as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
         MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
             MF) ||
         MF.getTarget().Options.DisableFramePointerElim(MF);
}

// This is essentially a reduced version of hasFP for entry functions. Since
// the stack pointer is known to be 0 on entry to kernels, we never really need
// an FP register. We may need to initialize the stack pointer depending on the
// frame properties, which logically overlaps many of the cases where an
// ordinary function would require an FP.
// Also used for chain functions. While not technically entry functions, chain
// functions may need to set up a stack pointer in some situations.
bool SIFrameLowering::requiresStackPointerReference(
    const MachineFunction &MF) const {
  // Callable functions always require a stack pointer reference.
  assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
          MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
         "only expected to call this for entry points and chain functions");

  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Entry points ordinarily don't need to initialize SP. We have to set it up
  // for callees if there are any. Also note tail calls are impossible/don't
  // make any sense for kernels.
  if (MFI.hasCalls())
    return true;

  // We still need to initialize the SP if we're doing anything weird that
  // references the SP, like variable sized stack objects.
  return frameTriviallyRequiresSP(MFI);
}