1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI implementation of the TargetRegisterInfo class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPU.h" 15 #include "AMDGPURegisterBankInfo.h" 16 #include "GCNSubtarget.h" 17 #include "MCTargetDesc/AMDGPUInstPrinter.h" 18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "SIRegisterInfo.h" 21 #include "llvm/CodeGen/LiveIntervals.h" 22 #include "llvm/CodeGen/LiveRegUnits.h" 23 #include "llvm/CodeGen/MachineDominators.h" 24 #include "llvm/CodeGen/MachineFrameInfo.h" 25 #include "llvm/CodeGen/RegisterScavenging.h" 26 27 using namespace llvm; 28 29 #define GET_REGINFO_TARGET_DESC 30 #include "AMDGPUGenRegisterInfo.inc" 31 32 static cl::opt<bool> EnableSpillSGPRToVGPR( 33 "amdgpu-spill-sgpr-to-vgpr", 34 cl::desc("Enable spilling SGPRs to VGPRs"), 35 cl::ReallyHidden, 36 cl::init(true)); 37 38 std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts; 39 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable; 40 41 // Map numbers of DWORDs to indexes in SubRegFromChannelTable. 42 // Valid indexes are shifted 1, such that a 0 mapping means unsupported. 43 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8, 44 // meaning index 7 in SubRegFromChannelTable. 45 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = { 46 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9}; 47 48 namespace llvm { 49 50 // A temporary struct to spill SGPRs. 51 // This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits 52 // just v_writelane and v_readlane. 53 // 54 // When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR 55 // is saved to scratch (or the other way around for loads). 56 // For this, a VGPR is required where the needed lanes can be clobbered. The 57 // RegScavenger can provide a VGPR where currently active lanes can be 58 // clobbered, but we still need to save inactive lanes. 59 // The high-level steps are: 60 // - Try to scavenge SGPR(s) to save exec 61 // - Try to scavenge VGPR 62 // - Save needed, all or inactive lanes of a TmpVGPR 63 // - Spill/Restore SGPRs using TmpVGPR 64 // - Restore TmpVGPR 65 // 66 // To save all lanes of TmpVGPR, exec needs to be saved and modified. If we 67 // cannot scavenge temporary SGPRs to save exec, we use the following code: 68 // buffer_store_dword TmpVGPR ; only if active lanes need to be saved 69 // s_not exec, exec 70 // buffer_store_dword TmpVGPR ; save inactive lanes 71 // s_not exec, exec 72 struct SGPRSpillBuilder { 73 struct PerVGPRData { 74 unsigned PerVGPR; 75 unsigned NumVGPRs; 76 int64_t VGPRLanes; 77 }; 78 79 // The SGPR to save 80 Register SuperReg; 81 MachineBasicBlock::iterator MI; 82 ArrayRef<int16_t> SplitParts; 83 unsigned NumSubRegs; 84 bool IsKill; 85 const DebugLoc &DL; 86 87 /* When spilling to stack */ 88 // The SGPRs are written into this VGPR, which is then written to scratch 89 // (or vice versa for loads). 90 Register TmpVGPR = AMDGPU::NoRegister; 91 // Temporary spill slot to save TmpVGPR to. 
  int TmpVGPRIndex = 0;
  // If TmpVGPR is live before the spill or if it is scavenged.
  bool TmpVGPRLive = false;
  // Scavenged SGPR to save EXEC.
  Register SavedExecReg = AMDGPU::NoRegister;
  // Stack index to write the SGPRs to.
  int Index;
  unsigned EltSize = 4;

  RegScavenger *RS;
  MachineBasicBlock *MBB;
  MachineFunction &MF;
  SIMachineFunctionInfo &MFI;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  bool IsWave32;
  Register ExecReg;
  unsigned MovOpc;
  unsigned NotOpc;

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, int Index,
                   RegScavenger *RS)
      : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
                         MI->getOperand(0).isKill(), Index, RS) {}

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
                   bool IsKill, int Index, RegScavenger *RS)
      : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
        Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
        MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        IsWave32(IsWave32) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    if (IsWave32) {
      ExecReg = AMDGPU::EXEC_LO;
      MovOpc = AMDGPU::S_MOV_B32;
      NotOpc = AMDGPU::S_NOT_B32;
    } else {
      ExecReg = AMDGPU::EXEC;
      MovOpc = AMDGPU::S_MOV_B64;
      NotOpc = AMDGPU::S_NOT_B64;
    }

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
    assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
           SuperReg != AMDGPU::EXEC && "exec should never spill");
  }

  PerVGPRData getPerVGPRData() {
    PerVGPRData Data;
    Data.PerVGPR = IsWave32 ? 32 : 64;
    Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
    Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
    return Data;
  }

  // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
  // free.
  // Writes these instructions if an SGPR can be scavenged:
  // s_mov_b64 s[6:7], exec   ; Save exec
  // s_mov_b64 exec, 3        ; Wanted lanemask
  // buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_store_dword v0    ; Only if no free VGPR was found
  // s_not_b64 exec, exec
  // buffer_store_dword v0    ; Save inactive lanes
  //                          ; exec stays inverted, it is flipped back in
  //                          ; restore.
  void prepare() {
    // Scavenged temporary VGPR to use. It must be scavenged once for any number
    // of spilled subregs.
    // FIXME: The liveness analysis is limited and does not tell if a register
    // is in use in lanes that are currently inactive. We can never be sure if
    // a register is actually in use in another lane, so we need to save all
    // used lanes of the chosen VGPR.
    assert(RS && "Cannot spill SGPR to memory without RegScavenger");
    TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
                                            0, false);

    // Reserve temporary stack slot
    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
    if (TmpVGPR) {
      // Found a register that is dead in the currently active lanes, so we
      // only need to spill inactive lanes.
      TmpVGPRLive = false;
    } else {
      // Pick v0 because it doesn't make a difference.
      TmpVGPR = AMDGPU::VGPR0;
      TmpVGPRLive = true;
    }

    if (TmpVGPRLive) {
      // We need to inform the scavenger that this index is already in use until
      // we're done with the custom emergency spill.
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
    }

    // We may end up recursively calling the scavenger, and don't want to re-use
    // the same register.
    RS->setRegUsed(TmpVGPR);

    // Try to scavenge SGPRs to save exec
    assert(!SavedExecReg && "Exec is already saved, refuse to save again");
    const TargetRegisterClass &RC =
        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    RS->setRegUsed(SuperReg);
    SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);

    int64_t VGPRLanes = getPerVGPRData().VGPRLanes;

    if (SavedExecReg) {
      RS->setRegUsed(SavedExecReg);
      // Set exec to needed lanes
      BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
      auto I =
          BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    } else {
      // The modify and restore of exec clobber SCC, which we would have to save
      // and restore. FIXME: We probably would need to reserve a register for
      // this.
      if (RS->isRegUsed(AMDGPU::SCC))
        MI->emitError("unhandled SGPR spill to memory");

      // Spill active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
                                    /*IsKill*/ false);
      // Spill inactive lanes
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    }
  }

  // Writes these instructions if an SGPR can be scavenged:
  // buffer_load_dword v1    ; Restore scavenged VGPR from emergency slot
  // s_waitcnt vmcnt(0)      ; If a free VGPR was found
  // s_mov_b64 exec, s[6:7]  ; Restore exec
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_load_dword v0    ; Restore inactive lanes
  // s_waitcnt vmcnt(0)      ; If a free VGPR was found
  // s_not_b64 exec, exec
  // buffer_load_dword v0    ; Only if no free VGPR was found
  void restore() {
    if (SavedExecReg) {
      // Restore used lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      // Restore exec
      auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
                   .addReg(SavedExecReg, RegState::Kill);
      // Add an implicit use of the load so it is not dead.
      // FIXME: This inserts an unnecessary waitcnt
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      }
    } else {
      // Restore inactive lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.
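      // Exec now holds the original live mask again (prepare() left it
      // inverted), so the guarded load below refills the lanes that were
      // active before the spill.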
268 269 // Restore active lanes 270 if (TmpVGPRLive) 271 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true); 272 } 273 274 // Inform the scavenger where we're releasing our custom scavenged register. 275 if (TmpVGPRLive) { 276 MachineBasicBlock::iterator RestorePt = std::prev(MI); 277 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt); 278 } 279 } 280 281 // Write TmpVGPR to memory or read TmpVGPR from memory. 282 // Either using a single buffer_load/store if exec is set to the needed mask 283 // or using 284 // buffer_load 285 // s_not exec, exec 286 // buffer_load 287 // s_not exec, exec 288 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) { 289 if (SavedExecReg) { 290 // Spill needed lanes 291 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); 292 } else { 293 // The modify and restore of exec clobber SCC, which we would have to save 294 // and restore. FIXME: We probably would need to reserve a register for 295 // this. 296 if (RS->isRegUsed(AMDGPU::SCC)) 297 MI->emitError("unhandled SGPR spill to memory"); 298 299 // Spill active lanes 300 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad, 301 /*IsKill*/ false); 302 // Spill inactive lanes 303 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 304 Not0->getOperand(2).setIsDead(); // Mark SCC as dead. 305 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); 306 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 307 Not1->getOperand(2).setIsDead(); // Mark SCC as dead. 308 } 309 } 310 311 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) { 312 assert(MBB->getParent() == &MF); 313 MI = NewMI; 314 MBB = NewMBB; 315 } 316 }; 317 318 } // namespace llvm 319 320 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) 321 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST), 322 SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { 323 324 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 && 325 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) && 326 (getSubRegIndexLaneMask(AMDGPU::lo16) | 327 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() == 328 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() && 329 "getNumCoveredRegs() will not work with generated subreg masks!"); 330 331 RegPressureIgnoredUnits.resize(getNumRegUnits()); 332 RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin()); 333 for (auto Reg : AMDGPU::VGPR_16RegClass) { 334 if (AMDGPU::isHi(Reg, *this)) 335 RegPressureIgnoredUnits.set(*regunits(Reg).begin()); 336 } 337 338 // HACK: Until this is fully tablegen'd. 339 static llvm::once_flag InitializeRegSplitPartsFlag; 340 341 static auto InitializeRegSplitPartsOnce = [this]() { 342 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) { 343 unsigned Size = getSubRegIdxSize(Idx); 344 if (Size & 31) 345 continue; 346 std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1]; 347 unsigned Pos = getSubRegIdxOffset(Idx); 348 if (Pos % Size) 349 continue; 350 Pos /= Size; 351 if (Vec.empty()) { 352 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits. 
353 Vec.resize(MaxNumParts); 354 } 355 Vec[Pos] = Idx; 356 } 357 }; 358 359 static llvm::once_flag InitializeSubRegFromChannelTableFlag; 360 361 static auto InitializeSubRegFromChannelTableOnce = [this]() { 362 for (auto &Row : SubRegFromChannelTable) 363 Row.fill(AMDGPU::NoSubRegister); 364 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) { 365 unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32; 366 unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32; 367 assert(Width < SubRegFromChannelTableWidthMap.size()); 368 Width = SubRegFromChannelTableWidthMap[Width]; 369 if (Width == 0) 370 continue; 371 unsigned TableIdx = Width - 1; 372 assert(TableIdx < SubRegFromChannelTable.size()); 373 assert(Offset < SubRegFromChannelTable[TableIdx].size()); 374 SubRegFromChannelTable[TableIdx][Offset] = Idx; 375 } 376 }; 377 378 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce); 379 llvm::call_once(InitializeSubRegFromChannelTableFlag, 380 InitializeSubRegFromChannelTableOnce); 381 } 382 383 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, 384 MCRegister Reg) const { 385 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R) 386 Reserved.set(*R); 387 } 388 389 // Forced to be here by one .inc 390 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( 391 const MachineFunction *MF) const { 392 CallingConv::ID CC = MF->getFunction().getCallingConv(); 393 switch (CC) { 394 case CallingConv::C: 395 case CallingConv::Fast: 396 case CallingConv::Cold: 397 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList 398 : CSR_AMDGPU_SaveList; 399 case CallingConv::AMDGPU_Gfx: 400 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList 401 : CSR_AMDGPU_SI_Gfx_SaveList; 402 case CallingConv::AMDGPU_CS_ChainPreserve: 403 return CSR_AMDGPU_CS_ChainPreserve_SaveList; 404 default: { 405 // Dummy to not crash RegisterClassInfo. 406 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; 407 return &NoCalleeSavedReg; 408 } 409 } 410 } 411 412 const MCPhysReg * 413 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { 414 return nullptr; 415 } 416 417 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, 418 CallingConv::ID CC) const { 419 switch (CC) { 420 case CallingConv::C: 421 case CallingConv::Fast: 422 case CallingConv::Cold: 423 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask 424 : CSR_AMDGPU_RegMask; 425 case CallingConv::AMDGPU_Gfx: 426 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask 427 : CSR_AMDGPU_SI_Gfx_RegMask; 428 case CallingConv::AMDGPU_CS_Chain: 429 case CallingConv::AMDGPU_CS_ChainPreserve: 430 // Calls to these functions never return, so we can pretend everything is 431 // preserved. 432 return AMDGPU_AllVGPRs_RegMask; 433 default: 434 return nullptr; 435 } 436 } 437 438 const uint32_t *SIRegisterInfo::getNoPreservedMask() const { 439 return CSR_AMDGPU_NoRegs_RegMask; 440 } 441 442 bool SIRegisterInfo::isChainScratchRegister(Register VGPR) { 443 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8; 444 } 445 446 const TargetRegisterClass * 447 SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, 448 const MachineFunction &MF) const { 449 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the 450 // equivalent AV class. If used one, the verifier will crash after 451 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given 452 // until Instruction selection. 
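  // The mappings below widen a pure VGPR or pure AGPR class to the combined
  // AV class of the same size and alignment variant, e.g. VReg_64 or AReg_64
  // becomes AV_64. On subtargets with MAI instructions that gives later
  // passes the option of keeping such a value in either register file rather
  // than pinning it to one of the two.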
453 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) { 454 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass) 455 return &AMDGPU::AV_32RegClass; 456 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass) 457 return &AMDGPU::AV_64RegClass; 458 if (RC == &AMDGPU::VReg_64_Align2RegClass || 459 RC == &AMDGPU::AReg_64_Align2RegClass) 460 return &AMDGPU::AV_64_Align2RegClass; 461 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass) 462 return &AMDGPU::AV_96RegClass; 463 if (RC == &AMDGPU::VReg_96_Align2RegClass || 464 RC == &AMDGPU::AReg_96_Align2RegClass) 465 return &AMDGPU::AV_96_Align2RegClass; 466 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass) 467 return &AMDGPU::AV_128RegClass; 468 if (RC == &AMDGPU::VReg_128_Align2RegClass || 469 RC == &AMDGPU::AReg_128_Align2RegClass) 470 return &AMDGPU::AV_128_Align2RegClass; 471 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass) 472 return &AMDGPU::AV_160RegClass; 473 if (RC == &AMDGPU::VReg_160_Align2RegClass || 474 RC == &AMDGPU::AReg_160_Align2RegClass) 475 return &AMDGPU::AV_160_Align2RegClass; 476 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass) 477 return &AMDGPU::AV_192RegClass; 478 if (RC == &AMDGPU::VReg_192_Align2RegClass || 479 RC == &AMDGPU::AReg_192_Align2RegClass) 480 return &AMDGPU::AV_192_Align2RegClass; 481 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass) 482 return &AMDGPU::AV_256RegClass; 483 if (RC == &AMDGPU::VReg_256_Align2RegClass || 484 RC == &AMDGPU::AReg_256_Align2RegClass) 485 return &AMDGPU::AV_256_Align2RegClass; 486 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass) 487 return &AMDGPU::AV_512RegClass; 488 if (RC == &AMDGPU::VReg_512_Align2RegClass || 489 RC == &AMDGPU::AReg_512_Align2RegClass) 490 return &AMDGPU::AV_512_Align2RegClass; 491 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass) 492 return &AMDGPU::AV_1024RegClass; 493 if (RC == &AMDGPU::VReg_1024_Align2RegClass || 494 RC == &AMDGPU::AReg_1024_Align2RegClass) 495 return &AMDGPU::AV_1024_Align2RegClass; 496 } 497 498 return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF); 499 } 500 501 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { 502 const SIFrameLowering *TFI = ST.getFrameLowering(); 503 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 504 // During ISel lowering we always reserve the stack pointer in entry and chain 505 // functions, but never actually want to reference it when accessing our own 506 // frame. If we need a frame pointer we use it, but otherwise we can just use 507 // an immediate "0" which we represent by returning NoRegister. 508 if (FuncInfo->isBottomOfStack()) { 509 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register(); 510 } 511 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() 512 : FuncInfo->getStackPtrOffsetReg(); 513 } 514 515 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { 516 // When we need stack realignment, we can't reference off of the 517 // stack pointer, so we reserve a base pointer. 
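  // Fixed-frame objects (e.g. arguments passed on the stack) are addressed
  // relative to the incoming stack pointer; once the SP has been realigned
  // that offset can no longer be recovered from the SP itself, so a separate
  // base pointer (see getBaseRegister() below) is required. Hence the check
  // below: a base pointer is only needed when both fixed objects and
  // realignment are present.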
518 const MachineFrameInfo &MFI = MF.getFrameInfo(); 519 return MFI.getNumFixedObjects() && shouldRealignStack(MF); 520 } 521 522 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } 523 524 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { 525 return AMDGPU_AllVGPRs_RegMask; 526 } 527 528 const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const { 529 return AMDGPU_AllAGPRs_RegMask; 530 } 531 532 const uint32_t *SIRegisterInfo::getAllVectorRegMask() const { 533 return AMDGPU_AllVectorRegs_RegMask; 534 } 535 536 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { 537 return AMDGPU_AllAllocatableSRegs_RegMask; 538 } 539 540 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel, 541 unsigned NumRegs) { 542 assert(NumRegs < SubRegFromChannelTableWidthMap.size()); 543 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs]; 544 assert(NumRegIndex && "Not implemented"); 545 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size()); 546 return SubRegFromChannelTable[NumRegIndex - 1][Channel]; 547 } 548 549 MCRegister 550 SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF, 551 const unsigned Align, 552 const TargetRegisterClass *RC) const { 553 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align; 554 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); 555 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC); 556 } 557 558 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( 559 const MachineFunction &MF) const { 560 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass); 561 } 562 563 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { 564 BitVector Reserved(getNumRegs()); 565 Reserved.set(AMDGPU::MODE); 566 567 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 568 569 // Reserve special purpose registers. 570 // 571 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but 572 // this seems likely to result in bugs, so I'm marking them as reserved. 573 reserveRegisterTuples(Reserved, AMDGPU::EXEC); 574 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); 575 576 // M0 has to be reserved so that llvm accepts it as a live-in into a block. 577 reserveRegisterTuples(Reserved, AMDGPU::M0); 578 579 // Reserve src_vccz, src_execz, src_scc. 580 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ); 581 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ); 582 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC); 583 584 // Reserve the memory aperture registers 585 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); 586 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); 587 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); 588 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); 589 590 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen. 591 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID); 592 593 // Reserve xnack_mask registers - support is not implemented in Codegen. 594 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); 595 596 // Reserve lds_direct register - support is not implemented in Codegen. 597 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); 598 599 // Reserve Trap Handler registers - support is not implemented in Codegen. 
600 reserveRegisterTuples(Reserved, AMDGPU::TBA); 601 reserveRegisterTuples(Reserved, AMDGPU::TMA); 602 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); 603 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); 604 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); 605 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); 606 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); 607 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); 608 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); 609 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); 610 611 // Reserve null register - it shall never be allocated 612 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64); 613 614 // Disallow vcc_hi allocation in wave32. It may be allocated but most likely 615 // will result in bugs. 616 if (isWave32) { 617 Reserved.set(AMDGPU::VCC); 618 Reserved.set(AMDGPU::VCC_HI); 619 } 620 621 // Reserve SGPRs. 622 // 623 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 624 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 625 for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { 626 unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); 627 reserveRegisterTuples(Reserved, Reg); 628 } 629 630 Register ScratchRSrcReg = MFI->getScratchRSrcReg(); 631 if (ScratchRSrcReg != AMDGPU::NoRegister) { 632 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we 633 // need to spill. 634 // TODO: May need to reserve a VGPR if doing LDS spilling. 635 reserveRegisterTuples(Reserved, ScratchRSrcReg); 636 } 637 638 Register LongBranchReservedReg = MFI->getLongBranchReservedReg(); 639 if (LongBranchReservedReg) 640 reserveRegisterTuples(Reserved, LongBranchReservedReg); 641 642 // We have to assume the SP is needed in case there are calls in the function, 643 // which is detected after the function is lowered. If we aren't really going 644 // to need SP, don't bother reserving it. 645 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg(); 646 if (StackPtrReg) { 647 reserveRegisterTuples(Reserved, StackPtrReg); 648 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); 649 } 650 651 MCRegister FrameReg = MFI->getFrameOffsetReg(); 652 if (FrameReg) { 653 reserveRegisterTuples(Reserved, FrameReg); 654 assert(!isSubRegister(ScratchRSrcReg, FrameReg)); 655 } 656 657 if (hasBasePointer(MF)) { 658 MCRegister BasePtrReg = getBaseRegister(); 659 reserveRegisterTuples(Reserved, BasePtrReg); 660 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); 661 } 662 663 // FIXME: Use same reserved register introduced in D149775 664 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions. 665 Register ExecCopyReg = MFI->getSGPRForEXECCopy(); 666 if (ExecCopyReg) 667 reserveRegisterTuples(Reserved, ExecCopyReg); 668 669 // Reserve VGPRs/AGPRs. 670 // 671 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); 672 unsigned MaxNumAGPRs = MaxNumVGPRs; 673 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 674 675 // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, 676 // a wave may have up to 512 total vector registers combining together both 677 // VGPRs and AGPRs. Hence, in an entry function without calls and without 678 // AGPRs used within it, it is possible to use the whole vector register 679 // budget for VGPRs. 680 // 681 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split 682 // register file accordingly. 
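  // For illustration, assuming getMaxNumVGPRs() returned a combined budget of
  // 384 registers: if the function uses AGPRs, the split below yields 192
  // allocatable VGPRs and 192 allocatable AGPRs; if it does not, the first
  // 256 registers stay VGPRs and only the remaining 128 are exposed as AGPRs.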
  if (ST.hasGFX90AInsts()) {
    if (MFI->usesAGPRs(MF)) {
      MaxNumVGPRs /= 2;
      MaxNumAGPRs = MaxNumVGPRs;
    } else {
      if (MaxNumVGPRs > TotalNumVGPRs) {
        MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
        MaxNumVGPRs = TotalNumVGPRs;
      } else
        MaxNumAGPRs = 0;
    }
  }

  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  if (ST.hasMAIInsts()) {
    for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) {
      unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
      reserveRegisterTuples(Reserved, Reg);
    }
  } else {
    // Reserve all the AGPRs if there are no instructions to use them.
    for (MCRegister Reg : AMDGPU::AGPR_32RegClass)
      reserveRegisterTuples(Reserved, Reg);
  }

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
  }

  for (Register Reg : MFI->getWWMReservedRegs())
    reserveRegisterTuples(Reserved, Reg);

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  return Reserved;
}

bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
                                      MCRegister PhysReg) const {
  return !MF.getRegInfo().isReserved(PhysReg);
}

bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry or in chain functions, the base address is 0, so it can't possibly
  // need any more alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isBottomOfStack())
    return false;

  return TargetRegisterInfo::shouldRealignStack(MF);
}

bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}

bool SIRegisterInfo::requiresFrameIndexScavenging(
    const MachineFunction &MF) const {
  // Do not use frame virtual registers. They used to be used for SGPRs, but
  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
  // spill.
  return false;
}

bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
    const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasStackObjects();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
    const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
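  // Returning true enables the virtual base register mechanism: whenever
  // needsFrameBaseReg() below reports that an immediate scratch offset cannot
  // be encoded directly, a base register is materialized with
  // materializeFrameBaseRegister() and the access is rewritten through
  // resolveFrameIndex().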
778 return true; 779 } 780 781 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const { 782 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI)); 783 784 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 785 AMDGPU::OpName::offset); 786 return MI->getOperand(OffIdx).getImm(); 787 } 788 789 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, 790 int Idx) const { 791 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 792 return 0; 793 794 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 795 AMDGPU::OpName::vaddr) || 796 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 797 AMDGPU::OpName::saddr))) && 798 "Should never see frame index on non-address operand"); 799 800 return getScratchInstrOffset(MI); 801 } 802 803 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { 804 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 805 return false; 806 807 int64_t FullOffset = Offset + getScratchInstrOffset(MI); 808 809 const SIInstrInfo *TII = ST.getInstrInfo(); 810 if (SIInstrInfo::isMUBUF(*MI)) 811 return !TII->isLegalMUBUFImmOffset(FullOffset); 812 813 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, 814 SIInstrFlags::FlatScratch); 815 } 816 817 Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, 818 int FrameIdx, 819 int64_t Offset) const { 820 MachineBasicBlock::iterator Ins = MBB->begin(); 821 DebugLoc DL; // Defaults to "unknown" 822 823 if (Ins != MBB->end()) 824 DL = Ins->getDebugLoc(); 825 826 MachineFunction *MF = MBB->getParent(); 827 const SIInstrInfo *TII = ST.getInstrInfo(); 828 MachineRegisterInfo &MRI = MF->getRegInfo(); 829 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32 830 : AMDGPU::V_MOV_B32_e32; 831 832 Register BaseReg = MRI.createVirtualRegister( 833 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass 834 : &AMDGPU::VGPR_32RegClass); 835 836 if (Offset == 0) { 837 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg) 838 .addFrameIndex(FrameIdx); 839 return BaseReg; 840 } 841 842 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 843 844 Register FIReg = MRI.createVirtualRegister( 845 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass 846 : &AMDGPU::VGPR_32RegClass); 847 848 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) 849 .addImm(Offset); 850 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg) 851 .addFrameIndex(FrameIdx); 852 853 if (ST.enableFlatScratch() ) { 854 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg) 855 .addReg(OffsetReg, RegState::Kill) 856 .addReg(FIReg); 857 return BaseReg; 858 } 859 860 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) 861 .addReg(OffsetReg, RegState::Kill) 862 .addReg(FIReg) 863 .addImm(0); // clamp bit 864 865 return BaseReg; 866 } 867 868 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, 869 int64_t Offset) const { 870 const SIInstrInfo *TII = ST.getInstrInfo(); 871 bool IsFlat = TII->isFLATScratch(MI); 872 873 #ifndef NDEBUG 874 // FIXME: Is it possible to be storing a frame index to itself? 875 bool SeenFI = false; 876 for (const MachineOperand &MO: MI.operands()) { 877 if (MO.isFI()) { 878 if (SeenFI) 879 llvm_unreachable("should not see multiple frame indices"); 880 881 SeenFI = true; 882 } 883 } 884 #endif 885 886 MachineOperand *FIOp = 887 TII->getNamedOperand(MI, IsFlat ? 
AMDGPU::OpName::saddr 888 : AMDGPU::OpName::vaddr); 889 890 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); 891 int64_t NewOffset = OffsetOp->getImm() + Offset; 892 893 assert(FIOp && FIOp->isFI() && "frame index must be address operand"); 894 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI)); 895 896 if (IsFlat) { 897 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 898 SIInstrFlags::FlatScratch) && 899 "offset should be legal"); 900 FIOp->ChangeToRegister(BaseReg, false); 901 OffsetOp->setImm(NewOffset); 902 return; 903 } 904 905 #ifndef NDEBUG 906 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset); 907 assert(SOffset->isImm() && SOffset->getImm() == 0); 908 #endif 909 910 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal"); 911 912 FIOp->ChangeToRegister(BaseReg, false); 913 OffsetOp->setImm(NewOffset); 914 } 915 916 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, 917 Register BaseReg, 918 int64_t Offset) const { 919 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 920 return false; 921 922 int64_t NewOffset = Offset + getScratchInstrOffset(MI); 923 924 const SIInstrInfo *TII = ST.getInstrInfo(); 925 if (SIInstrInfo::isMUBUF(*MI)) 926 return TII->isLegalMUBUFImmOffset(NewOffset); 927 928 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 929 SIInstrFlags::FlatScratch); 930 } 931 932 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( 933 const MachineFunction &MF, unsigned Kind) const { 934 // This is inaccurate. It depends on the instruction and address space. The 935 // only place where we should hit this is for dealing with frame indexes / 936 // private accesses, so this is correct in that case. 
937 return &AMDGPU::VGPR_32RegClass; 938 } 939 940 const TargetRegisterClass * 941 SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { 942 if (isAGPRClass(RC) && !ST.hasGFX90AInsts()) 943 return getEquivalentVGPRClass(RC); 944 if (RC == &AMDGPU::SCC_CLASSRegClass) 945 return getWaveMaskRegClass(); 946 947 return RC; 948 } 949 950 static unsigned getNumSubRegsForSpillOp(unsigned Op) { 951 952 switch (Op) { 953 case AMDGPU::SI_SPILL_S1024_SAVE: 954 case AMDGPU::SI_SPILL_S1024_RESTORE: 955 case AMDGPU::SI_SPILL_V1024_SAVE: 956 case AMDGPU::SI_SPILL_V1024_RESTORE: 957 case AMDGPU::SI_SPILL_A1024_SAVE: 958 case AMDGPU::SI_SPILL_A1024_RESTORE: 959 case AMDGPU::SI_SPILL_AV1024_SAVE: 960 case AMDGPU::SI_SPILL_AV1024_RESTORE: 961 return 32; 962 case AMDGPU::SI_SPILL_S512_SAVE: 963 case AMDGPU::SI_SPILL_S512_RESTORE: 964 case AMDGPU::SI_SPILL_V512_SAVE: 965 case AMDGPU::SI_SPILL_V512_RESTORE: 966 case AMDGPU::SI_SPILL_A512_SAVE: 967 case AMDGPU::SI_SPILL_A512_RESTORE: 968 case AMDGPU::SI_SPILL_AV512_SAVE: 969 case AMDGPU::SI_SPILL_AV512_RESTORE: 970 return 16; 971 case AMDGPU::SI_SPILL_S384_SAVE: 972 case AMDGPU::SI_SPILL_S384_RESTORE: 973 case AMDGPU::SI_SPILL_V384_SAVE: 974 case AMDGPU::SI_SPILL_V384_RESTORE: 975 case AMDGPU::SI_SPILL_A384_SAVE: 976 case AMDGPU::SI_SPILL_A384_RESTORE: 977 case AMDGPU::SI_SPILL_AV384_SAVE: 978 case AMDGPU::SI_SPILL_AV384_RESTORE: 979 return 12; 980 case AMDGPU::SI_SPILL_S352_SAVE: 981 case AMDGPU::SI_SPILL_S352_RESTORE: 982 case AMDGPU::SI_SPILL_V352_SAVE: 983 case AMDGPU::SI_SPILL_V352_RESTORE: 984 case AMDGPU::SI_SPILL_A352_SAVE: 985 case AMDGPU::SI_SPILL_A352_RESTORE: 986 case AMDGPU::SI_SPILL_AV352_SAVE: 987 case AMDGPU::SI_SPILL_AV352_RESTORE: 988 return 11; 989 case AMDGPU::SI_SPILL_S320_SAVE: 990 case AMDGPU::SI_SPILL_S320_RESTORE: 991 case AMDGPU::SI_SPILL_V320_SAVE: 992 case AMDGPU::SI_SPILL_V320_RESTORE: 993 case AMDGPU::SI_SPILL_A320_SAVE: 994 case AMDGPU::SI_SPILL_A320_RESTORE: 995 case AMDGPU::SI_SPILL_AV320_SAVE: 996 case AMDGPU::SI_SPILL_AV320_RESTORE: 997 return 10; 998 case AMDGPU::SI_SPILL_S288_SAVE: 999 case AMDGPU::SI_SPILL_S288_RESTORE: 1000 case AMDGPU::SI_SPILL_V288_SAVE: 1001 case AMDGPU::SI_SPILL_V288_RESTORE: 1002 case AMDGPU::SI_SPILL_A288_SAVE: 1003 case AMDGPU::SI_SPILL_A288_RESTORE: 1004 case AMDGPU::SI_SPILL_AV288_SAVE: 1005 case AMDGPU::SI_SPILL_AV288_RESTORE: 1006 return 9; 1007 case AMDGPU::SI_SPILL_S256_SAVE: 1008 case AMDGPU::SI_SPILL_S256_RESTORE: 1009 case AMDGPU::SI_SPILL_V256_SAVE: 1010 case AMDGPU::SI_SPILL_V256_RESTORE: 1011 case AMDGPU::SI_SPILL_A256_SAVE: 1012 case AMDGPU::SI_SPILL_A256_RESTORE: 1013 case AMDGPU::SI_SPILL_AV256_SAVE: 1014 case AMDGPU::SI_SPILL_AV256_RESTORE: 1015 return 8; 1016 case AMDGPU::SI_SPILL_S224_SAVE: 1017 case AMDGPU::SI_SPILL_S224_RESTORE: 1018 case AMDGPU::SI_SPILL_V224_SAVE: 1019 case AMDGPU::SI_SPILL_V224_RESTORE: 1020 case AMDGPU::SI_SPILL_A224_SAVE: 1021 case AMDGPU::SI_SPILL_A224_RESTORE: 1022 case AMDGPU::SI_SPILL_AV224_SAVE: 1023 case AMDGPU::SI_SPILL_AV224_RESTORE: 1024 return 7; 1025 case AMDGPU::SI_SPILL_S192_SAVE: 1026 case AMDGPU::SI_SPILL_S192_RESTORE: 1027 case AMDGPU::SI_SPILL_V192_SAVE: 1028 case AMDGPU::SI_SPILL_V192_RESTORE: 1029 case AMDGPU::SI_SPILL_A192_SAVE: 1030 case AMDGPU::SI_SPILL_A192_RESTORE: 1031 case AMDGPU::SI_SPILL_AV192_SAVE: 1032 case AMDGPU::SI_SPILL_AV192_RESTORE: 1033 return 6; 1034 case AMDGPU::SI_SPILL_S160_SAVE: 1035 case AMDGPU::SI_SPILL_S160_RESTORE: 1036 case AMDGPU::SI_SPILL_V160_SAVE: 1037 case AMDGPU::SI_SPILL_V160_RESTORE: 
1038 case AMDGPU::SI_SPILL_A160_SAVE: 1039 case AMDGPU::SI_SPILL_A160_RESTORE: 1040 case AMDGPU::SI_SPILL_AV160_SAVE: 1041 case AMDGPU::SI_SPILL_AV160_RESTORE: 1042 return 5; 1043 case AMDGPU::SI_SPILL_S128_SAVE: 1044 case AMDGPU::SI_SPILL_S128_RESTORE: 1045 case AMDGPU::SI_SPILL_V128_SAVE: 1046 case AMDGPU::SI_SPILL_V128_RESTORE: 1047 case AMDGPU::SI_SPILL_A128_SAVE: 1048 case AMDGPU::SI_SPILL_A128_RESTORE: 1049 case AMDGPU::SI_SPILL_AV128_SAVE: 1050 case AMDGPU::SI_SPILL_AV128_RESTORE: 1051 return 4; 1052 case AMDGPU::SI_SPILL_S96_SAVE: 1053 case AMDGPU::SI_SPILL_S96_RESTORE: 1054 case AMDGPU::SI_SPILL_V96_SAVE: 1055 case AMDGPU::SI_SPILL_V96_RESTORE: 1056 case AMDGPU::SI_SPILL_A96_SAVE: 1057 case AMDGPU::SI_SPILL_A96_RESTORE: 1058 case AMDGPU::SI_SPILL_AV96_SAVE: 1059 case AMDGPU::SI_SPILL_AV96_RESTORE: 1060 return 3; 1061 case AMDGPU::SI_SPILL_S64_SAVE: 1062 case AMDGPU::SI_SPILL_S64_RESTORE: 1063 case AMDGPU::SI_SPILL_V64_SAVE: 1064 case AMDGPU::SI_SPILL_V64_RESTORE: 1065 case AMDGPU::SI_SPILL_A64_SAVE: 1066 case AMDGPU::SI_SPILL_A64_RESTORE: 1067 case AMDGPU::SI_SPILL_AV64_SAVE: 1068 case AMDGPU::SI_SPILL_AV64_RESTORE: 1069 return 2; 1070 case AMDGPU::SI_SPILL_S32_SAVE: 1071 case AMDGPU::SI_SPILL_S32_RESTORE: 1072 case AMDGPU::SI_SPILL_V32_SAVE: 1073 case AMDGPU::SI_SPILL_V32_RESTORE: 1074 case AMDGPU::SI_SPILL_A32_SAVE: 1075 case AMDGPU::SI_SPILL_A32_RESTORE: 1076 case AMDGPU::SI_SPILL_AV32_SAVE: 1077 case AMDGPU::SI_SPILL_AV32_RESTORE: 1078 case AMDGPU::SI_SPILL_WWM_V32_SAVE: 1079 case AMDGPU::SI_SPILL_WWM_V32_RESTORE: 1080 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: 1081 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: 1082 return 1; 1083 default: llvm_unreachable("Invalid spill opcode"); 1084 } 1085 } 1086 1087 static int getOffsetMUBUFStore(unsigned Opc) { 1088 switch (Opc) { 1089 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 1090 return AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1091 case AMDGPU::BUFFER_STORE_BYTE_OFFEN: 1092 return AMDGPU::BUFFER_STORE_BYTE_OFFSET; 1093 case AMDGPU::BUFFER_STORE_SHORT_OFFEN: 1094 return AMDGPU::BUFFER_STORE_SHORT_OFFSET; 1095 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: 1096 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; 1097 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN: 1098 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET; 1099 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: 1100 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; 1101 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: 1102 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET; 1103 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN: 1104 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET; 1105 default: 1106 return -1; 1107 } 1108 } 1109 1110 static int getOffsetMUBUFLoad(unsigned Opc) { 1111 switch (Opc) { 1112 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 1113 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1114 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN: 1115 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET; 1116 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN: 1117 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET; 1118 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: 1119 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET; 1120 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN: 1121 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; 1122 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: 1123 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; 1124 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN: 1125 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET; 1126 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: 1127 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; 1128 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: 1129 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET; 1130 case 
AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN: 1131 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET; 1132 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN: 1133 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET; 1134 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: 1135 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET; 1136 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN: 1137 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET; 1138 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN: 1139 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET; 1140 default: 1141 return -1; 1142 } 1143 } 1144 1145 static int getOffenMUBUFStore(unsigned Opc) { 1146 switch (Opc) { 1147 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 1148 return AMDGPU::BUFFER_STORE_DWORD_OFFEN; 1149 case AMDGPU::BUFFER_STORE_BYTE_OFFSET: 1150 return AMDGPU::BUFFER_STORE_BYTE_OFFEN; 1151 case AMDGPU::BUFFER_STORE_SHORT_OFFSET: 1152 return AMDGPU::BUFFER_STORE_SHORT_OFFEN; 1153 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: 1154 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; 1155 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET: 1156 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN; 1157 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET: 1158 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; 1159 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET: 1160 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN; 1161 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET: 1162 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN; 1163 default: 1164 return -1; 1165 } 1166 } 1167 1168 static int getOffenMUBUFLoad(unsigned Opc) { 1169 switch (Opc) { 1170 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 1171 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN; 1172 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET: 1173 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN; 1174 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET: 1175 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN; 1176 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET: 1177 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN; 1178 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET: 1179 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN; 1180 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET: 1181 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; 1182 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET: 1183 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN; 1184 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET: 1185 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN; 1186 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET: 1187 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN; 1188 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET: 1189 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN; 1190 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET: 1191 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN; 1192 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET: 1193 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN; 1194 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET: 1195 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN; 1196 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET: 1197 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN; 1198 default: 1199 return -1; 1200 } 1201 } 1202 1203 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, 1204 MachineBasicBlock &MBB, 1205 MachineBasicBlock::iterator MI, 1206 int Index, unsigned Lane, 1207 unsigned ValueReg, bool IsKill) { 1208 MachineFunction *MF = MBB.getParent(); 1209 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1210 const SIInstrInfo *TII = ST.getInstrInfo(); 1211 1212 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); 1213 1214 if (Reg == AMDGPU::NoRegister) 1215 return MachineInstrBuilder(); 1216 1217 bool IsStore = MI->mayStore(); 1218 MachineRegisterInfo &MRI = MF->getRegInfo(); 1219 auto *TRI = static_cast<const 
SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 1220 1221 unsigned Dst = IsStore ? Reg : ValueReg; 1222 unsigned Src = IsStore ? ValueReg : Reg; 1223 bool IsVGPR = TRI->isVGPR(MRI, Reg); 1224 DebugLoc DL = MI->getDebugLoc(); 1225 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) { 1226 // Spiller during regalloc may restore a spilled register to its superclass. 1227 // It could result in AGPR spills restored to VGPRs or the other way around, 1228 // making the src and dst with identical regclasses at this point. It just 1229 // needs a copy in such cases. 1230 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst) 1231 .addReg(Src, getKillRegState(IsKill)); 1232 CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1233 return CopyMIB; 1234 } 1235 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 1236 : AMDGPU::V_ACCVGPR_READ_B32_e64; 1237 1238 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst) 1239 .addReg(Src, getKillRegState(IsKill)); 1240 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1241 return MIB; 1242 } 1243 1244 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not 1245 // need to handle the case where an SGPR may need to be spilled while spilling. 1246 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, 1247 MachineFrameInfo &MFI, 1248 MachineBasicBlock::iterator MI, 1249 int Index, 1250 int64_t Offset) { 1251 const SIInstrInfo *TII = ST.getInstrInfo(); 1252 MachineBasicBlock *MBB = MI->getParent(); 1253 const DebugLoc &DL = MI->getDebugLoc(); 1254 bool IsStore = MI->mayStore(); 1255 1256 unsigned Opc = MI->getOpcode(); 1257 int LoadStoreOp = IsStore ? 1258 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc); 1259 if (LoadStoreOp == -1) 1260 return false; 1261 1262 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); 1263 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr()) 1264 return true; 1265 1266 MachineInstrBuilder NewMI = 1267 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) 1268 .add(*Reg) 1269 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) 1270 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) 1271 .addImm(Offset) 1272 .addImm(0) // cpol 1273 .addImm(0) // swz 1274 .cloneMemRefs(*MI); 1275 1276 const MachineOperand *VDataIn = TII->getNamedOperand(*MI, 1277 AMDGPU::OpName::vdata_in); 1278 if (VDataIn) 1279 NewMI.add(*VDataIn); 1280 return true; 1281 } 1282 1283 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, 1284 unsigned LoadStoreOp, 1285 unsigned EltSize) { 1286 bool IsStore = TII->get(LoadStoreOp).mayStore(); 1287 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr); 1288 bool UseST = 1289 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr); 1290 1291 switch (EltSize) { 1292 case 4: 1293 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1294 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR; 1295 break; 1296 case 8: 1297 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR 1298 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR; 1299 break; 1300 case 12: 1301 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR 1302 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR; 1303 break; 1304 case 16: 1305 LoadStoreOp = IsStore ? 
AMDGPU::SCRATCH_STORE_DWORDX4_SADDR 1306 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR; 1307 break; 1308 default: 1309 llvm_unreachable("Unexpected spill load/store size!"); 1310 } 1311 1312 if (HasVAddr) 1313 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp); 1314 else if (UseST) 1315 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 1316 1317 return LoadStoreOp; 1318 } 1319 1320 void SIRegisterInfo::buildSpillLoadStore( 1321 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, 1322 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill, 1323 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO, 1324 RegScavenger *RS, LiveRegUnits *LiveUnits) const { 1325 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both"); 1326 1327 MachineFunction *MF = MBB.getParent(); 1328 const SIInstrInfo *TII = ST.getInstrInfo(); 1329 const MachineFrameInfo &MFI = MF->getFrameInfo(); 1330 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); 1331 1332 const MCInstrDesc *Desc = &TII->get(LoadStoreOp); 1333 bool IsStore = Desc->mayStore(); 1334 bool IsFlat = TII->isFLATScratch(LoadStoreOp); 1335 1336 bool CanClobberSCC = false; 1337 bool Scavenged = false; 1338 MCRegister SOffset = ScratchOffsetReg; 1339 1340 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); 1341 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores. 1342 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC); 1343 const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8; 1344 1345 // Always use 4 byte operations for AGPRs because we need to scavenge 1346 // a temporary VGPR. 1347 unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u; 1348 unsigned NumSubRegs = RegWidth / EltSize; 1349 unsigned Size = NumSubRegs * EltSize; 1350 unsigned RemSize = RegWidth - Size; 1351 unsigned NumRemSubRegs = RemSize ? 1 : 0; 1352 int64_t Offset = InstOffset + MFI.getObjectOffset(Index); 1353 int64_t MaterializedOffset = Offset; 1354 1355 int64_t MaxOffset = Offset + Size + RemSize - EltSize; 1356 int64_t ScratchOffsetRegDelta = 0; 1357 1358 if (IsFlat && EltSize > 4) { 1359 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); 1360 Desc = &TII->get(LoadStoreOp); 1361 } 1362 1363 Align Alignment = MFI.getObjectAlign(Index); 1364 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); 1365 1366 assert((IsFlat || ((Offset % EltSize) == 0)) && 1367 "unexpected VGPR spill offset"); 1368 1369 // Track a VGPR to use for a constant offset we need to materialize. 1370 Register TmpOffsetVGPR; 1371 1372 // Track a VGPR to use as an intermediate value. 1373 Register TmpIntermediateVGPR; 1374 bool UseVGPROffset = false; 1375 1376 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate 1377 // combination. 1378 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR, 1379 int64_t VOffset) { 1380 // We are using a VGPR offset 1381 if (IsFlat && SGPRBase) { 1382 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free 1383 // SGPR, so perform the add as vector. 1384 // We don't need a base SGPR in the kernel. 
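      // If the constant bus limit lets V_ADD_U32_e64 read both an SGPR and a
      // literal, a single add materializes the address below; otherwise the
      // SGPR base is copied into the VGPR first and the literal is added with
      // the VOP2 form.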
      if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
            .addReg(SGPRBase)
            .addImm(VOffset)
            .addImm(0); // clamp
      } else {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
            .addReg(SGPRBase);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
            .addImm(VOffset)
            .addReg(TmpOffsetVGPR);
      }
    } else {
      assert(TmpOffsetVGPR);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addImm(VOffset);
    }
  };

  bool IsOffsetLegal =
      IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                      SIInstrFlags::FlatScratch)
             : TII->isLegalMUBUFImmOffset(MaxOffset);
  if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
    SOffset = MCRegister();

    // We don't have access to the register scavenger if this function is called
    // during PEI::scavengeFrameVirtualRegs(), so use LiveUnits in this case.
    // TODO: Clobbering SCC is not necessary for scratch instructions in the
    // entry.
    if (RS) {
      SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI,
                                              false, 0, false);

      // Piggy back on the liveness scan we just did to see if SCC is dead.
      CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
    } else if (LiveUnits) {
      CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
      for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
        if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
          SOffset = Reg;
          break;
        }
      }
    }

    if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
      SOffset = Register();

    if (!SOffset) {
      UseVGPROffset = true;

      if (RS) {
        TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                      MI, false, 0);
      } else {
        assert(LiveUnits);
        for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
          if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
            TmpOffsetVGPR = Reg;
            break;
          }
        }
      }

      assert(TmpOffsetVGPR);
    } else if (!SOffset && CanClobberSCC) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset or StackPtrOffset
      // register, and then subtract the offset after the spill to return the
      // register to its original value.

      // TODO: If we don't have to do an emergency stack slot spill, converting
      // to use the VGPR offset is fewer instructions.
      if (!ScratchOffsetReg)
        ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
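    // For example, in the MUBUF path below a slot at per-lane byte offset 16
    // in a wave64 function becomes a wave-level offset of 16 * 64 = 1024,
    // since scratch accessed through the buffer descriptor is interleaved per
    // lane and the SGPR offset counts bytes of the whole wave's allocation.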
1472 if (!IsFlat && !UseVGPROffset) 1473 Offset *= ST.getWavefrontSize(); 1474 1475 if (!UseVGPROffset && !SOffset) 1476 report_fatal_error("could not scavenge SGPR to spill in entry function"); 1477 1478 if (UseVGPROffset) { 1479 // We are using a VGPR offset 1480 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset); 1481 } else if (ScratchOffsetReg == AMDGPU::NoRegister) { 1482 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset); 1483 } else { 1484 assert(Offset != 0); 1485 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1486 .addReg(ScratchOffsetReg) 1487 .addImm(Offset); 1488 Add->getOperand(3).setIsDead(); // Mark SCC as dead. 1489 } 1490 1491 Offset = 0; 1492 } 1493 1494 if (IsFlat && SOffset == AMDGPU::NoRegister) { 1495 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 1496 && "Unexpected vaddr for flat scratch with a FI operand"); 1497 1498 if (UseVGPROffset) { 1499 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp); 1500 } else { 1501 assert(ST.hasFlatScratchSTMode()); 1502 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 1503 } 1504 1505 Desc = &TII->get(LoadStoreOp); 1506 } 1507 1508 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e; 1509 ++i, RegOffset += EltSize) { 1510 if (i == NumSubRegs) { 1511 EltSize = RemSize; 1512 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); 1513 } 1514 Desc = &TII->get(LoadStoreOp); 1515 1516 if (!IsFlat && UseVGPROffset) { 1517 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp) 1518 : getOffenMUBUFLoad(LoadStoreOp); 1519 Desc = &TII->get(NewLoadStoreOp); 1520 } 1521 1522 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) { 1523 // If we are spilling an AGPR beyond the range of the memory instruction 1524 // offset and need to use a VGPR offset, we ideally have at least 2 1525 // scratch VGPRs. If we don't have a second free VGPR without spilling, 1526 // recycle the VGPR used for the offset which requires resetting after 1527 // each subregister. 1528 1529 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset); 1530 } 1531 1532 unsigned NumRegs = EltSize / 4; 1533 Register SubReg = e == 1 1534 ? ValueReg 1535 : Register(getSubReg(ValueReg, 1536 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1537 1538 unsigned SOffsetRegState = 0; 1539 unsigned SrcDstRegState = getDefRegState(!IsStore); 1540 const bool IsLastSubReg = i + 1 == e; 1541 const bool IsFirstSubReg = i == 0; 1542 if (IsLastSubReg) { 1543 SOffsetRegState |= getKillRegState(Scavenged); 1544 // The last implicit use carries the "Kill" flag. 1545 SrcDstRegState |= getKillRegState(IsKill); 1546 } 1547 1548 // Make sure the whole register is defined if there are undef components by 1549 // adding an implicit def of the super-reg on the first instruction. 1550 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg; 1551 bool NeedSuperRegImpOperand = e > 1; 1552 1553 // Remaining element size to spill into memory after some parts of it 1554 // spilled into either AGPRs or VGPRs. 1555 unsigned RemEltSize = EltSize; 1556 1557 // AGPRs to spill VGPRs and vice versa are allocated in a reverse order, 1558 // starting from the last lane. In case if a register cannot be completely 1559 // spilled into another register that will ensure its alignment does not 1560 // change. 
For targets with VGPR alignment requirement this is important 1561 // in case of flat scratch usage as we might get a scratch_load or 1562 // scratch_store of an unaligned register otherwise. 1563 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS, 1564 LaneE = RegOffset / 4; 1565 Lane >= LaneE; --Lane) { 1566 bool IsSubReg = e > 1 || EltSize > 4; 1567 Register Sub = IsSubReg 1568 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane))) 1569 : ValueReg; 1570 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill); 1571 if (!MIB.getInstr()) 1572 break; 1573 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) { 1574 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1575 NeedSuperRegDef = false; 1576 } 1577 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) { 1578 NeedSuperRegImpOperand = true; 1579 unsigned State = SrcDstRegState; 1580 if (!IsLastSubReg || (Lane != LaneE)) 1581 State &= ~RegState::Kill; 1582 if (!IsFirstSubReg || (Lane != LaneS)) 1583 State &= ~RegState::Define; 1584 MIB.addReg(ValueReg, RegState::Implicit | State); 1585 } 1586 RemEltSize -= 4; 1587 } 1588 1589 if (!RemEltSize) // Fully spilled into AGPRs. 1590 continue; 1591 1592 if (RemEltSize != EltSize) { // Partially spilled to AGPRs 1593 assert(IsFlat && EltSize > 4); 1594 1595 unsigned NumRegs = RemEltSize / 4; 1596 SubReg = Register(getSubReg(ValueReg, 1597 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1598 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize); 1599 Desc = &TII->get(Opc); 1600 } 1601 1602 unsigned FinalReg = SubReg; 1603 1604 if (IsAGPR) { 1605 assert(EltSize == 4); 1606 1607 if (!TmpIntermediateVGPR) { 1608 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy(); 1609 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR)); 1610 } 1611 if (IsStore) { 1612 auto AccRead = BuildMI(MBB, MI, DL, 1613 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), 1614 TmpIntermediateVGPR) 1615 .addReg(SubReg, getKillRegState(IsKill)); 1616 if (NeedSuperRegDef) 1617 AccRead.addReg(ValueReg, RegState::ImplicitDefine); 1618 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1619 } 1620 SubReg = TmpIntermediateVGPR; 1621 } else if (UseVGPROffset) { 1622 // FIXME: change to scavengeRegisterBackwards() 1623 if (!TmpOffsetVGPR) { 1624 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, 1625 MI, false, 0); 1626 RS->setRegUsed(TmpOffsetVGPR); 1627 } 1628 } 1629 1630 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); 1631 MachineMemOperand *NewMMO = 1632 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, 1633 commonAlignment(Alignment, RegOffset)); 1634 1635 auto MIB = 1636 BuildMI(MBB, MI, DL, *Desc) 1637 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)); 1638 1639 if (UseVGPROffset) { 1640 // For an AGPR spill, we reuse the same temp VGPR for the offset and the 1641 // intermediate accvgpr_write. 1642 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR)); 1643 } 1644 1645 if (!IsFlat) 1646 MIB.addReg(FuncInfo->getScratchRSrcReg()); 1647 1648 if (SOffset == AMDGPU::NoRegister) { 1649 if (!IsFlat) { 1650 if (UseVGPROffset && ScratchOffsetReg) { 1651 MIB.addReg(ScratchOffsetReg); 1652 } else { 1653 assert(FuncInfo->isBottomOfStack()); 1654 MIB.addImm(0); 1655 } 1656 } 1657 } else { 1658 MIB.addReg(SOffset, SOffsetRegState); 1659 } 1660 1661 MIB.addImm(Offset + RegOffset); 1662 1663 bool LastUse = MMO->getFlags() & MOLastUse; 1664 MIB.addImm(LastUse ? 
AMDGPU::CPol::TH_LU : 0); // cpol 1665 1666 if (!IsFlat) 1667 MIB.addImm(0); // swz 1668 MIB.addMemOperand(NewMMO); 1669 1670 if (!IsAGPR && NeedSuperRegDef) 1671 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1672 1673 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) { 1674 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), 1675 FinalReg) 1676 .addReg(TmpIntermediateVGPR, RegState::Kill); 1677 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1678 } 1679 1680 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg)) 1681 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 1682 1683 // The epilog restore of a wwm-scratch register can cause undesired 1684 // optimization during machine-cp post PrologEpilogInserter if the same 1685 // register was assigned for return value ABI lowering with a COPY 1686 // instruction. As given below, with the epilog reload, the earlier COPY 1687 // appeared to be dead during machine-cp. 1688 // ... 1689 // v0 in WWM operation, needs the WWM spill at prolog/epilog. 1690 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0 1691 // ... 1692 // Epilog block: 1693 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0 1694 // ... 1695 // WWM spill restore to preserve the inactive lanes of v0. 1696 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1 1697 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0 1698 // $exec = S_MOV_B64 killed $sgpr4_sgpr5 1699 // ... 1700 // SI_RETURN implicit $vgpr0 1701 // ... 1702 // To fix it, mark the same reg as a tied op for such restore instructions 1703 // so that it marks a usage for the preceding COPY. 1704 if (!IsStore && MI != MBB.end() && MI->isReturn() && 1705 MI->readsRegister(SubReg, this)) { 1706 MIB.addReg(SubReg, RegState::Implicit); 1707 MIB->tieOperands(0, MIB->getNumOperands() - 1); 1708 } 1709 } 1710 1711 if (ScratchOffsetRegDelta != 0) { 1712 // Subtract the offset we added to the ScratchOffset register. 1713 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1714 .addReg(SOffset) 1715 .addImm(-ScratchOffsetRegDelta); 1716 } 1717 } 1718 1719 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, 1720 int Offset, bool IsLoad, 1721 bool IsKill) const { 1722 // Load/store VGPR 1723 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo(); 1724 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); 1725 1726 Register FrameReg = 1727 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF) 1728 ? getBaseRegister() 1729 : getFrameRegister(SB.MF); 1730 1731 Align Alignment = FrameInfo.getObjectAlign(Index); 1732 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index); 1733 MachineMemOperand *MMO = SB.MF.getMachineMemOperand( 1734 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, 1735 SB.EltSize, Alignment); 1736 1737 if (IsLoad) { 1738 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1739 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1740 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false, 1741 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1742 } else { 1743 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR 1744 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1745 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill, 1746 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1747 // This only ever adds one VGPR spill 1748 SB.MFI.addToSpilledVGPRs(1); 1749 } 1750 } 1751 1752 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, 1753 RegScavenger *RS, SlotIndexes *Indexes, 1754 LiveIntervals *LIS, bool OnlyToVGPR, 1755 bool SpillToPhysVGPRLane) const { 1756 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1757 1758 ArrayRef<SpilledReg> VGPRSpills = 1759 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) 1760 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); 1761 bool SpillToVGPR = !VGPRSpills.empty(); 1762 if (OnlyToVGPR && !SpillToVGPR) 1763 return false; 1764 1765 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() && 1766 SB.SuperReg != SB.MFI.getFrameOffsetReg())); 1767 1768 if (SpillToVGPR) { 1769 1770 assert(SB.NumSubRegs == VGPRSpills.size() && 1771 "Num of VGPR lanes should be equal to num of SGPRs spilled"); 1772 1773 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1774 Register SubReg = 1775 SB.NumSubRegs == 1 1776 ? SB.SuperReg 1777 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1778 SpilledReg Spill = VGPRSpills[i]; 1779 1780 bool IsFirstSubreg = i == 0; 1781 bool IsLastSubreg = i == SB.NumSubRegs - 1; 1782 bool UseKill = SB.IsKill && IsLastSubreg; 1783 1784 1785 // Mark the "old value of vgpr" input undef only if this is the first sgpr 1786 // spill to this specific vgpr in the first basic block. 1787 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1788 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR) 1789 .addReg(SubReg, getKillRegState(UseKill)) 1790 .addImm(Spill.Lane) 1791 .addReg(Spill.VGPR); 1792 if (Indexes) { 1793 if (IsFirstSubreg) 1794 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1795 else 1796 Indexes->insertMachineInstrInMaps(*MIB); 1797 } 1798 1799 if (IsFirstSubreg && SB.NumSubRegs > 1) { 1800 // We may be spilling a super-register which is only partially defined, 1801 // and need to ensure later spills think the value is defined. 1802 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1803 } 1804 1805 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg)) 1806 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit); 1807 1808 // FIXME: Since this spills to another register instead of an actual 1809 // frame index, we should delete the frame index when all references to 1810 // it are fixed. 1811 } 1812 } else { 1813 SB.prepare(); 1814 1815 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. 1816 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1817 1818 // Per VGPR helper data 1819 auto PVD = SB.getPerVGPRData(); 1820 1821 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1822 unsigned TmpVGPRFlags = RegState::Undef; 1823 1824 // Write sub registers into the VGPR 1825 for (unsigned i = Offset * PVD.PerVGPR, 1826 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1827 i < e; ++i) { 1828 Register SubReg = 1829 SB.NumSubRegs == 1 1830 ? 
SB.SuperReg 1831 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1832 1833 MachineInstrBuilder WriteLane = 1834 BuildMI(*SB.MBB, MI, SB.DL, 1835 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR) 1836 .addReg(SubReg, SubKillState) 1837 .addImm(i % PVD.PerVGPR) 1838 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1839 TmpVGPRFlags = 0; 1840 1841 if (Indexes) { 1842 if (i == 0) 1843 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane); 1844 else 1845 Indexes->insertMachineInstrInMaps(*WriteLane); 1846 } 1847 1848 // There could be undef components of a spilled super register. 1849 // TODO: Can we detect this and skip the spill? 1850 if (SB.NumSubRegs > 1) { 1851 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1852 unsigned SuperKillState = 0; 1853 if (i + 1 == SB.NumSubRegs) 1854 SuperKillState |= getKillRegState(SB.IsKill); 1855 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1856 } 1857 } 1858 1859 // Write out VGPR 1860 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false); 1861 } 1862 1863 SB.restore(); 1864 } 1865 1866 MI->eraseFromParent(); 1867 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1868 1869 if (LIS) 1870 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1871 1872 return true; 1873 } 1874 1875 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, 1876 RegScavenger *RS, SlotIndexes *Indexes, 1877 LiveIntervals *LIS, bool OnlyToVGPR, 1878 bool SpillToPhysVGPRLane) const { 1879 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1880 1881 ArrayRef<SpilledReg> VGPRSpills = 1882 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) 1883 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); 1884 bool SpillToVGPR = !VGPRSpills.empty(); 1885 if (OnlyToVGPR && !SpillToVGPR) 1886 return false; 1887 1888 if (SpillToVGPR) { 1889 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1890 Register SubReg = 1891 SB.NumSubRegs == 1 1892 ? SB.SuperReg 1893 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1894 1895 SpilledReg Spill = VGPRSpills[i]; 1896 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1897 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) 1898 .addReg(Spill.VGPR) 1899 .addImm(Spill.Lane); 1900 if (SB.NumSubRegs > 1 && i == 0) 1901 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1902 if (Indexes) { 1903 if (i == e - 1) 1904 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1905 else 1906 Indexes->insertMachineInstrInMaps(*MIB); 1907 } 1908 } 1909 } else { 1910 SB.prepare(); 1911 1912 // Per VGPR helper data 1913 auto PVD = SB.getPerVGPRData(); 1914 1915 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1916 // Load in VGPR data 1917 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true); 1918 1919 // Unpack lanes 1920 for (unsigned i = Offset * PVD.PerVGPR, 1921 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1922 i < e; ++i) { 1923 Register SubReg = 1924 SB.NumSubRegs == 1 1925 ? 
SB.SuperReg 1926 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1927 1928 bool LastSubReg = (i + 1 == e); 1929 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1930 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) 1931 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1932 .addImm(i); 1933 if (SB.NumSubRegs > 1 && i == 0) 1934 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1935 if (Indexes) { 1936 if (i == e - 1) 1937 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1938 else 1939 Indexes->insertMachineInstrInMaps(*MIB); 1940 } 1941 } 1942 } 1943 1944 SB.restore(); 1945 } 1946 1947 MI->eraseFromParent(); 1948 1949 if (LIS) 1950 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1951 1952 return true; 1953 } 1954 1955 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, 1956 MachineBasicBlock &RestoreMBB, 1957 Register SGPR, RegScavenger *RS) const { 1958 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0, 1959 RS); 1960 SB.prepare(); 1961 // Generate the spill of SGPR to SB.TmpVGPR. 1962 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1963 auto PVD = SB.getPerVGPRData(); 1964 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1965 unsigned TmpVGPRFlags = RegState::Undef; 1966 // Write sub registers into the VGPR 1967 for (unsigned i = Offset * PVD.PerVGPR, 1968 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1969 i < e; ++i) { 1970 Register SubReg = 1971 SB.NumSubRegs == 1 1972 ? SB.SuperReg 1973 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1974 1975 MachineInstrBuilder WriteLane = 1976 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1977 SB.TmpVGPR) 1978 .addReg(SubReg, SubKillState) 1979 .addImm(i % PVD.PerVGPR) 1980 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1981 TmpVGPRFlags = 0; 1982 // There could be undef components of a spilled super register. 1983 // TODO: Can we detect this and skip the spill? 1984 if (SB.NumSubRegs > 1) { 1985 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1986 unsigned SuperKillState = 0; 1987 if (i + 1 == SB.NumSubRegs) 1988 SuperKillState |= getKillRegState(SB.IsKill); 1989 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1990 } 1991 } 1992 // Don't need to write VGPR out. 1993 } 1994 1995 // Restore clobbered registers in the specified restore block. 1996 MI = RestoreMBB.end(); 1997 SB.setMI(&RestoreMBB, MI); 1998 // Generate the restore of SGPR from SB.TmpVGPR. 1999 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 2000 // Don't need to load VGPR in. 2001 // Unpack lanes 2002 for (unsigned i = Offset * PVD.PerVGPR, 2003 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 2004 i < e; ++i) { 2005 Register SubReg = 2006 SB.NumSubRegs == 1 2007 ? SB.SuperReg 2008 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 2009 bool LastSubReg = (i + 1 == e); 2010 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 2011 SubReg) 2012 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 2013 .addImm(i); 2014 if (SB.NumSubRegs > 1 && i == 0) 2015 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 2016 } 2017 } 2018 SB.restore(); 2019 2020 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 2021 return false; 2022 } 2023 2024 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to 2025 /// a VGPR and the stack slot can be safely eliminated when all other users are 2026 /// handled. 
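/// As an illustrative sketch (the VGPR and lane numbers are made up; the real
/// assignment comes from SIMachineFunctionInfo), a wave64 pseudo such as
///   SI_SPILL_S64_SAVE killed $sgpr4_sgpr5, %stack.0, ...
/// is expected to lower to per-lane writes along the lines of
///   $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr63
///   $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, $vgpr63
/// with the matching *_RESTORE pseudo turning into SI_RESTORE_S32_FROM_VGPR
/// reads of the same lanes.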
2027 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 2028 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, 2029 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const { 2030 switch (MI->getOpcode()) { 2031 case AMDGPU::SI_SPILL_S1024_SAVE: 2032 case AMDGPU::SI_SPILL_S512_SAVE: 2033 case AMDGPU::SI_SPILL_S384_SAVE: 2034 case AMDGPU::SI_SPILL_S352_SAVE: 2035 case AMDGPU::SI_SPILL_S320_SAVE: 2036 case AMDGPU::SI_SPILL_S288_SAVE: 2037 case AMDGPU::SI_SPILL_S256_SAVE: 2038 case AMDGPU::SI_SPILL_S224_SAVE: 2039 case AMDGPU::SI_SPILL_S192_SAVE: 2040 case AMDGPU::SI_SPILL_S160_SAVE: 2041 case AMDGPU::SI_SPILL_S128_SAVE: 2042 case AMDGPU::SI_SPILL_S96_SAVE: 2043 case AMDGPU::SI_SPILL_S64_SAVE: 2044 case AMDGPU::SI_SPILL_S32_SAVE: 2045 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); 2046 case AMDGPU::SI_SPILL_S1024_RESTORE: 2047 case AMDGPU::SI_SPILL_S512_RESTORE: 2048 case AMDGPU::SI_SPILL_S384_RESTORE: 2049 case AMDGPU::SI_SPILL_S352_RESTORE: 2050 case AMDGPU::SI_SPILL_S320_RESTORE: 2051 case AMDGPU::SI_SPILL_S288_RESTORE: 2052 case AMDGPU::SI_SPILL_S256_RESTORE: 2053 case AMDGPU::SI_SPILL_S224_RESTORE: 2054 case AMDGPU::SI_SPILL_S192_RESTORE: 2055 case AMDGPU::SI_SPILL_S160_RESTORE: 2056 case AMDGPU::SI_SPILL_S128_RESTORE: 2057 case AMDGPU::SI_SPILL_S96_RESTORE: 2058 case AMDGPU::SI_SPILL_S64_RESTORE: 2059 case AMDGPU::SI_SPILL_S32_RESTORE: 2060 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); 2061 default: 2062 llvm_unreachable("not an SGPR spill instruction"); 2063 } 2064 } 2065 2066 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 2067 int SPAdj, unsigned FIOperandNum, 2068 RegScavenger *RS) const { 2069 MachineFunction *MF = MI->getParent()->getParent(); 2070 MachineBasicBlock *MBB = MI->getParent(); 2071 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 2072 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 2073 const SIInstrInfo *TII = ST.getInstrInfo(); 2074 DebugLoc DL = MI->getDebugLoc(); 2075 2076 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 2077 2078 MachineOperand &FIOp = MI->getOperand(FIOperandNum); 2079 int Index = MI->getOperand(FIOperandNum).getIndex(); 2080 2081 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) 2082 ? 
getBaseRegister() 2083 : getFrameRegister(*MF); 2084 2085 switch (MI->getOpcode()) { 2086 // SGPR register spill 2087 case AMDGPU::SI_SPILL_S1024_SAVE: 2088 case AMDGPU::SI_SPILL_S512_SAVE: 2089 case AMDGPU::SI_SPILL_S384_SAVE: 2090 case AMDGPU::SI_SPILL_S352_SAVE: 2091 case AMDGPU::SI_SPILL_S320_SAVE: 2092 case AMDGPU::SI_SPILL_S288_SAVE: 2093 case AMDGPU::SI_SPILL_S256_SAVE: 2094 case AMDGPU::SI_SPILL_S224_SAVE: 2095 case AMDGPU::SI_SPILL_S192_SAVE: 2096 case AMDGPU::SI_SPILL_S160_SAVE: 2097 case AMDGPU::SI_SPILL_S128_SAVE: 2098 case AMDGPU::SI_SPILL_S96_SAVE: 2099 case AMDGPU::SI_SPILL_S64_SAVE: 2100 case AMDGPU::SI_SPILL_S32_SAVE: { 2101 return spillSGPR(MI, Index, RS); 2102 } 2103 2104 // SGPR register restore 2105 case AMDGPU::SI_SPILL_S1024_RESTORE: 2106 case AMDGPU::SI_SPILL_S512_RESTORE: 2107 case AMDGPU::SI_SPILL_S384_RESTORE: 2108 case AMDGPU::SI_SPILL_S352_RESTORE: 2109 case AMDGPU::SI_SPILL_S320_RESTORE: 2110 case AMDGPU::SI_SPILL_S288_RESTORE: 2111 case AMDGPU::SI_SPILL_S256_RESTORE: 2112 case AMDGPU::SI_SPILL_S224_RESTORE: 2113 case AMDGPU::SI_SPILL_S192_RESTORE: 2114 case AMDGPU::SI_SPILL_S160_RESTORE: 2115 case AMDGPU::SI_SPILL_S128_RESTORE: 2116 case AMDGPU::SI_SPILL_S96_RESTORE: 2117 case AMDGPU::SI_SPILL_S64_RESTORE: 2118 case AMDGPU::SI_SPILL_S32_RESTORE: { 2119 return restoreSGPR(MI, Index, RS); 2120 } 2121 2122 // VGPR register spill 2123 case AMDGPU::SI_SPILL_V1024_SAVE: 2124 case AMDGPU::SI_SPILL_V512_SAVE: 2125 case AMDGPU::SI_SPILL_V384_SAVE: 2126 case AMDGPU::SI_SPILL_V352_SAVE: 2127 case AMDGPU::SI_SPILL_V320_SAVE: 2128 case AMDGPU::SI_SPILL_V288_SAVE: 2129 case AMDGPU::SI_SPILL_V256_SAVE: 2130 case AMDGPU::SI_SPILL_V224_SAVE: 2131 case AMDGPU::SI_SPILL_V192_SAVE: 2132 case AMDGPU::SI_SPILL_V160_SAVE: 2133 case AMDGPU::SI_SPILL_V128_SAVE: 2134 case AMDGPU::SI_SPILL_V96_SAVE: 2135 case AMDGPU::SI_SPILL_V64_SAVE: 2136 case AMDGPU::SI_SPILL_V32_SAVE: 2137 case AMDGPU::SI_SPILL_A1024_SAVE: 2138 case AMDGPU::SI_SPILL_A512_SAVE: 2139 case AMDGPU::SI_SPILL_A384_SAVE: 2140 case AMDGPU::SI_SPILL_A352_SAVE: 2141 case AMDGPU::SI_SPILL_A320_SAVE: 2142 case AMDGPU::SI_SPILL_A288_SAVE: 2143 case AMDGPU::SI_SPILL_A256_SAVE: 2144 case AMDGPU::SI_SPILL_A224_SAVE: 2145 case AMDGPU::SI_SPILL_A192_SAVE: 2146 case AMDGPU::SI_SPILL_A160_SAVE: 2147 case AMDGPU::SI_SPILL_A128_SAVE: 2148 case AMDGPU::SI_SPILL_A96_SAVE: 2149 case AMDGPU::SI_SPILL_A64_SAVE: 2150 case AMDGPU::SI_SPILL_A32_SAVE: 2151 case AMDGPU::SI_SPILL_AV1024_SAVE: 2152 case AMDGPU::SI_SPILL_AV512_SAVE: 2153 case AMDGPU::SI_SPILL_AV384_SAVE: 2154 case AMDGPU::SI_SPILL_AV352_SAVE: 2155 case AMDGPU::SI_SPILL_AV320_SAVE: 2156 case AMDGPU::SI_SPILL_AV288_SAVE: 2157 case AMDGPU::SI_SPILL_AV256_SAVE: 2158 case AMDGPU::SI_SPILL_AV224_SAVE: 2159 case AMDGPU::SI_SPILL_AV192_SAVE: 2160 case AMDGPU::SI_SPILL_AV160_SAVE: 2161 case AMDGPU::SI_SPILL_AV128_SAVE: 2162 case AMDGPU::SI_SPILL_AV96_SAVE: 2163 case AMDGPU::SI_SPILL_AV64_SAVE: 2164 case AMDGPU::SI_SPILL_AV32_SAVE: 2165 case AMDGPU::SI_SPILL_WWM_V32_SAVE: 2166 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: { 2167 const MachineOperand *VData = TII->getNamedOperand(*MI, 2168 AMDGPU::OpName::vdata); 2169 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2170 MFI->getStackPtrOffsetReg()); 2171 2172 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR 2173 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 2174 auto *MBB = MI->getParent(); 2175 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); 2176 if (IsWWMRegSpill) { 2177 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), 2178 RS->isRegUsed(AMDGPU::SCC)); 2179 } 2180 buildSpillLoadStore( 2181 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 2182 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 2183 *MI->memoperands_begin(), RS); 2184 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); 2185 if (IsWWMRegSpill) 2186 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); 2187 2188 MI->eraseFromParent(); 2189 return true; 2190 } 2191 case AMDGPU::SI_SPILL_V32_RESTORE: 2192 case AMDGPU::SI_SPILL_V64_RESTORE: 2193 case AMDGPU::SI_SPILL_V96_RESTORE: 2194 case AMDGPU::SI_SPILL_V128_RESTORE: 2195 case AMDGPU::SI_SPILL_V160_RESTORE: 2196 case AMDGPU::SI_SPILL_V192_RESTORE: 2197 case AMDGPU::SI_SPILL_V224_RESTORE: 2198 case AMDGPU::SI_SPILL_V256_RESTORE: 2199 case AMDGPU::SI_SPILL_V288_RESTORE: 2200 case AMDGPU::SI_SPILL_V320_RESTORE: 2201 case AMDGPU::SI_SPILL_V352_RESTORE: 2202 case AMDGPU::SI_SPILL_V384_RESTORE: 2203 case AMDGPU::SI_SPILL_V512_RESTORE: 2204 case AMDGPU::SI_SPILL_V1024_RESTORE: 2205 case AMDGPU::SI_SPILL_A32_RESTORE: 2206 case AMDGPU::SI_SPILL_A64_RESTORE: 2207 case AMDGPU::SI_SPILL_A96_RESTORE: 2208 case AMDGPU::SI_SPILL_A128_RESTORE: 2209 case AMDGPU::SI_SPILL_A160_RESTORE: 2210 case AMDGPU::SI_SPILL_A192_RESTORE: 2211 case AMDGPU::SI_SPILL_A224_RESTORE: 2212 case AMDGPU::SI_SPILL_A256_RESTORE: 2213 case AMDGPU::SI_SPILL_A288_RESTORE: 2214 case AMDGPU::SI_SPILL_A320_RESTORE: 2215 case AMDGPU::SI_SPILL_A352_RESTORE: 2216 case AMDGPU::SI_SPILL_A384_RESTORE: 2217 case AMDGPU::SI_SPILL_A512_RESTORE: 2218 case AMDGPU::SI_SPILL_A1024_RESTORE: 2219 case AMDGPU::SI_SPILL_AV32_RESTORE: 2220 case AMDGPU::SI_SPILL_AV64_RESTORE: 2221 case AMDGPU::SI_SPILL_AV96_RESTORE: 2222 case AMDGPU::SI_SPILL_AV128_RESTORE: 2223 case AMDGPU::SI_SPILL_AV160_RESTORE: 2224 case AMDGPU::SI_SPILL_AV192_RESTORE: 2225 case AMDGPU::SI_SPILL_AV224_RESTORE: 2226 case AMDGPU::SI_SPILL_AV256_RESTORE: 2227 case AMDGPU::SI_SPILL_AV288_RESTORE: 2228 case AMDGPU::SI_SPILL_AV320_RESTORE: 2229 case AMDGPU::SI_SPILL_AV352_RESTORE: 2230 case AMDGPU::SI_SPILL_AV384_RESTORE: 2231 case AMDGPU::SI_SPILL_AV512_RESTORE: 2232 case AMDGPU::SI_SPILL_AV1024_RESTORE: 2233 case AMDGPU::SI_SPILL_WWM_V32_RESTORE: 2234 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: { 2235 const MachineOperand *VData = TII->getNamedOperand(*MI, 2236 AMDGPU::OpName::vdata); 2237 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2238 MFI->getStackPtrOffsetReg()); 2239 2240 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR 2241 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 2242 auto *MBB = MI->getParent(); 2243 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); 2244 if (IsWWMRegSpill) { 2245 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), 2246 RS->isRegUsed(AMDGPU::SCC)); 2247 } 2248 2249 buildSpillLoadStore( 2250 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 2251 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 2252 *MI->memoperands_begin(), RS); 2253 2254 if (IsWWMRegSpill) 2255 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); 2256 2257 MI->eraseFromParent(); 2258 return true; 2259 } 2260 2261 default: { 2262 // Other access to frame index 2263 const DebugLoc &DL = MI->getDebugLoc(); 2264 2265 int64_t Offset = FrameInfo.getObjectOffset(Index); 2266 if (ST.enableFlatScratch()) { 2267 if (TII->isFLATScratch(*MI)) { 2268 assert((int16_t)FIOperandNum == 2269 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 2270 AMDGPU::OpName::saddr)); 2271 2272 // The offset is always swizzled, just replace it 2273 if (FrameReg) 2274 FIOp.ChangeToRegister(FrameReg, false); 2275 2276 if (!Offset) 2277 return false; 2278 2279 MachineOperand *OffsetOp = 2280 TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 2281 int64_t NewOffset = Offset + OffsetOp->getImm(); 2282 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 2283 SIInstrFlags::FlatScratch)) { 2284 OffsetOp->setImm(NewOffset); 2285 if (FrameReg) 2286 return false; 2287 Offset = 0; 2288 } 2289 2290 if (!Offset) { 2291 unsigned Opc = MI->getOpcode(); 2292 int NewOpc = -1; 2293 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) { 2294 NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc); 2295 } else if (ST.hasFlatScratchSTMode()) { 2296 // On GFX10 we have ST mode to use no registers for an address. 2297 // Otherwise we need to materialize 0 into an SGPR. 2298 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc); 2299 } 2300 2301 if (NewOpc != -1) { 2302 // removeOperand doesn't fixup tied operand indexes as it goes, so 2303 // it asserts. Untie vdst_in for now and retie them afterwards. 2304 int VDstIn = AMDGPU::getNamedOperandIdx(Opc, 2305 AMDGPU::OpName::vdst_in); 2306 bool TiedVDst = VDstIn != -1 && 2307 MI->getOperand(VDstIn).isReg() && 2308 MI->getOperand(VDstIn).isTied(); 2309 if (TiedVDst) 2310 MI->untieRegOperand(VDstIn); 2311 2312 MI->removeOperand( 2313 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr)); 2314 2315 if (TiedVDst) { 2316 int NewVDst = 2317 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); 2318 int NewVDstIn = 2319 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in); 2320 assert (NewVDst != -1 && NewVDstIn != -1 && "Must be tied!"); 2321 MI->tieOperands(NewVDst, NewVDstIn); 2322 } 2323 MI->setDesc(TII->get(NewOpc)); 2324 return false; 2325 } 2326 } 2327 } 2328 2329 if (!FrameReg) { 2330 FIOp.ChangeToImmediate(Offset); 2331 if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) 2332 return false; 2333 } 2334 2335 // We need to use register here. Check if we can use an SGPR or need 2336 // a VGPR. 2337 FIOp.ChangeToRegister(AMDGPU::M0, false); 2338 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp); 2339 2340 if (!Offset && FrameReg && UseSGPR) { 2341 FIOp.setReg(FrameReg); 2342 return false; 2343 } 2344 2345 const TargetRegisterClass *RC = UseSGPR ? 
&AMDGPU::SReg_32_XM0RegClass 2346 : &AMDGPU::VGPR_32RegClass; 2347 2348 Register TmpReg = 2349 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR); 2350 FIOp.setReg(TmpReg); 2351 FIOp.setIsKill(); 2352 2353 if ((!FrameReg || !Offset) && TmpReg) { 2354 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 2355 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg); 2356 if (FrameReg) 2357 MIB.addReg(FrameReg); 2358 else 2359 MIB.addImm(Offset); 2360 2361 return false; 2362 } 2363 2364 bool NeedSaveSCC = 2365 RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC); 2366 2367 Register TmpSReg = 2368 UseSGPR ? TmpReg 2369 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, 2370 MI, false, 0, !UseSGPR); 2371 2372 // TODO: for flat scratch another attempt can be made with a VGPR index 2373 // if no SGPRs can be scavenged. 2374 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) 2375 report_fatal_error("Cannot scavenge register in FI elimination!"); 2376 2377 if (!TmpSReg) { 2378 // Use frame register and restore it after. 2379 TmpSReg = FrameReg; 2380 FIOp.setReg(FrameReg); 2381 FIOp.setIsKill(false); 2382 } 2383 2384 if (NeedSaveSCC) { 2385 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!"); 2386 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg) 2387 .addReg(FrameReg) 2388 .addImm(Offset); 2389 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32)) 2390 .addReg(TmpSReg) 2391 .addImm(0); 2392 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg) 2393 .addImm(0) 2394 .addReg(TmpSReg); 2395 } else { 2396 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg) 2397 .addReg(FrameReg) 2398 .addImm(Offset); 2399 } 2400 2401 if (!UseSGPR) 2402 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2403 .addReg(TmpSReg, RegState::Kill); 2404 2405 if (TmpSReg == FrameReg) { 2406 // Undo frame register modification. 2407 if (NeedSaveSCC && !MI->registerDefIsDead(AMDGPU::SCC)) { 2408 MachineBasicBlock::iterator I = 2409 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32), 2410 TmpSReg) 2411 .addReg(FrameReg) 2412 .addImm(-Offset); 2413 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32)) 2414 .addReg(TmpSReg) 2415 .addImm(0); 2416 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32), 2417 TmpSReg) 2418 .addImm(0) 2419 .addReg(TmpSReg); 2420 } else { 2421 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32), 2422 FrameReg) 2423 .addReg(FrameReg) 2424 .addImm(-Offset); 2425 } 2426 } 2427 2428 return false; 2429 } 2430 2431 bool IsMUBUF = TII->isMUBUF(*MI); 2432 2433 if (!IsMUBUF && !MFI->isBottomOfStack()) { 2434 // Convert to a swizzled stack address by scaling by the wave size. 2435 // In an entry function/kernel the offset is already swizzled. 2436 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); 2437 bool LiveSCC = 2438 RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC); 2439 const TargetRegisterClass *RC = IsSALU && !LiveSCC 2440 ? &AMDGPU::SReg_32RegClass 2441 : &AMDGPU::VGPR_32RegClass; 2442 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 || 2443 MI->getOpcode() == AMDGPU::V_MOV_B32_e64; 2444 Register ResultReg = 2445 IsCopy ? MI->getOperand(0).getReg() 2446 : RS->scavengeRegisterBackwards(*RC, MI, false, 0); 2447 2448 int64_t Offset = FrameInfo.getObjectOffset(Index); 2449 if (Offset == 0) { 2450 unsigned OpCode = IsSALU && !LiveSCC ? 
AMDGPU::S_LSHR_B32 2451 : AMDGPU::V_LSHRREV_B32_e64; 2452 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg); 2453 if (OpCode == AMDGPU::V_LSHRREV_B32_e64) 2454 // For V_LSHRREV, the operands are reversed (the shift count goes 2455 // first). 2456 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg); 2457 else 2458 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2()); 2459 if (IsSALU && !LiveSCC) 2460 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead. 2461 if (IsSALU && LiveSCC) { 2462 Register NewDest = RS->scavengeRegisterBackwards( 2463 AMDGPU::SReg_32RegClass, Shift, false, 0); 2464 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 2465 NewDest) 2466 .addReg(ResultReg); 2467 ResultReg = NewDest; 2468 } 2469 } else { 2470 MachineInstrBuilder MIB; 2471 if (!IsSALU) { 2472 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) != 2473 nullptr) { 2474 // Reuse ResultReg in intermediate step. 2475 Register ScaledReg = ResultReg; 2476 2477 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 2478 ScaledReg) 2479 .addImm(ST.getWavefrontSizeLog2()) 2480 .addReg(FrameReg); 2481 2482 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; 2483 2484 // TODO: Fold if use instruction is another add of a constant. 2485 if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 2486 // FIXME: This can fail 2487 MIB.addImm(Offset); 2488 MIB.addReg(ScaledReg, RegState::Kill); 2489 if (!IsVOP2) 2490 MIB.addImm(0); // clamp bit 2491 } else { 2492 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && 2493 "Need to reuse carry out register"); 2494 2495 // Use scavenged unused carry out as offset register. 2496 Register ConstOffsetReg; 2497 if (!isWave32) 2498 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); 2499 else 2500 ConstOffsetReg = MIB.getReg(1); 2501 2502 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) 2503 .addImm(Offset); 2504 MIB.addReg(ConstOffsetReg, RegState::Kill); 2505 MIB.addReg(ScaledReg, RegState::Kill); 2506 MIB.addImm(0); // clamp bit 2507 } 2508 } 2509 } 2510 if (!MIB || IsSALU) { 2511 // We have to produce a carry out, and there isn't a free SGPR pair 2512 // for it. We can keep the whole computation on the SALU to avoid 2513 // clobbering an additional register at the cost of an extra mov. 2514 2515 // We may have 1 free scratch SGPR even though a carry out is 2516 // unavailable. Only one additional mov is needed. 2517 Register TmpScaledReg = RS->scavengeRegisterBackwards( 2518 AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false); 2519 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; 2520 2521 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) 2522 .addReg(FrameReg) 2523 .addImm(ST.getWavefrontSizeLog2()); 2524 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 2525 .addReg(ScaledReg, RegState::Kill) 2526 .addImm(Offset); 2527 if (!IsSALU) 2528 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) 2529 .addReg(ScaledReg, RegState::Kill); 2530 else 2531 ResultReg = ScaledReg; 2532 2533 // If there were truly no free SGPRs, we need to undo everything. 
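// As a hedged sketch of this fallback (registers and the offset are
// illustrative): with FrameReg == ScaledReg == $sgpr32, wave64 and
// Offset == 16, the block above plus the undo below expand to roughly
//   s_lshr_b32 s32, s32, 6
//   s_add_i32  s32, s32, 16
//   ...        ; s32 used as the materialized address
//   s_add_i32  s32, s32, -16
//   s_lshl_b32 s32, s32, 6
// so the frame register ends up back at its swizzled value.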
2534 if (!TmpScaledReg.isValid()) { 2535 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 2536 .addReg(ScaledReg, RegState::Kill) 2537 .addImm(-Offset); 2538 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) 2539 .addReg(FrameReg) 2540 .addImm(ST.getWavefrontSizeLog2()); 2541 } 2542 } 2543 } 2544 2545 // Don't introduce an extra copy if we're just materializing in a mov. 2546 if (IsCopy) { 2547 MI->eraseFromParent(); 2548 return true; 2549 } 2550 FIOp.ChangeToRegister(ResultReg, false, false, true); 2551 return false; 2552 } 2553 2554 if (IsMUBUF) { 2555 // Disable offen so we don't need a 0 vgpr base. 2556 assert(static_cast<int>(FIOperandNum) == 2557 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 2558 AMDGPU::OpName::vaddr)); 2559 2560 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset); 2561 assert((SOffset.isImm() && SOffset.getImm() == 0)); 2562 2563 if (FrameReg != AMDGPU::NoRegister) 2564 SOffset.ChangeToRegister(FrameReg, false); 2565 2566 int64_t Offset = FrameInfo.getObjectOffset(Index); 2567 int64_t OldImm 2568 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); 2569 int64_t NewOffset = OldImm + Offset; 2570 2571 if (TII->isLegalMUBUFImmOffset(NewOffset) && 2572 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { 2573 MI->eraseFromParent(); 2574 return true; 2575 } 2576 } 2577 2578 // If the offset is simply too big, don't convert to a scratch wave offset 2579 // relative index. 2580 2581 FIOp.ChangeToImmediate(Offset); 2582 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { 2583 Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, 2584 MI, false, 0); 2585 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2586 .addImm(Offset); 2587 FIOp.ChangeToRegister(TmpReg, false, false, true); 2588 } 2589 } 2590 } 2591 return false; 2592 } 2593 2594 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { 2595 return AMDGPUInstPrinter::getRegisterName(Reg); 2596 } 2597 2598 unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) { 2599 return getRegBitWidth(RC.getID()); 2600 } 2601 2602 static const TargetRegisterClass * 2603 getAnyVGPRClassForBitWidth(unsigned BitWidth) { 2604 if (BitWidth == 64) 2605 return &AMDGPU::VReg_64RegClass; 2606 if (BitWidth == 96) 2607 return &AMDGPU::VReg_96RegClass; 2608 if (BitWidth == 128) 2609 return &AMDGPU::VReg_128RegClass; 2610 if (BitWidth == 160) 2611 return &AMDGPU::VReg_160RegClass; 2612 if (BitWidth == 192) 2613 return &AMDGPU::VReg_192RegClass; 2614 if (BitWidth == 224) 2615 return &AMDGPU::VReg_224RegClass; 2616 if (BitWidth == 256) 2617 return &AMDGPU::VReg_256RegClass; 2618 if (BitWidth == 288) 2619 return &AMDGPU::VReg_288RegClass; 2620 if (BitWidth == 320) 2621 return &AMDGPU::VReg_320RegClass; 2622 if (BitWidth == 352) 2623 return &AMDGPU::VReg_352RegClass; 2624 if (BitWidth == 384) 2625 return &AMDGPU::VReg_384RegClass; 2626 if (BitWidth == 512) 2627 return &AMDGPU::VReg_512RegClass; 2628 if (BitWidth == 1024) 2629 return &AMDGPU::VReg_1024RegClass; 2630 2631 return nullptr; 2632 } 2633 2634 static const TargetRegisterClass * 2635 getAlignedVGPRClassForBitWidth(unsigned BitWidth) { 2636 if (BitWidth == 64) 2637 return &AMDGPU::VReg_64_Align2RegClass; 2638 if (BitWidth == 96) 2639 return &AMDGPU::VReg_96_Align2RegClass; 2640 if (BitWidth == 128) 2641 return &AMDGPU::VReg_128_Align2RegClass; 2642 if (BitWidth == 160) 2643 return &AMDGPU::VReg_160_Align2RegClass; 2644 if (BitWidth == 192) 2645 return 
&AMDGPU::VReg_192_Align2RegClass; 2646 if (BitWidth == 224) 2647 return &AMDGPU::VReg_224_Align2RegClass; 2648 if (BitWidth == 256) 2649 return &AMDGPU::VReg_256_Align2RegClass; 2650 if (BitWidth == 288) 2651 return &AMDGPU::VReg_288_Align2RegClass; 2652 if (BitWidth == 320) 2653 return &AMDGPU::VReg_320_Align2RegClass; 2654 if (BitWidth == 352) 2655 return &AMDGPU::VReg_352_Align2RegClass; 2656 if (BitWidth == 384) 2657 return &AMDGPU::VReg_384_Align2RegClass; 2658 if (BitWidth == 512) 2659 return &AMDGPU::VReg_512_Align2RegClass; 2660 if (BitWidth == 1024) 2661 return &AMDGPU::VReg_1024_Align2RegClass; 2662 2663 return nullptr; 2664 } 2665 2666 const TargetRegisterClass * 2667 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { 2668 if (BitWidth == 1) 2669 return &AMDGPU::VReg_1RegClass; 2670 if (BitWidth == 16) 2671 return &AMDGPU::VGPR_16RegClass; 2672 if (BitWidth == 32) 2673 return &AMDGPU::VGPR_32RegClass; 2674 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth) 2675 : getAnyVGPRClassForBitWidth(BitWidth); 2676 } 2677 2678 static const TargetRegisterClass * 2679 getAnyAGPRClassForBitWidth(unsigned BitWidth) { 2680 if (BitWidth == 64) 2681 return &AMDGPU::AReg_64RegClass; 2682 if (BitWidth == 96) 2683 return &AMDGPU::AReg_96RegClass; 2684 if (BitWidth == 128) 2685 return &AMDGPU::AReg_128RegClass; 2686 if (BitWidth == 160) 2687 return &AMDGPU::AReg_160RegClass; 2688 if (BitWidth == 192) 2689 return &AMDGPU::AReg_192RegClass; 2690 if (BitWidth == 224) 2691 return &AMDGPU::AReg_224RegClass; 2692 if (BitWidth == 256) 2693 return &AMDGPU::AReg_256RegClass; 2694 if (BitWidth == 288) 2695 return &AMDGPU::AReg_288RegClass; 2696 if (BitWidth == 320) 2697 return &AMDGPU::AReg_320RegClass; 2698 if (BitWidth == 352) 2699 return &AMDGPU::AReg_352RegClass; 2700 if (BitWidth == 384) 2701 return &AMDGPU::AReg_384RegClass; 2702 if (BitWidth == 512) 2703 return &AMDGPU::AReg_512RegClass; 2704 if (BitWidth == 1024) 2705 return &AMDGPU::AReg_1024RegClass; 2706 2707 return nullptr; 2708 } 2709 2710 static const TargetRegisterClass * 2711 getAlignedAGPRClassForBitWidth(unsigned BitWidth) { 2712 if (BitWidth == 64) 2713 return &AMDGPU::AReg_64_Align2RegClass; 2714 if (BitWidth == 96) 2715 return &AMDGPU::AReg_96_Align2RegClass; 2716 if (BitWidth == 128) 2717 return &AMDGPU::AReg_128_Align2RegClass; 2718 if (BitWidth == 160) 2719 return &AMDGPU::AReg_160_Align2RegClass; 2720 if (BitWidth == 192) 2721 return &AMDGPU::AReg_192_Align2RegClass; 2722 if (BitWidth == 224) 2723 return &AMDGPU::AReg_224_Align2RegClass; 2724 if (BitWidth == 256) 2725 return &AMDGPU::AReg_256_Align2RegClass; 2726 if (BitWidth == 288) 2727 return &AMDGPU::AReg_288_Align2RegClass; 2728 if (BitWidth == 320) 2729 return &AMDGPU::AReg_320_Align2RegClass; 2730 if (BitWidth == 352) 2731 return &AMDGPU::AReg_352_Align2RegClass; 2732 if (BitWidth == 384) 2733 return &AMDGPU::AReg_384_Align2RegClass; 2734 if (BitWidth == 512) 2735 return &AMDGPU::AReg_512_Align2RegClass; 2736 if (BitWidth == 1024) 2737 return &AMDGPU::AReg_1024_Align2RegClass; 2738 2739 return nullptr; 2740 } 2741 2742 const TargetRegisterClass * 2743 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { 2744 if (BitWidth == 16) 2745 return &AMDGPU::AGPR_LO16RegClass; 2746 if (BitWidth == 32) 2747 return &AMDGPU::AGPR_32RegClass; 2748 return ST.needsAlignedVGPRs() ? 
getAlignedAGPRClassForBitWidth(BitWidth) 2749 : getAnyAGPRClassForBitWidth(BitWidth); 2750 } 2751 2752 static const TargetRegisterClass * 2753 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { 2754 if (BitWidth == 64) 2755 return &AMDGPU::AV_64RegClass; 2756 if (BitWidth == 96) 2757 return &AMDGPU::AV_96RegClass; 2758 if (BitWidth == 128) 2759 return &AMDGPU::AV_128RegClass; 2760 if (BitWidth == 160) 2761 return &AMDGPU::AV_160RegClass; 2762 if (BitWidth == 192) 2763 return &AMDGPU::AV_192RegClass; 2764 if (BitWidth == 224) 2765 return &AMDGPU::AV_224RegClass; 2766 if (BitWidth == 256) 2767 return &AMDGPU::AV_256RegClass; 2768 if (BitWidth == 288) 2769 return &AMDGPU::AV_288RegClass; 2770 if (BitWidth == 320) 2771 return &AMDGPU::AV_320RegClass; 2772 if (BitWidth == 352) 2773 return &AMDGPU::AV_352RegClass; 2774 if (BitWidth == 384) 2775 return &AMDGPU::AV_384RegClass; 2776 if (BitWidth == 512) 2777 return &AMDGPU::AV_512RegClass; 2778 if (BitWidth == 1024) 2779 return &AMDGPU::AV_1024RegClass; 2780 2781 return nullptr; 2782 } 2783 2784 static const TargetRegisterClass * 2785 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { 2786 if (BitWidth == 64) 2787 return &AMDGPU::AV_64_Align2RegClass; 2788 if (BitWidth == 96) 2789 return &AMDGPU::AV_96_Align2RegClass; 2790 if (BitWidth == 128) 2791 return &AMDGPU::AV_128_Align2RegClass; 2792 if (BitWidth == 160) 2793 return &AMDGPU::AV_160_Align2RegClass; 2794 if (BitWidth == 192) 2795 return &AMDGPU::AV_192_Align2RegClass; 2796 if (BitWidth == 224) 2797 return &AMDGPU::AV_224_Align2RegClass; 2798 if (BitWidth == 256) 2799 return &AMDGPU::AV_256_Align2RegClass; 2800 if (BitWidth == 288) 2801 return &AMDGPU::AV_288_Align2RegClass; 2802 if (BitWidth == 320) 2803 return &AMDGPU::AV_320_Align2RegClass; 2804 if (BitWidth == 352) 2805 return &AMDGPU::AV_352_Align2RegClass; 2806 if (BitWidth == 384) 2807 return &AMDGPU::AV_384_Align2RegClass; 2808 if (BitWidth == 512) 2809 return &AMDGPU::AV_512_Align2RegClass; 2810 if (BitWidth == 1024) 2811 return &AMDGPU::AV_1024_Align2RegClass; 2812 2813 return nullptr; 2814 } 2815 2816 const TargetRegisterClass * 2817 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { 2818 if (BitWidth == 32) 2819 return &AMDGPU::AV_32RegClass; 2820 return ST.needsAlignedVGPRs() 2821 ? 
getAlignedVectorSuperClassForBitWidth(BitWidth) 2822 : getAnyVectorSuperClassForBitWidth(BitWidth); 2823 } 2824 2825 const TargetRegisterClass * 2826 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { 2827 if (BitWidth == 16) 2828 return &AMDGPU::SGPR_LO16RegClass; 2829 if (BitWidth == 32) 2830 return &AMDGPU::SReg_32RegClass; 2831 if (BitWidth == 64) 2832 return &AMDGPU::SReg_64RegClass; 2833 if (BitWidth == 96) 2834 return &AMDGPU::SGPR_96RegClass; 2835 if (BitWidth == 128) 2836 return &AMDGPU::SGPR_128RegClass; 2837 if (BitWidth == 160) 2838 return &AMDGPU::SGPR_160RegClass; 2839 if (BitWidth == 192) 2840 return &AMDGPU::SGPR_192RegClass; 2841 if (BitWidth == 224) 2842 return &AMDGPU::SGPR_224RegClass; 2843 if (BitWidth == 256) 2844 return &AMDGPU::SGPR_256RegClass; 2845 if (BitWidth == 288) 2846 return &AMDGPU::SGPR_288RegClass; 2847 if (BitWidth == 320) 2848 return &AMDGPU::SGPR_320RegClass; 2849 if (BitWidth == 352) 2850 return &AMDGPU::SGPR_352RegClass; 2851 if (BitWidth == 384) 2852 return &AMDGPU::SGPR_384RegClass; 2853 if (BitWidth == 512) 2854 return &AMDGPU::SGPR_512RegClass; 2855 if (BitWidth == 1024) 2856 return &AMDGPU::SGPR_1024RegClass; 2857 2858 return nullptr; 2859 } 2860 2861 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI, 2862 Register Reg) const { 2863 const TargetRegisterClass *RC; 2864 if (Reg.isVirtual()) 2865 RC = MRI.getRegClass(Reg); 2866 else 2867 RC = getPhysRegBaseClass(Reg); 2868 return RC ? isSGPRClass(RC) : false; 2869 } 2870 2871 const TargetRegisterClass * 2872 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { 2873 unsigned Size = getRegSizeInBits(*SRC); 2874 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); 2875 assert(VRC && "Invalid register class size"); 2876 return VRC; 2877 } 2878 2879 const TargetRegisterClass * 2880 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { 2881 unsigned Size = getRegSizeInBits(*SRC); 2882 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); 2883 assert(ARC && "Invalid register class size"); 2884 return ARC; 2885 } 2886 2887 const TargetRegisterClass * 2888 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { 2889 unsigned Size = getRegSizeInBits(*VRC); 2890 if (Size == 32) 2891 return &AMDGPU::SGPR_32RegClass; 2892 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size); 2893 assert(SRC && "Invalid register class size"); 2894 return SRC; 2895 } 2896 2897 const TargetRegisterClass * 2898 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, 2899 const TargetRegisterClass *SubRC, 2900 unsigned SubIdx) const { 2901 // Ensure this subregister index is aligned in the super register. 2902 const TargetRegisterClass *MatchRC = 2903 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx); 2904 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? 
MatchRC : nullptr; 2905 } 2906 2907 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { 2908 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && 2909 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) 2910 return !ST.hasMFMAInlineLiteralBug(); 2911 2912 return OpType >= AMDGPU::OPERAND_SRC_FIRST && 2913 OpType <= AMDGPU::OPERAND_SRC_LAST; 2914 } 2915 2916 bool SIRegisterInfo::shouldRewriteCopySrc( 2917 const TargetRegisterClass *DefRC, 2918 unsigned DefSubReg, 2919 const TargetRegisterClass *SrcRC, 2920 unsigned SrcSubReg) const { 2921 // We want to prefer the smallest register class possible, so we don't want to 2922 // stop and rewrite on anything that looks like a subregister 2923 // extract. Operations mostly don't care about the super register class, so we 2924 // only want to stop on the most basic of copies between the same register 2925 // class. 2926 // 2927 // e.g. if we have something like 2928 // %0 = ... 2929 // %1 = ... 2930 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 2931 // %3 = COPY %2, sub0 2932 // 2933 // We want to look through the COPY to find: 2934 // => %3 = COPY %0 2935 2936 // Plain copy. 2937 return getCommonSubClass(DefRC, SrcRC) != nullptr; 2938 } 2939 2940 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { 2941 // TODO: 64-bit operands have extending behavior from 32-bit literal. 2942 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST && 2943 OpType <= AMDGPU::OPERAND_REG_IMM_LAST; 2944 } 2945 2946 /// Returns a lowest register that is not used at any point in the function. 2947 /// If all registers are used, then this function will return 2948 /// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return 2949 /// highest unused register. 2950 MCRegister SIRegisterInfo::findUnusedRegister( 2951 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, 2952 const MachineFunction &MF, bool ReserveHighestRegister) const { 2953 if (ReserveHighestRegister) { 2954 for (MCRegister Reg : reverse(*RC)) 2955 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 2956 return Reg; 2957 } else { 2958 for (MCRegister Reg : *RC) 2959 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 2960 return Reg; 2961 } 2962 return MCRegister(); 2963 } 2964 2965 bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI, 2966 const RegisterBankInfo &RBI, 2967 Register Reg) const { 2968 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo()); 2969 if (!RB) 2970 return false; 2971 2972 return !RBI.isDivergentRegBank(RB); 2973 } 2974 2975 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, 2976 unsigned EltSize) const { 2977 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC); 2978 assert(RegBitWidth >= 32 && RegBitWidth <= 1024); 2979 2980 const unsigned RegDWORDs = RegBitWidth / 32; 2981 const unsigned EltDWORDs = EltSize / 4; 2982 assert(RegSplitParts.size() + 1 >= EltDWORDs); 2983 2984 const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1]; 2985 const unsigned NumParts = RegDWORDs / EltDWORDs; 2986 2987 return ArrayRef(Parts.data(), NumParts); 2988 } 2989 2990 const TargetRegisterClass* 2991 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, 2992 Register Reg) const { 2993 return Reg.isVirtual() ? 
MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg); 2994 } 2995 2996 const TargetRegisterClass * 2997 SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI, 2998 const MachineOperand &MO) const { 2999 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg()); 3000 return getSubRegisterClass(SrcRC, MO.getSubReg()); 3001 } 3002 3003 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, 3004 Register Reg) const { 3005 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 3006 // Registers without classes are unaddressable, SGPR-like registers. 3007 return RC && isVGPRClass(RC); 3008 } 3009 3010 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, 3011 Register Reg) const { 3012 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 3013 3014 // Registers without classes are unaddressable, SGPR-like registers. 3015 return RC && isAGPRClass(RC); 3016 } 3017 3018 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, 3019 const TargetRegisterClass *SrcRC, 3020 unsigned SubReg, 3021 const TargetRegisterClass *DstRC, 3022 unsigned DstSubReg, 3023 const TargetRegisterClass *NewRC, 3024 LiveIntervals &LIS) const { 3025 unsigned SrcSize = getRegSizeInBits(*SrcRC); 3026 unsigned DstSize = getRegSizeInBits(*DstRC); 3027 unsigned NewSize = getRegSizeInBits(*NewRC); 3028 3029 // Do not increase size of registers beyond dword, we would need to allocate 3030 // adjacent registers and constraint regalloc more than needed. 3031 3032 // Always allow dword coalescing. 3033 if (SrcSize <= 32 || DstSize <= 32) 3034 return true; 3035 3036 return NewSize <= DstSize || NewSize <= SrcSize; 3037 } 3038 3039 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, 3040 MachineFunction &MF) const { 3041 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 3042 3043 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), 3044 MF.getFunction()); 3045 switch (RC->getID()) { 3046 default: 3047 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); 3048 case AMDGPU::VGPR_32RegClassID: 3049 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); 3050 case AMDGPU::SGPR_32RegClassID: 3051 case AMDGPU::SGPR_LO16RegClassID: 3052 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); 3053 } 3054 } 3055 3056 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, 3057 unsigned Idx) const { 3058 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || 3059 Idx == AMDGPU::RegisterPressureSets::AGPR_32) 3060 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 3061 const_cast<MachineFunction &>(MF)); 3062 3063 if (Idx == AMDGPU::RegisterPressureSets::SReg_32) 3064 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, 3065 const_cast<MachineFunction &>(MF)); 3066 3067 llvm_unreachable("Unexpected register pressure set!"); 3068 } 3069 3070 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { 3071 static const int Empty[] = { -1 }; 3072 3073 if (RegPressureIgnoredUnits[RegUnit]) 3074 return Empty; 3075 3076 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); 3077 } 3078 3079 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { 3080 // Not a callee saved register. 
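// By convention the s_swappc_b64 used for calls writes the return address
// into this s[30:31] pair; since it is not in the CSR list, it has to be
// preserved explicitly when it must stay live across nested calls.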
3081 return AMDGPU::SGPR30_SGPR31; 3082 } 3083 3084 const TargetRegisterClass * 3085 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 3086 const RegisterBank &RB) const { 3087 switch (RB.getID()) { 3088 case AMDGPU::VGPRRegBankID: 3089 return getVGPRClassForBitWidth( 3090 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size)); 3091 case AMDGPU::VCCRegBankID: 3092 assert(Size == 1); 3093 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 3094 : &AMDGPU::SReg_64_XEXECRegClass; 3095 case AMDGPU::SGPRRegBankID: 3096 return getSGPRClassForBitWidth(std::max(32u, Size)); 3097 case AMDGPU::AGPRRegBankID: 3098 return getAGPRClassForBitWidth(std::max(32u, Size)); 3099 default: 3100 llvm_unreachable("unknown register bank"); 3101 } 3102 } 3103 3104 const TargetRegisterClass * 3105 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 3106 const MachineRegisterInfo &MRI) const { 3107 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); 3108 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) 3109 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB); 3110 3111 if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>()) 3112 return getAllocatableClass(RC); 3113 3114 return nullptr; 3115 } 3116 3117 MCRegister SIRegisterInfo::getVCC() const { 3118 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 3119 } 3120 3121 MCRegister SIRegisterInfo::getExec() const { 3122 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 3123 } 3124 3125 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { 3126 // VGPR tuples have an alignment requirement on gfx90a variants. 3127 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass 3128 : &AMDGPU::VReg_64RegClass; 3129 } 3130 3131 const TargetRegisterClass * 3132 SIRegisterInfo::getRegClass(unsigned RCID) const { 3133 switch ((int)RCID) { 3134 case AMDGPU::SReg_1RegClassID: 3135 return getBoolRC(); 3136 case AMDGPU::SReg_1_XEXECRegClassID: 3137 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 3138 : &AMDGPU::SReg_64_XEXECRegClass; 3139 case -1: 3140 return nullptr; 3141 default: 3142 return AMDGPUGenRegisterInfo::getRegClass(RCID); 3143 } 3144 } 3145 3146 // Find reaching register definition 3147 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, 3148 MachineInstr &Use, 3149 MachineRegisterInfo &MRI, 3150 LiveIntervals *LIS) const { 3151 auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); 3152 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 3153 SlotIndex DefIdx; 3154 3155 if (Reg.isVirtual()) { 3156 if (!LIS->hasInterval(Reg)) 3157 return nullptr; 3158 LiveInterval &LI = LIS->getInterval(Reg); 3159 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 3160 : MRI.getMaxLaneMaskForVReg(Reg); 3161 VNInfo *V = nullptr; 3162 if (LI.hasSubRanges()) { 3163 for (auto &S : LI.subranges()) { 3164 if ((S.LaneMask & SubLanes) == SubLanes) { 3165 V = S.getVNInfoAt(UseIdx); 3166 break; 3167 } 3168 } 3169 } else { 3170 V = LI.getVNInfoAt(UseIdx); 3171 } 3172 if (!V) 3173 return nullptr; 3174 DefIdx = V->def; 3175 } else { 3176 // Find last def. 
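// For a physical register, walk all of its register units: every unit must
// have a value live at UseIdx (otherwise conservatively give up), and keep
// the dominance-wise latest of those defs as the candidate reaching def.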
3177 for (MCRegUnit Unit : regunits(Reg.asMCReg())) { 3178 LiveRange &LR = LIS->getRegUnit(Unit); 3179 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 3180 if (!DefIdx.isValid() || 3181 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 3182 LIS->getInstructionFromIndex(V->def))) 3183 DefIdx = V->def; 3184 } else { 3185 return nullptr; 3186 } 3187 } 3188 } 3189 3190 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 3191 3192 if (!Def || !MDT.dominates(Def, &Use)) 3193 return nullptr; 3194 3195 assert(Def->modifiesRegister(Reg, this)); 3196 3197 return Def; 3198 } 3199 3200 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { 3201 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32); 3202 3203 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, 3204 AMDGPU::SReg_32RegClass, 3205 AMDGPU::AGPR_32RegClass } ) { 3206 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC)) 3207 return Super; 3208 } 3209 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, 3210 &AMDGPU::VGPR_32RegClass)) { 3211 return Super; 3212 } 3213 3214 return AMDGPU::NoRegister; 3215 } 3216 3217 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { 3218 if (!ST.needsAlignedVGPRs()) 3219 return true; 3220 3221 if (isVGPRClass(&RC)) 3222 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); 3223 if (isAGPRClass(&RC)) 3224 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); 3225 if (isVectorSuperClass(&RC)) 3226 return RC.hasSuperClassEq( 3227 getVectorSuperClassForBitWidth(getRegSizeInBits(RC))); 3228 3229 return true; 3230 } 3231 3232 const TargetRegisterClass * 3233 SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { 3234 if (!RC || !ST.needsAlignedVGPRs()) 3235 return RC; 3236 3237 unsigned Size = getRegSizeInBits(*RC); 3238 if (Size <= 32) 3239 return RC; 3240 3241 if (isVGPRClass(RC)) 3242 return getAlignedVGPRClassForBitWidth(Size); 3243 if (isAGPRClass(RC)) 3244 return getAlignedAGPRClassForBitWidth(Size); 3245 if (isVectorSuperClass(RC)) 3246 return getAlignedVectorSuperClassForBitWidth(Size); 3247 3248 return RC; 3249 } 3250 3251 ArrayRef<MCPhysReg> 3252 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { 3253 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); 3254 } 3255 3256 ArrayRef<MCPhysReg> 3257 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { 3258 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2); 3259 } 3260 3261 ArrayRef<MCPhysReg> 3262 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { 3263 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); 3264 } 3265 3266 unsigned 3267 SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, 3268 unsigned SubReg) const { 3269 switch (RC->TSFlags & SIRCFlags::RegKindMask) { 3270 case SIRCFlags::HasSGPR: 3271 return std::min(128u, getSubRegIdxSize(SubReg)); 3272 case SIRCFlags::HasAGPR: 3273 case SIRCFlags::HasVGPR: 3274 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR: 3275 return std::min(32u, getSubRegIdxSize(SubReg)); 3276 default: 3277 break; 3278 } 3279 return 0; 3280 } 3281