//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"

static cl::opt<bool> EnableSpillSGPRToVGPR(
    "amdgpu-spill-sgpr-to-vgpr",
    cl::desc("Enable spilling SGPRs to VGPRs"),
    cl::ReallyHidden,
    cl::init(true));

std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;

// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};

namespace llvm {

// A temporary struct to spill SGPRs.
// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
// just v_writelane and v_readlane.
//
// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
// is saved to scratch (or the other way around for loads).
// For this, a VGPR is required where the needed lanes can be clobbered. The
// RegScavenger can provide a VGPR where currently active lanes can be
// clobbered, but we still need to save inactive lanes.
// The high-level steps are:
// - Try to scavenge SGPR(s) to save exec
// - Try to scavenge VGPR
// - Save needed, all or inactive lanes of a TmpVGPR
// - Spill/Restore SGPRs using TmpVGPR
// - Restore TmpVGPR
//
// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
// cannot scavenge temporary SGPRs to save exec, we use the following code:
// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
// s_not exec, exec
// buffer_store_dword TmpVGPR ; save inactive lanes
// s_not exec, exec
struct SGPRSpillBuilder {
  struct PerVGPRData {
    unsigned PerVGPR;
    unsigned NumVGPRs;
    int64_t VGPRLanes;
  };

  // The SGPR to save
  Register SuperReg;
  MachineBasicBlock::iterator MI;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  bool IsKill;
  const DebugLoc &DL;

  /* When spilling to stack */
  // The SGPRs are written into this VGPR, which is then written to scratch
  // (or vice versa for loads).
  Register TmpVGPR = AMDGPU::NoRegister;
  // Temporary spill slot to save TmpVGPR to.
  int TmpVGPRIndex = 0;
  // If TmpVGPR is live before the spill or if it is scavenged.
  bool TmpVGPRLive = false;
  // Scavenged SGPR to save EXEC.
  Register SavedExecReg = AMDGPU::NoRegister;
  // Stack index to write the SGPRs to.
  int Index;
  unsigned EltSize = 4;

  RegScavenger *RS;
  MachineBasicBlock *MBB;
  MachineFunction &MF;
  SIMachineFunctionInfo &MFI;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  bool IsWave32;
  Register ExecReg;
  unsigned MovOpc;
  unsigned NotOpc;

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, int Index,
                   RegScavenger *RS)
      : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
                         MI->getOperand(0).isKill(), Index, RS) {}

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
                   bool IsKill, int Index, RegScavenger *RS)
      : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
        Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
        MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        IsWave32(IsWave32) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    if (IsWave32) {
      ExecReg = AMDGPU::EXEC_LO;
      MovOpc = AMDGPU::S_MOV_B32;
      NotOpc = AMDGPU::S_NOT_B32;
    } else {
      ExecReg = AMDGPU::EXEC;
      MovOpc = AMDGPU::S_MOV_B64;
      NotOpc = AMDGPU::S_NOT_B64;
    }

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
    assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
           SuperReg != AMDGPU::EXEC && "exec should never spill");
  }

  PerVGPRData getPerVGPRData() {
    PerVGPRData Data;
    Data.PerVGPR = IsWave32 ? 32 : 64;
    Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
    Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
    return Data;
  }

  // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
  // free.
  // Writes these instructions if an SGPR can be scavenged:
  // s_mov_b64 s[6:7], exec   ; Save exec
  // s_mov_b64 exec, 3        ; Wanted lanemask
  // buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_store_dword v0    ; Only if no free VGPR was found
  // s_not_b64 exec, exec
  // buffer_store_dword v0    ; Save inactive lanes
  //                          ; exec stays inverted, it is flipped back in
  //                          ; restore.
  void prepare() {
    // Scavenged temporary VGPR to use. It must be scavenged once for any number
    // of spilled subregs.
    // FIXME: The liveness analysis is limited and does not tell if a register
    // is in use in lanes that are currently inactive. We can never be sure if
    // a register is actually in use in another lane, so we need to save all
    // used lanes of the chosen VGPR.
    assert(RS && "Cannot spill SGPR to memory without RegScavenger");
    TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
                                            0, false);

    // Reserve temporary stack slot
    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
    if (TmpVGPR) {
      // Found a register that is dead in the currently active lanes, we only
      // need to spill inactive lanes.
      TmpVGPRLive = false;
    } else {
      // Pick v0 because it doesn't make a difference.
      TmpVGPR = AMDGPU::VGPR0;
      TmpVGPRLive = true;
    }

    if (TmpVGPRLive) {
      // We need to inform the scavenger that this index is already in use
      // until we're done with the custom emergency spill.
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
    }

    // We may end up recursively calling the scavenger, and don't want to
    // re-use the same register.
    RS->setRegUsed(TmpVGPR);

    // Try to scavenge SGPRs to save exec
    assert(!SavedExecReg && "Exec is already saved, refuse to save again");
    const TargetRegisterClass &RC =
        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    RS->setRegUsed(SuperReg);
    SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);

    int64_t VGPRLanes = getPerVGPRData().VGPRLanes;

    if (SavedExecReg) {
      RS->setRegUsed(SavedExecReg);
      // Set exec to needed lanes
      BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
      auto I =
          BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    } else {
      // Modifying and restoring exec clobbers SCC, which we would have to save
      // and restore. FIXME: We probably would need to reserve a register for
      // this.
      if (RS->isRegUsed(AMDGPU::SCC))
        MI->emitError("unhandled SGPR spill to memory");

      // Spill active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
                                    /*IsKill*/ false);
      // Spill inactive lanes
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    }
  }

  // Writes these instructions if an SGPR can be scavenged:
  // buffer_load_dword v1     ; Read scavenged VGPR back from emergency slot
  // s_waitcnt vmcnt(0)       ; If a free VGPR was found
  // s_mov_b64 exec, s[6:7]   ; Restore exec
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_load_dword v0     ; Restore inactive lanes
  // s_waitcnt vmcnt(0)       ; If a free VGPR was found
  // s_not_b64 exec, exec
  // buffer_load_dword v0     ; Only if no free VGPR was found
  void restore() {
    if (SavedExecReg) {
      // Restore used lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      // Restore exec
      auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
                   .addReg(SavedExecReg, RegState::Kill);
      // Add an implicit use of the load so it is not dead.
      // FIXME: This inserts an unnecessary waitcnt
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      }
    } else {
      // Restore inactive lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.

      // Restore active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
    }

    // Inform the scavenger where we're releasing our custom scavenged register.
    if (TmpVGPRLive) {
      MachineBasicBlock::iterator RestorePt = std::prev(MI);
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
    }
  }

  // Write TmpVGPR to memory or read TmpVGPR from memory.
  // Either using a single buffer_load/store if exec is set to the needed mask
  // or using
  // buffer_load
  // s_not exec, exec
  // buffer_load
  // s_not exec, exec
  void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
    if (SavedExecReg) {
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
    } else {
      // Modifying and restoring exec clobbers SCC, which we would have to save
      // and restore. FIXME: We probably would need to reserve a register for
      // this.
      if (RS->isRegUsed(AMDGPU::SCC))
        MI->emitError("unhandled SGPR spill to memory");

      // Spill active lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
                                  /*IsKill*/ false);
      // Spill inactive lanes
      auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
      auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
    }
  }

  void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
    assert(MBB->getParent() == &MF);
    MI = NewMI;
    MBB = NewMBB;
  }
};

} // namespace llvm

SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
      SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {

  assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
         getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
         (getSubRegIndexLaneMask(AMDGPU::lo16) |
          getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
             getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
         "getNumCoveredRegs() will not work with generated subreg masks!");

  RegPressureIgnoredUnits.resize(getNumRegUnits());
  RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
  for (auto Reg : AMDGPU::VGPR_HI16RegClass)
    RegPressureIgnoredUnits.set(*regunits(Reg).begin());

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegSplitPartsFlag;

  static auto InitializeRegSplitPartsOnce = [this]() {
    for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
      unsigned Size = getSubRegIdxSize(Idx);
      if (Size & 31)
        continue;
      std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
      unsigned Pos = getSubRegIdxOffset(Idx);
      if (Pos % Size)
        continue;
      Pos /= Size;
      if (Vec.empty()) {
        unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
        Vec.resize(MaxNumParts);
      }
      Vec[Pos] = Idx;
    }
  };

  static llvm::once_flag InitializeSubRegFromChannelTableFlag;

  static auto InitializeSubRegFromChannelTableOnce = [this]() {
    for (auto &Row : SubRegFromChannelTable)
      Row.fill(AMDGPU::NoSubRegister);
    for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
      unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
      unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
      assert(Width < SubRegFromChannelTableWidthMap.size());
      Width = SubRegFromChannelTableWidthMap[Width];
      if (Width == 0)
        continue;
      unsigned TableIdx = Width - 1;
      assert(TableIdx < SubRegFromChannelTable.size());
      assert(Offset < SubRegFromChannelTable[TableIdx].size());
      SubRegFromChannelTable[TableIdx][Offset] = Idx;
    }
  };

  llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
  llvm::call_once(InitializeSubRegFromChannelTableFlag,
                  InitializeSubRegFromChannelTableOnce);
}

void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
                                           MCRegister Reg) const {
  for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
    Reserved.set(*R);
}

// Forced to be here by one .inc
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
    const MachineFunction *MF) const {
  CallingConv::ID CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SaveList;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SI_Gfx_SaveList;
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CSR_AMDGPU_CS_ChainPreserve_SaveList;
  default: {
    // Dummy to not crash RegisterClassInfo.
    static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
    return &NoCalleeSavedReg;
  }
  }
}

const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
  return nullptr;
}

const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                     CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
                               : CSR_AMDGPU_RegMask;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
                               : CSR_AMDGPU_SI_Gfx_RegMask;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    // Calls to these functions never return, so we can pretend everything is
    // preserved.
    return AMDGPU_AllVGPRs_RegMask;
  default:
    return nullptr;
  }
}

const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
  return CSR_AMDGPU_NoRegs_RegMask;
}

bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
  return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
}

const TargetRegisterClass *
SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
                                          const MachineFunction &MF) const {
  // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
  // equivalent AV class. If one were used here, the verifier would crash after
  // RegBankSelect in the GISel flow, because the aligned register classes are
  // not fully assigned until instruction selection.
  if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
    if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
      return &AMDGPU::AV_32RegClass;
    if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
      return &AMDGPU::AV_64RegClass;
    if (RC == &AMDGPU::VReg_64_Align2RegClass ||
        RC == &AMDGPU::AReg_64_Align2RegClass)
      return &AMDGPU::AV_64_Align2RegClass;
    if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
      return &AMDGPU::AV_96RegClass;
    if (RC == &AMDGPU::VReg_96_Align2RegClass ||
        RC == &AMDGPU::AReg_96_Align2RegClass)
      return &AMDGPU::AV_96_Align2RegClass;
    if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
      return &AMDGPU::AV_128RegClass;
    if (RC == &AMDGPU::VReg_128_Align2RegClass ||
        RC == &AMDGPU::AReg_128_Align2RegClass)
      return &AMDGPU::AV_128_Align2RegClass;
    if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
      return &AMDGPU::AV_160RegClass;
    if (RC == &AMDGPU::VReg_160_Align2RegClass ||
        RC == &AMDGPU::AReg_160_Align2RegClass)
      return &AMDGPU::AV_160_Align2RegClass;
    if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
      return &AMDGPU::AV_192RegClass;
    if (RC == &AMDGPU::VReg_192_Align2RegClass ||
        RC == &AMDGPU::AReg_192_Align2RegClass)
      return &AMDGPU::AV_192_Align2RegClass;
    if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
      return &AMDGPU::AV_256RegClass;
    if (RC == &AMDGPU::VReg_256_Align2RegClass ||
        RC == &AMDGPU::AReg_256_Align2RegClass)
      return &AMDGPU::AV_256_Align2RegClass;
    if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
      return &AMDGPU::AV_512RegClass;
    if (RC == &AMDGPU::VReg_512_Align2RegClass ||
        RC == &AMDGPU::AReg_512_Align2RegClass)
      return &AMDGPU::AV_512_Align2RegClass;
    if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
      return &AMDGPU::AV_1024RegClass;
    if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
        RC == &AMDGPU::AReg_1024_Align2RegClass)
      return &AMDGPU::AV_1024_Align2RegClass;
  }

  return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
}

Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
  const SIFrameLowering *TFI = ST.getFrameLowering();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // During ISel lowering we always reserve the stack pointer in entry and chain
  // functions, but never actually want to reference it when accessing our own
  // frame. If we need a frame pointer we use it, but otherwise we can just use
  // an immediate "0" which we represent by returning NoRegister.
  if (FuncInfo->isBottomOfStack()) {
    return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
  }
  return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
                        : FuncInfo->getStackPtrOffsetReg();
}

bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
  // When we need stack realignment, we can't reference off of the
  // stack pointer, so we reserve a base pointer.
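  // A base pointer is only needed when there are fixed (incoming) stack
  // objects to address: they live relative to the incoming stack base, which
  // can no longer be recomputed once the stack pointer has been realigned.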
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.getNumFixedObjects() && shouldRealignStack(MF);
}

Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }

const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
  return AMDGPU_AllVGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
  return AMDGPU_AllAGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
  return AMDGPU_AllVectorRegs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
  return AMDGPU_AllAllocatableSRegs_RegMask;
}

unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
                                              unsigned NumRegs) {
  assert(NumRegs < SubRegFromChannelTableWidthMap.size());
  unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
  assert(NumRegIndex && "Not implemented");
  assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
  return SubRegFromChannelTable[NumRegIndex - 1][Channel];
}

MCRegister
SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
                                        const unsigned Align,
                                        const TargetRegisterClass *RC) const {
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
  MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
}

MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
    const MachineFunction &MF) const {
  return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
}

BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::MODE);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // Reserve special purpose registers.
  //
  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in to a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated.
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);

  // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
  // will result in bugs.
  if (isWave32) {
    Reserved.set(AMDGPU::VCC);
    Reserved.set(AMDGPU::VCC_HI);
  }

  // Reserve SGPRs.
  //
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  Register ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
  }

  Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
  if (LongBranchReservedReg)
    reserveRegisterTuples(Reserved, LongBranchReservedReg);

  // We have to assume the SP is needed in case there are calls in the function,
  // which is detected after the function is lowered. If we aren't really going
  // to need the SP, don't bother reserving it.
  MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
  if (StackPtrReg) {
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
  }

  MCRegister FrameReg = MFI->getFrameOffsetReg();
  if (FrameReg) {
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

  if (hasBasePointer(MF)) {
    MCRegister BasePtrReg = getBaseRegister();
    reserveRegisterTuples(Reserved, BasePtrReg);
    assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
  }

  // FIXME: Use the same reserved register introduced in D149775.
  // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
  Register ExecCopyReg = MFI->getSGPRForEXECCopy();
  if (ExecCopyReg)
    reserveRegisterTuples(Reserved, ExecCopyReg);

  // Reserve VGPRs/AGPRs.
  //
  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned MaxNumAGPRs = MaxNumVGPRs;
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();

  // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
  // a wave may have up to 512 total vector registers combining together both
  // VGPRs and AGPRs. Hence, in an entry function without calls and without
  // AGPRs used within it, it is possible to use the whole vector register
  // budget for VGPRs.
  //
  // TODO: It should be possible to estimate maximum AGPR/VGPR pressure and
  // split the register file accordingly.
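  //
  // For example (illustrative numbers; the real budget depends on the
  // subtarget and occupancy): with a combined budget of 256 vector registers,
  // a function that uses AGPRs is split into 128 VGPRs + 128 AGPRs, while a
  // function that never touches AGPRs can use all 256 as VGPRs.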
  if (ST.hasGFX90AInsts()) {
    if (MFI->usesAGPRs(MF)) {
      MaxNumVGPRs /= 2;
      MaxNumAGPRs = MaxNumVGPRs;
    } else {
      if (MaxNumVGPRs > TotalNumVGPRs) {
        MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
        MaxNumVGPRs = TotalNumVGPRs;
      } else
        MaxNumAGPRs = 0;
    }
  }

  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  if (ST.hasMAIInsts()) {
    for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) {
      unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
      reserveRegisterTuples(Reserved, Reg);
    }
  } else {
    // Reserve all the AGPRs if there are no instructions to use them.
    for (MCRegister Reg : AMDGPU::AGPR_32RegClass)
      reserveRegisterTuples(Reserved, Reg);
  }

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
  }

  for (Register Reg : MFI->getWWMReservedRegs())
    reserveRegisterTuples(Reserved, Reg);

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  return Reserved;
}

bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
                                      MCRegister PhysReg) const {
  return !MF.getRegInfo().isReserved(PhysReg);
}

bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry or in chain functions, the base address is 0, so it can't possibly
  // need any more alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isBottomOfStack())
    return false;

  return TargetRegisterInfo::shouldRealignStack(MF);
}

bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}

bool SIRegisterInfo::requiresFrameIndexScavenging(
    const MachineFunction &MF) const {
  // Do not use frame virtual registers. They used to be used for SGPRs, but
  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
  // spill.
  return false;
}

bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
    const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasStackObjects();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
    const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}

int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return 0;

  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::vaddr) ||
          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                             AMDGPU::OpName::saddr))) &&
         "Should never see frame index on non-address operand");

  return getScratchInstrOffset(MI);
}

bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t FullOffset = Offset + getScratchInstrOffset(MI);

  const SIInstrInfo *TII = ST.getInstrInfo();
  if (SIInstrInfo::isMUBUF(*MI))
    return !TII->isLegalMUBUFImmOffset(FullOffset);

  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                 SIInstrFlags::FlatScratch);
}

Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                      int FrameIdx,
                                                      int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
                                           : AMDGPU::V_MOV_B32_e32;

  Register BaseReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
                             : &AMDGPU::VGPR_32RegClass);

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
        .addFrameIndex(FrameIdx);
    return BaseReg;
  }

  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register FIReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
                             : &AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
      .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
      .addFrameIndex(FrameIdx);

  if (ST.enableFlatScratch()) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
        .addReg(OffsetReg, RegState::Kill)
        .addReg(FIReg);
    return BaseReg;
  }

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
      .addReg(OffsetReg, RegState::Kill)
      .addReg(FIReg)
      .addImm(0); // clamp bit

  return BaseReg;
}

void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                       int64_t Offset) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool IsFlat = TII->isFLATScratch(MI);

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp =
      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
                                      : AMDGPU::OpName::vaddr);

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;

  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));

  if (IsFlat) {
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                  SIInstrFlags::FlatScratch) &&
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

#ifndef NDEBUG
  MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  assert(SOffset->isImm() && SOffset->getImm() == 0);
#endif

  assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}

bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        Register BaseReg,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t NewOffset = Offset + getScratchInstrOffset(MI);

  const SIInstrInfo *TII = ST.getInstrInfo();
  if (SIInstrInfo::isMUBUF(*MI))
    return TII->isLegalMUBUFImmOffset(NewOffset);

  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                SIInstrFlags::FlatScratch);
}

const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
    const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}

const TargetRegisterClass *
SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
  if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
    return getEquivalentVGPRClass(RC);
  if (RC == &AMDGPU::SCC_CLASSRegClass)
    return getWaveMaskRegClass();

  return RC;
}

static unsigned getNumSubRegsForSpillOp(unsigned Op) {

  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
  case AMDGPU::SI_SPILL_AV1024_SAVE:
  case AMDGPU::SI_SPILL_AV1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_AV512_SAVE:
  case AMDGPU::SI_SPILL_AV512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S384_SAVE:
  case AMDGPU::SI_SPILL_S384_RESTORE:
  case AMDGPU::SI_SPILL_V384_SAVE:
  case AMDGPU::SI_SPILL_V384_RESTORE:
  case AMDGPU::SI_SPILL_A384_SAVE:
  case AMDGPU::SI_SPILL_A384_RESTORE:
  case AMDGPU::SI_SPILL_AV384_SAVE:
  case AMDGPU::SI_SPILL_AV384_RESTORE:
    return 12;
  case AMDGPU::SI_SPILL_S352_SAVE:
  case AMDGPU::SI_SPILL_S352_RESTORE:
  case AMDGPU::SI_SPILL_V352_SAVE:
  case AMDGPU::SI_SPILL_V352_RESTORE:
  case AMDGPU::SI_SPILL_A352_SAVE:
  case AMDGPU::SI_SPILL_A352_RESTORE:
  case AMDGPU::SI_SPILL_AV352_SAVE:
  case AMDGPU::SI_SPILL_AV352_RESTORE:
    return 11;
  case AMDGPU::SI_SPILL_S320_SAVE:
  case AMDGPU::SI_SPILL_S320_RESTORE:
  case AMDGPU::SI_SPILL_V320_SAVE:
  case AMDGPU::SI_SPILL_V320_RESTORE:
  case AMDGPU::SI_SPILL_A320_SAVE:
  case AMDGPU::SI_SPILL_A320_RESTORE:
  case AMDGPU::SI_SPILL_AV320_SAVE:
  case AMDGPU::SI_SPILL_AV320_RESTORE:
    return 10;
  case AMDGPU::SI_SPILL_S288_SAVE:
  case AMDGPU::SI_SPILL_S288_RESTORE:
  case AMDGPU::SI_SPILL_V288_SAVE:
  case AMDGPU::SI_SPILL_V288_RESTORE:
  case AMDGPU::SI_SPILL_A288_SAVE:
  case AMDGPU::SI_SPILL_A288_RESTORE:
  case AMDGPU::SI_SPILL_AV288_SAVE:
  case AMDGPU::SI_SPILL_AV288_RESTORE:
    return 9;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
  case AMDGPU::SI_SPILL_AV256_SAVE:
  case AMDGPU::SI_SPILL_AV256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_V224_SAVE:
  case AMDGPU::SI_SPILL_V224_RESTORE:
  case AMDGPU::SI_SPILL_A224_SAVE:
  case AMDGPU::SI_SPILL_A224_RESTORE:
  case AMDGPU::SI_SPILL_AV224_SAVE:
  case AMDGPU::SI_SPILL_AV224_RESTORE:
    return 7;
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
  case AMDGPU::SI_SPILL_AV192_SAVE:
  case AMDGPU::SI_SPILL_AV192_RESTORE:
    return 6;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
  case AMDGPU::SI_SPILL_AV160_SAVE:
  case AMDGPU::SI_SPILL_AV160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_AV128_SAVE:
  case AMDGPU::SI_SPILL_AV128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
  case AMDGPU::SI_SPILL_AV96_SAVE:
  case AMDGPU::SI_SPILL_AV96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_AV64_SAVE:
  case AMDGPU::SI_SPILL_AV64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_AV32_SAVE:
  case AMDGPU::SI_SPILL_AV32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_V32_SAVE:
  case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
  case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffenMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
  default:
    return -1;
  }
}

static int getOffenMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
  default:
    return -1;
  }
}

static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           int Index, unsigned Lane,
                                           unsigned ValueReg, bool IsKill) {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  bool IsVGPR = TRI->isVGPR(MRI, Reg);
  DebugLoc DL = MI->getDebugLoc();
  if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
    // During regalloc, the spiller may restore a spilled register to its
    // superclass. That can leave an AGPR spill restored to a VGPR (or the
    // other way around), so src and dst have identical register classes at
    // this point; a plain copy is all that is needed in such cases.
    auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
                       .addReg(Src, getKillRegState(IsKill));
    CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    return CopyMIB;
  }
  unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
                                    : AMDGPU::V_ACCVGPR_READ_B32_e64;

  auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
                 .addReg(Src, getKillRegState(IsKill));
  MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
  return MIB;
}

// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // cpol
          .addImm(0) // swz
          .cloneMemRefs(*MI);

  const MachineOperand *VDataIn =
      TII->getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}

static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
                                          unsigned LoadStoreOp,
                                          unsigned EltSize) {
  bool IsStore = TII->get(LoadStoreOp).mayStore();
  bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
  bool UseST =
      !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);

  switch (EltSize) {
  case 4:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
    break;
  case 8:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
    break;
  case 12:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
    break;
  case 16:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
    break;
  default:
    llvm_unreachable("Unexpected spill load/store size!");
  }

  if (HasVAddr)
    LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
  else if (UseST)
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);

  return LoadStoreOp;
}

void SIRegisterInfo::buildSpillLoadStore(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
    unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
    MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
    RegScavenger *RS, LiveRegUnits *LiveUnits) const {
  assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");

  MachineFunction *MF = MBB.getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
  bool IsStore = Desc->mayStore();
  bool IsFlat = TII->isFLATScratch(LoadStoreOp);

  bool CanClobberSCC = false;
  bool Scavenged = false;
  MCRegister SOffset = ScratchOffsetReg;

  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
  const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
  const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;

  // Always use 4 byte operations for AGPRs because we need to scavenge
  // a temporary VGPR.
  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  unsigned NumSubRegs = RegWidth / EltSize;
  unsigned Size = NumSubRegs * EltSize;
  unsigned RemSize = RegWidth - Size;
  unsigned NumRemSubRegs = RemSize ? 1 : 0;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  int64_t MaterializedOffset = Offset;

  int64_t MaxOffset = Offset + Size + RemSize - EltSize;
  int64_t ScratchOffsetRegDelta = 0;

  if (IsFlat && EltSize > 4) {
    LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    Desc = &TII->get(LoadStoreOp);
  }

  Align Alignment = MFI.getObjectAlign(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  assert((IsFlat || ((Offset % EltSize) == 0)) &&
         "unexpected VGPR spill offset");

  // Track a VGPR to use for a constant offset we need to materialize.
  Register TmpOffsetVGPR;

  // Track a VGPR to use as an intermediate value.
  Register TmpIntermediateVGPR;
  bool UseVGPROffset = false;

  // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
  // combination.
  auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
                                int64_t VOffset) {
    // We are using a VGPR offset
    if (IsFlat && SGPRBase) {
      // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
      // SGPR, so perform the add as vector.
      // We don't need a base SGPR in the kernel.
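      //
      // Where V_ADD_U32_e64 can read both an SGPR and a literal in a single
      // instruction (constant bus limit of at least 2), the add below
      // suffices on its own; otherwise the SGPR base is first copied into the
      // temporary VGPR and the literal is added with V_ADD_U32_e32.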

      if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
            .addReg(SGPRBase)
            .addImm(VOffset)
            .addImm(0); // clamp
      } else {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
            .addReg(SGPRBase);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
            .addImm(VOffset)
            .addReg(TmpOffsetVGPR);
      }
    } else {
      assert(TmpOffsetVGPR);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addImm(VOffset);
    }
  };

  bool IsOffsetLegal =
      IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                      SIInstrFlags::FlatScratch)
             : TII->isLegalMUBUFImmOffset(MaxOffset);
  if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
    SOffset = MCRegister();

    // We don't have access to the register scavenger if this function is
    // called during PEI::scavengeFrameVirtualRegs(), so use LiveUnits in this
    // case.
    // TODO: Clobbering SCC is not necessary for scratch instructions in the
    // entry.
    if (RS) {
      SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI,
                                              false, 0, false);

      // Piggy-back on the liveness scan we just did to see if SCC is dead.
      CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
    } else if (LiveUnits) {
      CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
      for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
        if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
          SOffset = Reg;
          break;
        }
      }
    }

    if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
      SOffset = Register();

    if (!SOffset) {
      UseVGPROffset = true;

      if (RS) {
        TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                      MI, false, 0);
      } else {
        assert(LiveUnits);
        for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
          if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
            TmpOffsetVGPR = Reg;
            break;
          }
        }
      }

      assert(TmpOffsetVGPR);
    } else if (!SOffset && CanClobberSCC) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI, and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset or StackPtrOffset
      // register, and then subtract the offset after the spill to return the
      // register to its original value.

      // TODO: If we don't have to do an emergency stack slot spill, converting
      // to use the VGPR offset is fewer instructions.
      if (!ScratchOffsetReg)
        ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
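    // For example (illustrative): on a wave64 subtarget, a per-lane offset of
    // 16 bytes corresponds to byte 16 * 64 = 1024 of the unswizzled scratch
    // backing store, which is the unit the MUBUF SGPR offset is expressed in.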
    if (!IsFlat && !UseVGPROffset)
      Offset *= ST.getWavefrontSize();

    if (!UseVGPROffset && !SOffset)
      report_fatal_error("could not scavenge SGPR to spill in entry function");

    if (UseVGPROffset) {
      // We are using a VGPR offset
      MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
    } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
    } else {
      assert(Offset != 0);
      auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
                     .addReg(ScratchOffsetReg)
                     .addImm(Offset);
      Add->getOperand(3).setIsDead(); // Mark SCC as dead.
    }

    Offset = 0;
  }

  if (IsFlat && SOffset == AMDGPU::NoRegister) {
    assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
           "Unexpected vaddr for flat scratch with a FI operand");

    if (UseVGPROffset) {
      LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
    } else {
      assert(ST.hasFlatScratchSTMode());
      LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
    }

    Desc = &TII->get(LoadStoreOp);
  }

  for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
       ++i, RegOffset += EltSize) {
    if (i == NumSubRegs) {
      EltSize = RemSize;
      LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    }
    Desc = &TII->get(LoadStoreOp);

    if (!IsFlat && UseVGPROffset) {
      int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
                                   : getOffenMUBUFLoad(LoadStoreOp);
      Desc = &TII->get(NewLoadStoreOp);
    }

    if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
      // If we are spilling an AGPR beyond the range of the memory instruction
      // offset and need to use a VGPR offset, we ideally have at least 2
      // scratch VGPRs. If we don't have a second free VGPR without spilling,
      // recycle the VGPR used for the offset, which requires resetting it
      // after each subregister.

      MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
    }

    unsigned NumRegs = EltSize / 4;
    Register SubReg =
        e == 1 ? ValueReg
               : Register(getSubReg(
                     ValueReg, getSubRegFromChannel(RegOffset / 4, NumRegs)));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    const bool IsLastSubReg = i + 1 == e;
    const bool IsFirstSubReg = i == 0;
    if (IsLastSubReg) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    // Make sure the whole register is defined if there are undef components by
    // adding an implicit def of the super-reg on the first instruction.
    bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
    bool NeedSuperRegImpOperand = e > 1;

    // Remaining element size to spill into memory after some parts of it were
    // spilled into either AGPRs or VGPRs.
    unsigned RemEltSize = EltSize;

    // AGPRs to spill VGPRs and vice versa are allocated in a reverse order,
    // starting from the last lane. If a register cannot be completely spilled
    // into another register, this ensures its alignment does not change.
For targets with VGPR alignment requirement this is important 1559 // in case of flat scratch usage as we might get a scratch_load or 1560 // scratch_store of an unaligned register otherwise. 1561 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS, 1562 LaneE = RegOffset / 4; 1563 Lane >= LaneE; --Lane) { 1564 bool IsSubReg = e > 1 || EltSize > 4; 1565 Register Sub = IsSubReg 1566 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane))) 1567 : ValueReg; 1568 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill); 1569 if (!MIB.getInstr()) 1570 break; 1571 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) { 1572 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1573 NeedSuperRegDef = false; 1574 } 1575 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) { 1576 NeedSuperRegImpOperand = true; 1577 unsigned State = SrcDstRegState; 1578 if (!IsLastSubReg || (Lane != LaneE)) 1579 State &= ~RegState::Kill; 1580 if (!IsFirstSubReg || (Lane != LaneS)) 1581 State &= ~RegState::Define; 1582 MIB.addReg(ValueReg, RegState::Implicit | State); 1583 } 1584 RemEltSize -= 4; 1585 } 1586 1587 if (!RemEltSize) // Fully spilled into AGPRs. 1588 continue; 1589 1590 if (RemEltSize != EltSize) { // Partially spilled to AGPRs 1591 assert(IsFlat && EltSize > 4); 1592 1593 unsigned NumRegs = RemEltSize / 4; 1594 SubReg = Register(getSubReg(ValueReg, 1595 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1596 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize); 1597 Desc = &TII->get(Opc); 1598 } 1599 1600 unsigned FinalReg = SubReg; 1601 1602 if (IsAGPR) { 1603 assert(EltSize == 4); 1604 1605 if (!TmpIntermediateVGPR) { 1606 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy(); 1607 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR)); 1608 } 1609 if (IsStore) { 1610 auto AccRead = BuildMI(MBB, MI, DL, 1611 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), 1612 TmpIntermediateVGPR) 1613 .addReg(SubReg, getKillRegState(IsKill)); 1614 if (NeedSuperRegDef) 1615 AccRead.addReg(ValueReg, RegState::ImplicitDefine); 1616 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1617 } 1618 SubReg = TmpIntermediateVGPR; 1619 } else if (UseVGPROffset) { 1620 // FIXME: change to scavengeRegisterBackwards() 1621 if (!TmpOffsetVGPR) { 1622 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, 1623 MI, false, 0); 1624 RS->setRegUsed(TmpOffsetVGPR); 1625 } 1626 } 1627 1628 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); 1629 MachineMemOperand *NewMMO = 1630 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, 1631 commonAlignment(Alignment, RegOffset)); 1632 1633 auto MIB = 1634 BuildMI(MBB, MI, DL, *Desc) 1635 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)); 1636 1637 if (UseVGPROffset) { 1638 // For an AGPR spill, we reuse the same temp VGPR for the offset and the 1639 // intermediate accvgpr_write. 
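      // Rough sketch (hypothetical registers, not the exact emitted code) of
      // an AGPR reload that shares one temp VGPR for both purposes:
      //   v_mov_b32 v1, <offset>           ; rematerialized VGPR offset
      //   scratch_load_dword v1, v1, off   ; loaded data overwrites the offset
      //   v_accvgpr_write_b32 a0, v1       ; move the lane back into the AGPR
      // which is why the offset is rebuilt before each subregister above.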
1640 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR)); 1641 } 1642 1643 if (!IsFlat) 1644 MIB.addReg(FuncInfo->getScratchRSrcReg()); 1645 1646 if (SOffset == AMDGPU::NoRegister) { 1647 if (!IsFlat) { 1648 if (UseVGPROffset && ScratchOffsetReg) { 1649 MIB.addReg(ScratchOffsetReg); 1650 } else { 1651 assert(FuncInfo->isBottomOfStack()); 1652 MIB.addImm(0); 1653 } 1654 } 1655 } else { 1656 MIB.addReg(SOffset, SOffsetRegState); 1657 } 1658 MIB.addImm(Offset + RegOffset) 1659 .addImm(0); // cpol 1660 if (!IsFlat) 1661 MIB.addImm(0); // swz 1662 MIB.addMemOperand(NewMMO); 1663 1664 if (!IsAGPR && NeedSuperRegDef) 1665 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1666 1667 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) { 1668 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), 1669 FinalReg) 1670 .addReg(TmpIntermediateVGPR, RegState::Kill); 1671 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1672 } 1673 1674 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg)) 1675 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 1676 1677 // The epilog restore of a wwm-scratch register can cause undesired 1678 // optimization during machine-cp post PrologEpilogInserter if the same 1679 // register was assigned for return value ABI lowering with a COPY 1680 // instruction. As given below, with the epilog reload, the earlier COPY 1681 // appeared to be dead during machine-cp. 1682 // ... 1683 // v0 in WWM operation, needs the WWM spill at prolog/epilog. 1684 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0 1685 // ... 1686 // Epilog block: 1687 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0 1688 // ... 1689 // WWM spill restore to preserve the inactive lanes of v0. 1690 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1 1691 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0 1692 // $exec = S_MOV_B64 killed $sgpr4_sgpr5 1693 // ... 1694 // SI_RETURN implicit $vgpr0 1695 // ... 1696 // To fix it, mark the same reg as a tied op for such restore instructions 1697 // so that it marks a usage for the preceding COPY. 1698 if (!IsStore && MI != MBB.end() && MI->isReturn() && 1699 MI->readsRegister(SubReg, this)) { 1700 MIB.addReg(SubReg, RegState::Implicit); 1701 MIB->tieOperands(0, MIB->getNumOperands() - 1); 1702 } 1703 } 1704 1705 if (ScratchOffsetRegDelta != 0) { 1706 // Subtract the offset we added to the ScratchOffset register. 1707 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1708 .addReg(SOffset) 1709 .addImm(-ScratchOffsetRegDelta); 1710 } 1711 } 1712 1713 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, 1714 int Offset, bool IsLoad, 1715 bool IsKill) const { 1716 // Load/store VGPR 1717 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo(); 1718 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); 1719 1720 Register FrameReg = 1721 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF) 1722 ? getBaseRegister() 1723 : getFrameRegister(SB.MF); 1724 1725 Align Alignment = FrameInfo.getObjectAlign(Index); 1726 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index); 1727 MachineMemOperand *MMO = SB.MF.getMachineMemOperand( 1728 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, 1729 SB.EltSize, Alignment); 1730 1731 if (IsLoad) { 1732 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1733 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1734 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false, 1735 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1736 } else { 1737 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1738 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1739 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill, 1740 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1741 // This only ever adds one VGPR spill 1742 SB.MFI.addToSpilledVGPRs(1); 1743 } 1744 } 1745 1746 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, 1747 RegScavenger *RS, SlotIndexes *Indexes, 1748 LiveIntervals *LIS, bool OnlyToVGPR, 1749 bool SpillToPhysVGPRLane) const { 1750 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1751 1752 ArrayRef<SpilledReg> VGPRSpills = 1753 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) 1754 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); 1755 bool SpillToVGPR = !VGPRSpills.empty(); 1756 if (OnlyToVGPR && !SpillToVGPR) 1757 return false; 1758 1759 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() && 1760 SB.SuperReg != SB.MFI.getFrameOffsetReg())); 1761 1762 if (SpillToVGPR) { 1763 1764 assert(SB.NumSubRegs == VGPRSpills.size() && 1765 "Num of VGPR lanes should be equal to num of SGPRs spilled"); 1766 1767 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1768 Register SubReg = 1769 SB.NumSubRegs == 1 1770 ? SB.SuperReg 1771 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1772 SpilledReg Spill = VGPRSpills[i]; 1773 1774 bool IsFirstSubreg = i == 0; 1775 bool IsLastSubreg = i == SB.NumSubRegs - 1; 1776 bool UseKill = SB.IsKill && IsLastSubreg; 1777 1778 1779 // Mark the "old value of vgpr" input undef only if this is the first sgpr 1780 // spill to this specific vgpr in the first basic block. 1781 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1782 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR) 1783 .addReg(SubReg, getKillRegState(UseKill)) 1784 .addImm(Spill.Lane) 1785 .addReg(Spill.VGPR); 1786 if (Indexes) { 1787 if (IsFirstSubreg) 1788 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1789 else 1790 Indexes->insertMachineInstrInMaps(*MIB); 1791 } 1792 1793 if (IsFirstSubreg && SB.NumSubRegs > 1) { 1794 // We may be spilling a super-register which is only partially defined, 1795 // and need to ensure later spills think the value is defined. 1796 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1797 } 1798 1799 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg)) 1800 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit); 1801 1802 // FIXME: Since this spills to another register instead of an actual 1803 // frame index, we should delete the frame index when all references to 1804 // it are fixed. 1805 } 1806 } else { 1807 SB.prepare(); 1808 1809 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. 1810 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1811 1812 // Per VGPR helper data 1813 auto PVD = SB.getPerVGPRData(); 1814 1815 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1816 unsigned TmpVGPRFlags = RegState::Undef; 1817 1818 // Write sub registers into the VGPR 1819 for (unsigned i = Offset * PVD.PerVGPR, 1820 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1821 i < e; ++i) { 1822 Register SubReg = 1823 SB.NumSubRegs == 1 1824 ? 
SB.SuperReg 1825 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1826 1827 MachineInstrBuilder WriteLane = 1828 BuildMI(*SB.MBB, MI, SB.DL, 1829 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR) 1830 .addReg(SubReg, SubKillState) 1831 .addImm(i % PVD.PerVGPR) 1832 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1833 TmpVGPRFlags = 0; 1834 1835 if (Indexes) { 1836 if (i == 0) 1837 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane); 1838 else 1839 Indexes->insertMachineInstrInMaps(*WriteLane); 1840 } 1841 1842 // There could be undef components of a spilled super register. 1843 // TODO: Can we detect this and skip the spill? 1844 if (SB.NumSubRegs > 1) { 1845 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1846 unsigned SuperKillState = 0; 1847 if (i + 1 == SB.NumSubRegs) 1848 SuperKillState |= getKillRegState(SB.IsKill); 1849 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1850 } 1851 } 1852 1853 // Write out VGPR 1854 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false); 1855 } 1856 1857 SB.restore(); 1858 } 1859 1860 MI->eraseFromParent(); 1861 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1862 1863 if (LIS) 1864 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1865 1866 return true; 1867 } 1868 1869 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, 1870 RegScavenger *RS, SlotIndexes *Indexes, 1871 LiveIntervals *LIS, bool OnlyToVGPR, 1872 bool SpillToPhysVGPRLane) const { 1873 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1874 1875 ArrayRef<SpilledReg> VGPRSpills = 1876 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) 1877 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); 1878 bool SpillToVGPR = !VGPRSpills.empty(); 1879 if (OnlyToVGPR && !SpillToVGPR) 1880 return false; 1881 1882 if (SpillToVGPR) { 1883 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1884 Register SubReg = 1885 SB.NumSubRegs == 1 1886 ? SB.SuperReg 1887 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1888 1889 SpilledReg Spill = VGPRSpills[i]; 1890 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1891 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) 1892 .addReg(Spill.VGPR) 1893 .addImm(Spill.Lane); 1894 if (SB.NumSubRegs > 1 && i == 0) 1895 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1896 if (Indexes) { 1897 if (i == e - 1) 1898 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1899 else 1900 Indexes->insertMachineInstrInMaps(*MIB); 1901 } 1902 } 1903 } else { 1904 SB.prepare(); 1905 1906 // Per VGPR helper data 1907 auto PVD = SB.getPerVGPRData(); 1908 1909 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1910 // Load in VGPR data 1911 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true); 1912 1913 // Unpack lanes 1914 for (unsigned i = Offset * PVD.PerVGPR, 1915 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1916 i < e; ++i) { 1917 Register SubReg = 1918 SB.NumSubRegs == 1 1919 ? 
SB.SuperReg 1920 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1921 1922 bool LastSubReg = (i + 1 == e); 1923 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1924 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) 1925 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1926 .addImm(i); 1927 if (SB.NumSubRegs > 1 && i == 0) 1928 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1929 if (Indexes) { 1930 if (i == e - 1) 1931 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1932 else 1933 Indexes->insertMachineInstrInMaps(*MIB); 1934 } 1935 } 1936 } 1937 1938 SB.restore(); 1939 } 1940 1941 MI->eraseFromParent(); 1942 1943 if (LIS) 1944 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1945 1946 return true; 1947 } 1948 1949 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, 1950 MachineBasicBlock &RestoreMBB, 1951 Register SGPR, RegScavenger *RS) const { 1952 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0, 1953 RS); 1954 SB.prepare(); 1955 // Generate the spill of SGPR to SB.TmpVGPR. 1956 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1957 auto PVD = SB.getPerVGPRData(); 1958 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1959 unsigned TmpVGPRFlags = RegState::Undef; 1960 // Write sub registers into the VGPR 1961 for (unsigned i = Offset * PVD.PerVGPR, 1962 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1963 i < e; ++i) { 1964 Register SubReg = 1965 SB.NumSubRegs == 1 1966 ? SB.SuperReg 1967 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1968 1969 MachineInstrBuilder WriteLane = 1970 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1971 SB.TmpVGPR) 1972 .addReg(SubReg, SubKillState) 1973 .addImm(i % PVD.PerVGPR) 1974 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1975 TmpVGPRFlags = 0; 1976 // There could be undef components of a spilled super register. 1977 // TODO: Can we detect this and skip the spill? 1978 if (SB.NumSubRegs > 1) { 1979 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1980 unsigned SuperKillState = 0; 1981 if (i + 1 == SB.NumSubRegs) 1982 SuperKillState |= getKillRegState(SB.IsKill); 1983 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1984 } 1985 } 1986 // Don't need to write VGPR out. 1987 } 1988 1989 // Restore clobbered registers in the specified restore block. 1990 MI = RestoreMBB.end(); 1991 SB.setMI(&RestoreMBB, MI); 1992 // Generate the restore of SGPR from SB.TmpVGPR. 1993 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1994 // Don't need to load VGPR in. 1995 // Unpack lanes 1996 for (unsigned i = Offset * PVD.PerVGPR, 1997 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1998 i < e; ++i) { 1999 Register SubReg = 2000 SB.NumSubRegs == 1 2001 ? SB.SuperReg 2002 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 2003 bool LastSubReg = (i + 1 == e); 2004 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 2005 SubReg) 2006 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 2007 .addImm(i); 2008 if (SB.NumSubRegs > 1 && i == 0) 2009 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 2010 } 2011 } 2012 SB.restore(); 2013 2014 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 2015 return false; 2016 } 2017 2018 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to 2019 /// a VGPR and the stack slot can be safely eliminated when all other users are 2020 /// handled. 
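/// For example (illustrative MIR, assuming SILowerSGPRSpills already assigned
/// VGPR lanes for %stack.0), a wave-level
///   SI_SPILL_S64_SAVE $sgpr4_sgpr5, %stack.0
/// is rewritten here into two SI_SPILL_S32_TO_VGPR writes, one per 32-bit
/// half, into the reserved lanes of the chosen VGPR.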
2021 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 2022 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, 2023 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const { 2024 switch (MI->getOpcode()) { 2025 case AMDGPU::SI_SPILL_S1024_SAVE: 2026 case AMDGPU::SI_SPILL_S512_SAVE: 2027 case AMDGPU::SI_SPILL_S384_SAVE: 2028 case AMDGPU::SI_SPILL_S352_SAVE: 2029 case AMDGPU::SI_SPILL_S320_SAVE: 2030 case AMDGPU::SI_SPILL_S288_SAVE: 2031 case AMDGPU::SI_SPILL_S256_SAVE: 2032 case AMDGPU::SI_SPILL_S224_SAVE: 2033 case AMDGPU::SI_SPILL_S192_SAVE: 2034 case AMDGPU::SI_SPILL_S160_SAVE: 2035 case AMDGPU::SI_SPILL_S128_SAVE: 2036 case AMDGPU::SI_SPILL_S96_SAVE: 2037 case AMDGPU::SI_SPILL_S64_SAVE: 2038 case AMDGPU::SI_SPILL_S32_SAVE: 2039 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); 2040 case AMDGPU::SI_SPILL_S1024_RESTORE: 2041 case AMDGPU::SI_SPILL_S512_RESTORE: 2042 case AMDGPU::SI_SPILL_S384_RESTORE: 2043 case AMDGPU::SI_SPILL_S352_RESTORE: 2044 case AMDGPU::SI_SPILL_S320_RESTORE: 2045 case AMDGPU::SI_SPILL_S288_RESTORE: 2046 case AMDGPU::SI_SPILL_S256_RESTORE: 2047 case AMDGPU::SI_SPILL_S224_RESTORE: 2048 case AMDGPU::SI_SPILL_S192_RESTORE: 2049 case AMDGPU::SI_SPILL_S160_RESTORE: 2050 case AMDGPU::SI_SPILL_S128_RESTORE: 2051 case AMDGPU::SI_SPILL_S96_RESTORE: 2052 case AMDGPU::SI_SPILL_S64_RESTORE: 2053 case AMDGPU::SI_SPILL_S32_RESTORE: 2054 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); 2055 default: 2056 llvm_unreachable("not an SGPR spill instruction"); 2057 } 2058 } 2059 2060 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 2061 int SPAdj, unsigned FIOperandNum, 2062 RegScavenger *RS) const { 2063 MachineFunction *MF = MI->getParent()->getParent(); 2064 MachineBasicBlock *MBB = MI->getParent(); 2065 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 2066 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 2067 const SIInstrInfo *TII = ST.getInstrInfo(); 2068 DebugLoc DL = MI->getDebugLoc(); 2069 2070 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 2071 2072 MachineOperand &FIOp = MI->getOperand(FIOperandNum); 2073 int Index = MI->getOperand(FIOperandNum).getIndex(); 2074 2075 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) 2076 ? 
getBaseRegister() 2077 : getFrameRegister(*MF); 2078 2079 switch (MI->getOpcode()) { 2080 // SGPR register spill 2081 case AMDGPU::SI_SPILL_S1024_SAVE: 2082 case AMDGPU::SI_SPILL_S512_SAVE: 2083 case AMDGPU::SI_SPILL_S384_SAVE: 2084 case AMDGPU::SI_SPILL_S352_SAVE: 2085 case AMDGPU::SI_SPILL_S320_SAVE: 2086 case AMDGPU::SI_SPILL_S288_SAVE: 2087 case AMDGPU::SI_SPILL_S256_SAVE: 2088 case AMDGPU::SI_SPILL_S224_SAVE: 2089 case AMDGPU::SI_SPILL_S192_SAVE: 2090 case AMDGPU::SI_SPILL_S160_SAVE: 2091 case AMDGPU::SI_SPILL_S128_SAVE: 2092 case AMDGPU::SI_SPILL_S96_SAVE: 2093 case AMDGPU::SI_SPILL_S64_SAVE: 2094 case AMDGPU::SI_SPILL_S32_SAVE: { 2095 return spillSGPR(MI, Index, RS); 2096 } 2097 2098 // SGPR register restore 2099 case AMDGPU::SI_SPILL_S1024_RESTORE: 2100 case AMDGPU::SI_SPILL_S512_RESTORE: 2101 case AMDGPU::SI_SPILL_S384_RESTORE: 2102 case AMDGPU::SI_SPILL_S352_RESTORE: 2103 case AMDGPU::SI_SPILL_S320_RESTORE: 2104 case AMDGPU::SI_SPILL_S288_RESTORE: 2105 case AMDGPU::SI_SPILL_S256_RESTORE: 2106 case AMDGPU::SI_SPILL_S224_RESTORE: 2107 case AMDGPU::SI_SPILL_S192_RESTORE: 2108 case AMDGPU::SI_SPILL_S160_RESTORE: 2109 case AMDGPU::SI_SPILL_S128_RESTORE: 2110 case AMDGPU::SI_SPILL_S96_RESTORE: 2111 case AMDGPU::SI_SPILL_S64_RESTORE: 2112 case AMDGPU::SI_SPILL_S32_RESTORE: { 2113 return restoreSGPR(MI, Index, RS); 2114 } 2115 2116 // VGPR register spill 2117 case AMDGPU::SI_SPILL_V1024_SAVE: 2118 case AMDGPU::SI_SPILL_V512_SAVE: 2119 case AMDGPU::SI_SPILL_V384_SAVE: 2120 case AMDGPU::SI_SPILL_V352_SAVE: 2121 case AMDGPU::SI_SPILL_V320_SAVE: 2122 case AMDGPU::SI_SPILL_V288_SAVE: 2123 case AMDGPU::SI_SPILL_V256_SAVE: 2124 case AMDGPU::SI_SPILL_V224_SAVE: 2125 case AMDGPU::SI_SPILL_V192_SAVE: 2126 case AMDGPU::SI_SPILL_V160_SAVE: 2127 case AMDGPU::SI_SPILL_V128_SAVE: 2128 case AMDGPU::SI_SPILL_V96_SAVE: 2129 case AMDGPU::SI_SPILL_V64_SAVE: 2130 case AMDGPU::SI_SPILL_V32_SAVE: 2131 case AMDGPU::SI_SPILL_A1024_SAVE: 2132 case AMDGPU::SI_SPILL_A512_SAVE: 2133 case AMDGPU::SI_SPILL_A384_SAVE: 2134 case AMDGPU::SI_SPILL_A352_SAVE: 2135 case AMDGPU::SI_SPILL_A320_SAVE: 2136 case AMDGPU::SI_SPILL_A288_SAVE: 2137 case AMDGPU::SI_SPILL_A256_SAVE: 2138 case AMDGPU::SI_SPILL_A224_SAVE: 2139 case AMDGPU::SI_SPILL_A192_SAVE: 2140 case AMDGPU::SI_SPILL_A160_SAVE: 2141 case AMDGPU::SI_SPILL_A128_SAVE: 2142 case AMDGPU::SI_SPILL_A96_SAVE: 2143 case AMDGPU::SI_SPILL_A64_SAVE: 2144 case AMDGPU::SI_SPILL_A32_SAVE: 2145 case AMDGPU::SI_SPILL_AV1024_SAVE: 2146 case AMDGPU::SI_SPILL_AV512_SAVE: 2147 case AMDGPU::SI_SPILL_AV384_SAVE: 2148 case AMDGPU::SI_SPILL_AV352_SAVE: 2149 case AMDGPU::SI_SPILL_AV320_SAVE: 2150 case AMDGPU::SI_SPILL_AV288_SAVE: 2151 case AMDGPU::SI_SPILL_AV256_SAVE: 2152 case AMDGPU::SI_SPILL_AV224_SAVE: 2153 case AMDGPU::SI_SPILL_AV192_SAVE: 2154 case AMDGPU::SI_SPILL_AV160_SAVE: 2155 case AMDGPU::SI_SPILL_AV128_SAVE: 2156 case AMDGPU::SI_SPILL_AV96_SAVE: 2157 case AMDGPU::SI_SPILL_AV64_SAVE: 2158 case AMDGPU::SI_SPILL_AV32_SAVE: 2159 case AMDGPU::SI_SPILL_WWM_V32_SAVE: 2160 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: { 2161 const MachineOperand *VData = TII->getNamedOperand(*MI, 2162 AMDGPU::OpName::vdata); 2163 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2164 MFI->getStackPtrOffsetReg()); 2165 2166 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR 2167 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 2168 auto *MBB = MI->getParent(); 2169 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); 2170 if (IsWWMRegSpill) { 2171 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), 2172 RS->isRegUsed(AMDGPU::SCC)); 2173 } 2174 buildSpillLoadStore( 2175 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 2176 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 2177 *MI->memoperands_begin(), RS); 2178 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); 2179 if (IsWWMRegSpill) 2180 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); 2181 2182 MI->eraseFromParent(); 2183 return true; 2184 } 2185 case AMDGPU::SI_SPILL_V32_RESTORE: 2186 case AMDGPU::SI_SPILL_V64_RESTORE: 2187 case AMDGPU::SI_SPILL_V96_RESTORE: 2188 case AMDGPU::SI_SPILL_V128_RESTORE: 2189 case AMDGPU::SI_SPILL_V160_RESTORE: 2190 case AMDGPU::SI_SPILL_V192_RESTORE: 2191 case AMDGPU::SI_SPILL_V224_RESTORE: 2192 case AMDGPU::SI_SPILL_V256_RESTORE: 2193 case AMDGPU::SI_SPILL_V288_RESTORE: 2194 case AMDGPU::SI_SPILL_V320_RESTORE: 2195 case AMDGPU::SI_SPILL_V352_RESTORE: 2196 case AMDGPU::SI_SPILL_V384_RESTORE: 2197 case AMDGPU::SI_SPILL_V512_RESTORE: 2198 case AMDGPU::SI_SPILL_V1024_RESTORE: 2199 case AMDGPU::SI_SPILL_A32_RESTORE: 2200 case AMDGPU::SI_SPILL_A64_RESTORE: 2201 case AMDGPU::SI_SPILL_A96_RESTORE: 2202 case AMDGPU::SI_SPILL_A128_RESTORE: 2203 case AMDGPU::SI_SPILL_A160_RESTORE: 2204 case AMDGPU::SI_SPILL_A192_RESTORE: 2205 case AMDGPU::SI_SPILL_A224_RESTORE: 2206 case AMDGPU::SI_SPILL_A256_RESTORE: 2207 case AMDGPU::SI_SPILL_A288_RESTORE: 2208 case AMDGPU::SI_SPILL_A320_RESTORE: 2209 case AMDGPU::SI_SPILL_A352_RESTORE: 2210 case AMDGPU::SI_SPILL_A384_RESTORE: 2211 case AMDGPU::SI_SPILL_A512_RESTORE: 2212 case AMDGPU::SI_SPILL_A1024_RESTORE: 2213 case AMDGPU::SI_SPILL_AV32_RESTORE: 2214 case AMDGPU::SI_SPILL_AV64_RESTORE: 2215 case AMDGPU::SI_SPILL_AV96_RESTORE: 2216 case AMDGPU::SI_SPILL_AV128_RESTORE: 2217 case AMDGPU::SI_SPILL_AV160_RESTORE: 2218 case AMDGPU::SI_SPILL_AV192_RESTORE: 2219 case AMDGPU::SI_SPILL_AV224_RESTORE: 2220 case AMDGPU::SI_SPILL_AV256_RESTORE: 2221 case AMDGPU::SI_SPILL_AV288_RESTORE: 2222 case AMDGPU::SI_SPILL_AV320_RESTORE: 2223 case AMDGPU::SI_SPILL_AV352_RESTORE: 2224 case AMDGPU::SI_SPILL_AV384_RESTORE: 2225 case AMDGPU::SI_SPILL_AV512_RESTORE: 2226 case AMDGPU::SI_SPILL_AV1024_RESTORE: 2227 case AMDGPU::SI_SPILL_WWM_V32_RESTORE: 2228 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: { 2229 const MachineOperand *VData = TII->getNamedOperand(*MI, 2230 AMDGPU::OpName::vdata); 2231 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2232 MFI->getStackPtrOffsetReg()); 2233 2234 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR 2235 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 2236 auto *MBB = MI->getParent(); 2237 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); 2238 if (IsWWMRegSpill) { 2239 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), 2240 RS->isRegUsed(AMDGPU::SCC)); 2241 } 2242 buildSpillLoadStore( 2243 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 2244 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 2245 *MI->memoperands_begin(), RS); 2246 2247 if (IsWWMRegSpill) 2248 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); 2249 2250 MI->eraseFromParent(); 2251 return true; 2252 } 2253 2254 default: { 2255 // Other access to frame index 2256 const DebugLoc &DL = MI->getDebugLoc(); 2257 2258 int64_t Offset = FrameInfo.getObjectOffset(Index); 2259 if (ST.enableFlatScratch()) { 2260 if (TII->isFLATScratch(*MI)) { 2261 assert((int16_t)FIOperandNum == 2262 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 2263 AMDGPU::OpName::saddr)); 2264 2265 // The offset is always swizzled, just replace it 2266 if (FrameReg) 2267 FIOp.ChangeToRegister(FrameReg, false); 2268 2269 if (!Offset) 2270 return false; 2271 2272 MachineOperand *OffsetOp = 2273 TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 2274 int64_t NewOffset = Offset + OffsetOp->getImm(); 2275 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 2276 SIInstrFlags::FlatScratch)) { 2277 OffsetOp->setImm(NewOffset); 2278 if (FrameReg) 2279 return false; 2280 Offset = 0; 2281 } 2282 2283 if (!Offset) { 2284 unsigned Opc = MI->getOpcode(); 2285 int NewOpc = -1; 2286 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) { 2287 NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc); 2288 } else if (ST.hasFlatScratchSTMode()) { 2289 // On GFX10 we have ST mode to use no registers for an address. 2290 // Otherwise we need to materialize 0 into an SGPR. 2291 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc); 2292 } 2293 2294 if (NewOpc != -1) { 2295 // removeOperand doesn't fixup tied operand indexes as it goes, so 2296 // it asserts. Untie vdst_in for now and retie them afterwards. 2297 int VDstIn = AMDGPU::getNamedOperandIdx(Opc, 2298 AMDGPU::OpName::vdst_in); 2299 bool TiedVDst = VDstIn != -1 && 2300 MI->getOperand(VDstIn).isReg() && 2301 MI->getOperand(VDstIn).isTied(); 2302 if (TiedVDst) 2303 MI->untieRegOperand(VDstIn); 2304 2305 MI->removeOperand( 2306 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr)); 2307 2308 if (TiedVDst) { 2309 int NewVDst = 2310 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); 2311 int NewVDstIn = 2312 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in); 2313 assert (NewVDst != -1 && NewVDstIn != -1 && "Must be tied!"); 2314 MI->tieOperands(NewVDst, NewVDstIn); 2315 } 2316 MI->setDesc(TII->get(NewOpc)); 2317 return false; 2318 } 2319 } 2320 } 2321 2322 if (!FrameReg) { 2323 FIOp.ChangeToImmediate(Offset); 2324 if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) 2325 return false; 2326 } 2327 2328 // We need to use register here. Check if we can use an SGPR or need 2329 // a VGPR. 2330 FIOp.ChangeToRegister(AMDGPU::M0, false); 2331 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp); 2332 2333 if (!Offset && FrameReg && UseSGPR) { 2334 FIOp.setReg(FrameReg); 2335 return false; 2336 } 2337 2338 const TargetRegisterClass *RC = UseSGPR ? 
&AMDGPU::SReg_32_XM0RegClass 2339 : &AMDGPU::VGPR_32RegClass; 2340 2341 Register TmpReg = 2342 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR); 2343 FIOp.setReg(TmpReg); 2344 FIOp.setIsKill(); 2345 2346 if ((!FrameReg || !Offset) && TmpReg) { 2347 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 2348 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg); 2349 if (FrameReg) 2350 MIB.addReg(FrameReg); 2351 else 2352 MIB.addImm(Offset); 2353 2354 return false; 2355 } 2356 2357 bool NeedSaveSCC = 2358 RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC); 2359 2360 Register TmpSReg = 2361 UseSGPR ? TmpReg 2362 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, 2363 MI, false, 0, !UseSGPR); 2364 2365 // TODO: for flat scratch another attempt can be made with a VGPR index 2366 // if no SGPRs can be scavenged. 2367 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) 2368 report_fatal_error("Cannot scavenge register in FI elimination!"); 2369 2370 if (!TmpSReg) { 2371 // Use frame register and restore it after. 2372 TmpSReg = FrameReg; 2373 FIOp.setReg(FrameReg); 2374 FIOp.setIsKill(false); 2375 } 2376 2377 if (NeedSaveSCC) { 2378 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!"); 2379 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg) 2380 .addReg(FrameReg) 2381 .addImm(Offset); 2382 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32)) 2383 .addReg(TmpSReg) 2384 .addImm(0); 2385 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg) 2386 .addImm(0) 2387 .addReg(TmpSReg); 2388 } else { 2389 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg) 2390 .addReg(FrameReg) 2391 .addImm(Offset); 2392 } 2393 2394 if (!UseSGPR) 2395 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2396 .addReg(TmpSReg, RegState::Kill); 2397 2398 if (TmpSReg == FrameReg) { 2399 // Undo frame register modification. 2400 if (NeedSaveSCC && !MI->registerDefIsDead(AMDGPU::SCC)) { 2401 MachineBasicBlock::iterator I = 2402 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32), 2403 TmpSReg) 2404 .addReg(FrameReg) 2405 .addImm(-Offset); 2406 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32)) 2407 .addReg(TmpSReg) 2408 .addImm(0); 2409 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32), 2410 TmpSReg) 2411 .addImm(0) 2412 .addReg(TmpSReg); 2413 } else { 2414 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32), 2415 FrameReg) 2416 .addReg(FrameReg) 2417 .addImm(-Offset); 2418 } 2419 } 2420 2421 return false; 2422 } 2423 2424 bool IsMUBUF = TII->isMUBUF(*MI); 2425 2426 if (!IsMUBUF && !MFI->isBottomOfStack()) { 2427 // Convert to a swizzled stack address by scaling by the wave size. 2428 // In an entry function/kernel the offset is already swizzled. 2429 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); 2430 bool LiveSCC = 2431 RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC); 2432 const TargetRegisterClass *RC = IsSALU && !LiveSCC 2433 ? &AMDGPU::SReg_32RegClass 2434 : &AMDGPU::VGPR_32RegClass; 2435 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 || 2436 MI->getOpcode() == AMDGPU::V_MOV_B32_e64; 2437 Register ResultReg = 2438 IsCopy ? MI->getOperand(0).getReg() 2439 : RS->scavengeRegisterBackwards(*RC, MI, false, 0); 2440 2441 int64_t Offset = FrameInfo.getObjectOffset(Index); 2442 if (Offset == 0) { 2443 unsigned OpCode = IsSALU && !LiveSCC ? 
AMDGPU::S_LSHR_B32 2444 : AMDGPU::V_LSHRREV_B32_e64; 2445 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg); 2446 if (OpCode == AMDGPU::V_LSHRREV_B32_e64) 2447 // For V_LSHRREV, the operands are reversed (the shift count goes 2448 // first). 2449 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg); 2450 else 2451 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2()); 2452 if (IsSALU && !LiveSCC) 2453 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead. 2454 if (IsSALU && LiveSCC) { 2455 Register NewDest = RS->scavengeRegisterBackwards( 2456 AMDGPU::SReg_32RegClass, Shift, false, 0); 2457 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 2458 NewDest) 2459 .addReg(ResultReg); 2460 ResultReg = NewDest; 2461 } 2462 } else { 2463 MachineInstrBuilder MIB; 2464 if (!IsSALU) { 2465 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) != 2466 nullptr) { 2467 // Reuse ResultReg in intermediate step. 2468 Register ScaledReg = ResultReg; 2469 2470 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 2471 ScaledReg) 2472 .addImm(ST.getWavefrontSizeLog2()) 2473 .addReg(FrameReg); 2474 2475 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; 2476 2477 // TODO: Fold if use instruction is another add of a constant. 2478 if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 2479 // FIXME: This can fail 2480 MIB.addImm(Offset); 2481 MIB.addReg(ScaledReg, RegState::Kill); 2482 if (!IsVOP2) 2483 MIB.addImm(0); // clamp bit 2484 } else { 2485 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && 2486 "Need to reuse carry out register"); 2487 2488 // Use scavenged unused carry out as offset register. 2489 Register ConstOffsetReg; 2490 if (!isWave32) 2491 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); 2492 else 2493 ConstOffsetReg = MIB.getReg(1); 2494 2495 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) 2496 .addImm(Offset); 2497 MIB.addReg(ConstOffsetReg, RegState::Kill); 2498 MIB.addReg(ScaledReg, RegState::Kill); 2499 MIB.addImm(0); // clamp bit 2500 } 2501 } 2502 } 2503 if (!MIB || IsSALU) { 2504 // We have to produce a carry out, and there isn't a free SGPR pair 2505 // for it. We can keep the whole computation on the SALU to avoid 2506 // clobbering an additional register at the cost of an extra mov. 2507 2508 // We may have 1 free scratch SGPR even though a carry out is 2509 // unavailable. Only one additional mov is needed. 2510 Register TmpScaledReg = RS->scavengeRegisterBackwards( 2511 AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false); 2512 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; 2513 2514 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) 2515 .addReg(FrameReg) 2516 .addImm(ST.getWavefrontSizeLog2()); 2517 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 2518 .addReg(ScaledReg, RegState::Kill) 2519 .addImm(Offset); 2520 if (!IsSALU) 2521 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) 2522 .addReg(ScaledReg, RegState::Kill); 2523 else 2524 ResultReg = ScaledReg; 2525 2526 // If there were truly no free SGPRs, we need to undo everything. 
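          // Illustrative sequence (hypothetical SGPR, wave64) when FrameReg
          // itself has to serve as the scratch register:
          //   s_lshr_b32 s34, s34, 6         ; unswizzle the frame register in place
          //   s_add_i32  s34, s34, <offset>
          //   ... result copied away first in the non-SALU case ...
          //   s_add_i32  s34, s34, -<offset> ; undo the addition
          //   s_lshl_b32 s34, s34, 6         ; re-swizzle FrameReg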
2527 if (!TmpScaledReg.isValid()) { 2528 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 2529 .addReg(ScaledReg, RegState::Kill) 2530 .addImm(-Offset); 2531 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) 2532 .addReg(FrameReg) 2533 .addImm(ST.getWavefrontSizeLog2()); 2534 } 2535 } 2536 } 2537 2538 // Don't introduce an extra copy if we're just materializing in a mov. 2539 if (IsCopy) { 2540 MI->eraseFromParent(); 2541 return true; 2542 } 2543 FIOp.ChangeToRegister(ResultReg, false, false, true); 2544 return false; 2545 } 2546 2547 if (IsMUBUF) { 2548 // Disable offen so we don't need a 0 vgpr base. 2549 assert(static_cast<int>(FIOperandNum) == 2550 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 2551 AMDGPU::OpName::vaddr)); 2552 2553 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset); 2554 assert((SOffset.isImm() && SOffset.getImm() == 0)); 2555 2556 if (FrameReg != AMDGPU::NoRegister) 2557 SOffset.ChangeToRegister(FrameReg, false); 2558 2559 int64_t Offset = FrameInfo.getObjectOffset(Index); 2560 int64_t OldImm 2561 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); 2562 int64_t NewOffset = OldImm + Offset; 2563 2564 if (TII->isLegalMUBUFImmOffset(NewOffset) && 2565 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { 2566 MI->eraseFromParent(); 2567 return true; 2568 } 2569 } 2570 2571 // If the offset is simply too big, don't convert to a scratch wave offset 2572 // relative index. 2573 2574 FIOp.ChangeToImmediate(Offset); 2575 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { 2576 Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, 2577 MI, false, 0); 2578 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2579 .addImm(Offset); 2580 FIOp.ChangeToRegister(TmpReg, false, false, true); 2581 } 2582 } 2583 } 2584 return false; 2585 } 2586 2587 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { 2588 return AMDGPUInstPrinter::getRegisterName(Reg); 2589 } 2590 2591 unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) { 2592 return getRegBitWidth(RC.getID()); 2593 } 2594 2595 static const TargetRegisterClass * 2596 getAnyVGPRClassForBitWidth(unsigned BitWidth) { 2597 if (BitWidth == 64) 2598 return &AMDGPU::VReg_64RegClass; 2599 if (BitWidth == 96) 2600 return &AMDGPU::VReg_96RegClass; 2601 if (BitWidth == 128) 2602 return &AMDGPU::VReg_128RegClass; 2603 if (BitWidth == 160) 2604 return &AMDGPU::VReg_160RegClass; 2605 if (BitWidth == 192) 2606 return &AMDGPU::VReg_192RegClass; 2607 if (BitWidth == 224) 2608 return &AMDGPU::VReg_224RegClass; 2609 if (BitWidth == 256) 2610 return &AMDGPU::VReg_256RegClass; 2611 if (BitWidth == 288) 2612 return &AMDGPU::VReg_288RegClass; 2613 if (BitWidth == 320) 2614 return &AMDGPU::VReg_320RegClass; 2615 if (BitWidth == 352) 2616 return &AMDGPU::VReg_352RegClass; 2617 if (BitWidth == 384) 2618 return &AMDGPU::VReg_384RegClass; 2619 if (BitWidth == 512) 2620 return &AMDGPU::VReg_512RegClass; 2621 if (BitWidth == 1024) 2622 return &AMDGPU::VReg_1024RegClass; 2623 2624 return nullptr; 2625 } 2626 2627 static const TargetRegisterClass * 2628 getAlignedVGPRClassForBitWidth(unsigned BitWidth) { 2629 if (BitWidth == 64) 2630 return &AMDGPU::VReg_64_Align2RegClass; 2631 if (BitWidth == 96) 2632 return &AMDGPU::VReg_96_Align2RegClass; 2633 if (BitWidth == 128) 2634 return &AMDGPU::VReg_128_Align2RegClass; 2635 if (BitWidth == 160) 2636 return &AMDGPU::VReg_160_Align2RegClass; 2637 if (BitWidth == 192) 2638 return 
&AMDGPU::VReg_192_Align2RegClass; 2639 if (BitWidth == 224) 2640 return &AMDGPU::VReg_224_Align2RegClass; 2641 if (BitWidth == 256) 2642 return &AMDGPU::VReg_256_Align2RegClass; 2643 if (BitWidth == 288) 2644 return &AMDGPU::VReg_288_Align2RegClass; 2645 if (BitWidth == 320) 2646 return &AMDGPU::VReg_320_Align2RegClass; 2647 if (BitWidth == 352) 2648 return &AMDGPU::VReg_352_Align2RegClass; 2649 if (BitWidth == 384) 2650 return &AMDGPU::VReg_384_Align2RegClass; 2651 if (BitWidth == 512) 2652 return &AMDGPU::VReg_512_Align2RegClass; 2653 if (BitWidth == 1024) 2654 return &AMDGPU::VReg_1024_Align2RegClass; 2655 2656 return nullptr; 2657 } 2658 2659 const TargetRegisterClass * 2660 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { 2661 if (BitWidth == 1) 2662 return &AMDGPU::VReg_1RegClass; 2663 if (BitWidth == 16) 2664 return &AMDGPU::VGPR_LO16RegClass; 2665 if (BitWidth == 32) 2666 return &AMDGPU::VGPR_32RegClass; 2667 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth) 2668 : getAnyVGPRClassForBitWidth(BitWidth); 2669 } 2670 2671 static const TargetRegisterClass * 2672 getAnyAGPRClassForBitWidth(unsigned BitWidth) { 2673 if (BitWidth == 64) 2674 return &AMDGPU::AReg_64RegClass; 2675 if (BitWidth == 96) 2676 return &AMDGPU::AReg_96RegClass; 2677 if (BitWidth == 128) 2678 return &AMDGPU::AReg_128RegClass; 2679 if (BitWidth == 160) 2680 return &AMDGPU::AReg_160RegClass; 2681 if (BitWidth == 192) 2682 return &AMDGPU::AReg_192RegClass; 2683 if (BitWidth == 224) 2684 return &AMDGPU::AReg_224RegClass; 2685 if (BitWidth == 256) 2686 return &AMDGPU::AReg_256RegClass; 2687 if (BitWidth == 288) 2688 return &AMDGPU::AReg_288RegClass; 2689 if (BitWidth == 320) 2690 return &AMDGPU::AReg_320RegClass; 2691 if (BitWidth == 352) 2692 return &AMDGPU::AReg_352RegClass; 2693 if (BitWidth == 384) 2694 return &AMDGPU::AReg_384RegClass; 2695 if (BitWidth == 512) 2696 return &AMDGPU::AReg_512RegClass; 2697 if (BitWidth == 1024) 2698 return &AMDGPU::AReg_1024RegClass; 2699 2700 return nullptr; 2701 } 2702 2703 static const TargetRegisterClass * 2704 getAlignedAGPRClassForBitWidth(unsigned BitWidth) { 2705 if (BitWidth == 64) 2706 return &AMDGPU::AReg_64_Align2RegClass; 2707 if (BitWidth == 96) 2708 return &AMDGPU::AReg_96_Align2RegClass; 2709 if (BitWidth == 128) 2710 return &AMDGPU::AReg_128_Align2RegClass; 2711 if (BitWidth == 160) 2712 return &AMDGPU::AReg_160_Align2RegClass; 2713 if (BitWidth == 192) 2714 return &AMDGPU::AReg_192_Align2RegClass; 2715 if (BitWidth == 224) 2716 return &AMDGPU::AReg_224_Align2RegClass; 2717 if (BitWidth == 256) 2718 return &AMDGPU::AReg_256_Align2RegClass; 2719 if (BitWidth == 288) 2720 return &AMDGPU::AReg_288_Align2RegClass; 2721 if (BitWidth == 320) 2722 return &AMDGPU::AReg_320_Align2RegClass; 2723 if (BitWidth == 352) 2724 return &AMDGPU::AReg_352_Align2RegClass; 2725 if (BitWidth == 384) 2726 return &AMDGPU::AReg_384_Align2RegClass; 2727 if (BitWidth == 512) 2728 return &AMDGPU::AReg_512_Align2RegClass; 2729 if (BitWidth == 1024) 2730 return &AMDGPU::AReg_1024_Align2RegClass; 2731 2732 return nullptr; 2733 } 2734 2735 const TargetRegisterClass * 2736 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { 2737 if (BitWidth == 16) 2738 return &AMDGPU::AGPR_LO16RegClass; 2739 if (BitWidth == 32) 2740 return &AMDGPU::AGPR_32RegClass; 2741 return ST.needsAlignedVGPRs() ? 
getAlignedAGPRClassForBitWidth(BitWidth) 2742 : getAnyAGPRClassForBitWidth(BitWidth); 2743 } 2744 2745 static const TargetRegisterClass * 2746 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { 2747 if (BitWidth == 64) 2748 return &AMDGPU::AV_64RegClass; 2749 if (BitWidth == 96) 2750 return &AMDGPU::AV_96RegClass; 2751 if (BitWidth == 128) 2752 return &AMDGPU::AV_128RegClass; 2753 if (BitWidth == 160) 2754 return &AMDGPU::AV_160RegClass; 2755 if (BitWidth == 192) 2756 return &AMDGPU::AV_192RegClass; 2757 if (BitWidth == 224) 2758 return &AMDGPU::AV_224RegClass; 2759 if (BitWidth == 256) 2760 return &AMDGPU::AV_256RegClass; 2761 if (BitWidth == 288) 2762 return &AMDGPU::AV_288RegClass; 2763 if (BitWidth == 320) 2764 return &AMDGPU::AV_320RegClass; 2765 if (BitWidth == 352) 2766 return &AMDGPU::AV_352RegClass; 2767 if (BitWidth == 384) 2768 return &AMDGPU::AV_384RegClass; 2769 if (BitWidth == 512) 2770 return &AMDGPU::AV_512RegClass; 2771 if (BitWidth == 1024) 2772 return &AMDGPU::AV_1024RegClass; 2773 2774 return nullptr; 2775 } 2776 2777 static const TargetRegisterClass * 2778 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { 2779 if (BitWidth == 64) 2780 return &AMDGPU::AV_64_Align2RegClass; 2781 if (BitWidth == 96) 2782 return &AMDGPU::AV_96_Align2RegClass; 2783 if (BitWidth == 128) 2784 return &AMDGPU::AV_128_Align2RegClass; 2785 if (BitWidth == 160) 2786 return &AMDGPU::AV_160_Align2RegClass; 2787 if (BitWidth == 192) 2788 return &AMDGPU::AV_192_Align2RegClass; 2789 if (BitWidth == 224) 2790 return &AMDGPU::AV_224_Align2RegClass; 2791 if (BitWidth == 256) 2792 return &AMDGPU::AV_256_Align2RegClass; 2793 if (BitWidth == 288) 2794 return &AMDGPU::AV_288_Align2RegClass; 2795 if (BitWidth == 320) 2796 return &AMDGPU::AV_320_Align2RegClass; 2797 if (BitWidth == 352) 2798 return &AMDGPU::AV_352_Align2RegClass; 2799 if (BitWidth == 384) 2800 return &AMDGPU::AV_384_Align2RegClass; 2801 if (BitWidth == 512) 2802 return &AMDGPU::AV_512_Align2RegClass; 2803 if (BitWidth == 1024) 2804 return &AMDGPU::AV_1024_Align2RegClass; 2805 2806 return nullptr; 2807 } 2808 2809 const TargetRegisterClass * 2810 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { 2811 if (BitWidth == 16) 2812 return &AMDGPU::VGPR_LO16RegClass; 2813 if (BitWidth == 32) 2814 return &AMDGPU::AV_32RegClass; 2815 return ST.needsAlignedVGPRs() 2816 ? 
getAlignedVectorSuperClassForBitWidth(BitWidth) 2817 : getAnyVectorSuperClassForBitWidth(BitWidth); 2818 } 2819 2820 const TargetRegisterClass * 2821 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { 2822 if (BitWidth == 16) 2823 return &AMDGPU::SGPR_LO16RegClass; 2824 if (BitWidth == 32) 2825 return &AMDGPU::SReg_32RegClass; 2826 if (BitWidth == 64) 2827 return &AMDGPU::SReg_64RegClass; 2828 if (BitWidth == 96) 2829 return &AMDGPU::SGPR_96RegClass; 2830 if (BitWidth == 128) 2831 return &AMDGPU::SGPR_128RegClass; 2832 if (BitWidth == 160) 2833 return &AMDGPU::SGPR_160RegClass; 2834 if (BitWidth == 192) 2835 return &AMDGPU::SGPR_192RegClass; 2836 if (BitWidth == 224) 2837 return &AMDGPU::SGPR_224RegClass; 2838 if (BitWidth == 256) 2839 return &AMDGPU::SGPR_256RegClass; 2840 if (BitWidth == 288) 2841 return &AMDGPU::SGPR_288RegClass; 2842 if (BitWidth == 320) 2843 return &AMDGPU::SGPR_320RegClass; 2844 if (BitWidth == 352) 2845 return &AMDGPU::SGPR_352RegClass; 2846 if (BitWidth == 384) 2847 return &AMDGPU::SGPR_384RegClass; 2848 if (BitWidth == 512) 2849 return &AMDGPU::SGPR_512RegClass; 2850 if (BitWidth == 1024) 2851 return &AMDGPU::SGPR_1024RegClass; 2852 2853 return nullptr; 2854 } 2855 2856 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI, 2857 Register Reg) const { 2858 const TargetRegisterClass *RC; 2859 if (Reg.isVirtual()) 2860 RC = MRI.getRegClass(Reg); 2861 else 2862 RC = getPhysRegBaseClass(Reg); 2863 return RC ? isSGPRClass(RC) : false; 2864 } 2865 2866 const TargetRegisterClass * 2867 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { 2868 unsigned Size = getRegSizeInBits(*SRC); 2869 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); 2870 assert(VRC && "Invalid register class size"); 2871 return VRC; 2872 } 2873 2874 const TargetRegisterClass * 2875 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { 2876 unsigned Size = getRegSizeInBits(*SRC); 2877 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); 2878 assert(ARC && "Invalid register class size"); 2879 return ARC; 2880 } 2881 2882 const TargetRegisterClass * 2883 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { 2884 unsigned Size = getRegSizeInBits(*VRC); 2885 if (Size == 32) 2886 return &AMDGPU::SGPR_32RegClass; 2887 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size); 2888 assert(SRC && "Invalid register class size"); 2889 return SRC; 2890 } 2891 2892 const TargetRegisterClass * 2893 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, 2894 const TargetRegisterClass *SubRC, 2895 unsigned SubIdx) const { 2896 // Ensure this subregister index is aligned in the super register. 2897 const TargetRegisterClass *MatchRC = 2898 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx); 2899 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? 
MatchRC : nullptr; 2900 } 2901 2902 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { 2903 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && 2904 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) 2905 return !ST.hasMFMAInlineLiteralBug(); 2906 2907 return OpType >= AMDGPU::OPERAND_SRC_FIRST && 2908 OpType <= AMDGPU::OPERAND_SRC_LAST; 2909 } 2910 2911 bool SIRegisterInfo::shouldRewriteCopySrc( 2912 const TargetRegisterClass *DefRC, 2913 unsigned DefSubReg, 2914 const TargetRegisterClass *SrcRC, 2915 unsigned SrcSubReg) const { 2916 // We want to prefer the smallest register class possible, so we don't want to 2917 // stop and rewrite on anything that looks like a subregister 2918 // extract. Operations mostly don't care about the super register class, so we 2919 // only want to stop on the most basic of copies between the same register 2920 // class. 2921 // 2922 // e.g. if we have something like 2923 // %0 = ... 2924 // %1 = ... 2925 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 2926 // %3 = COPY %2, sub0 2927 // 2928 // We want to look through the COPY to find: 2929 // => %3 = COPY %0 2930 2931 // Plain copy. 2932 return getCommonSubClass(DefRC, SrcRC) != nullptr; 2933 } 2934 2935 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { 2936 // TODO: 64-bit operands have extending behavior from 32-bit literal. 2937 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST && 2938 OpType <= AMDGPU::OPERAND_REG_IMM_LAST; 2939 } 2940 2941 /// Returns a lowest register that is not used at any point in the function. 2942 /// If all registers are used, then this function will return 2943 /// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return 2944 /// highest unused register. 2945 MCRegister SIRegisterInfo::findUnusedRegister( 2946 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, 2947 const MachineFunction &MF, bool ReserveHighestRegister) const { 2948 if (ReserveHighestRegister) { 2949 for (MCRegister Reg : reverse(*RC)) 2950 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 2951 return Reg; 2952 } else { 2953 for (MCRegister Reg : *RC) 2954 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 2955 return Reg; 2956 } 2957 return MCRegister(); 2958 } 2959 2960 bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI, 2961 const RegisterBankInfo &RBI, 2962 Register Reg) const { 2963 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo()); 2964 if (!RB) 2965 return false; 2966 2967 return !RBI.isDivergentRegBank(RB); 2968 } 2969 2970 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, 2971 unsigned EltSize) const { 2972 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC); 2973 assert(RegBitWidth >= 32 && RegBitWidth <= 1024); 2974 2975 const unsigned RegDWORDs = RegBitWidth / 32; 2976 const unsigned EltDWORDs = EltSize / 4; 2977 assert(RegSplitParts.size() + 1 >= EltDWORDs); 2978 2979 const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1]; 2980 const unsigned NumParts = RegDWORDs / EltDWORDs; 2981 2982 return ArrayRef(Parts.data(), NumParts); 2983 } 2984 2985 const TargetRegisterClass* 2986 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, 2987 Register Reg) const { 2988 return Reg.isVirtual() ? 
MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg); 2989 } 2990 2991 const TargetRegisterClass * 2992 SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI, 2993 const MachineOperand &MO) const { 2994 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg()); 2995 return getSubRegisterClass(SrcRC, MO.getSubReg()); 2996 } 2997 2998 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, 2999 Register Reg) const { 3000 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 3001 // Registers without classes are unaddressable, SGPR-like registers. 3002 return RC && isVGPRClass(RC); 3003 } 3004 3005 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, 3006 Register Reg) const { 3007 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 3008 3009 // Registers without classes are unaddressable, SGPR-like registers. 3010 return RC && isAGPRClass(RC); 3011 } 3012 3013 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, 3014 const TargetRegisterClass *SrcRC, 3015 unsigned SubReg, 3016 const TargetRegisterClass *DstRC, 3017 unsigned DstSubReg, 3018 const TargetRegisterClass *NewRC, 3019 LiveIntervals &LIS) const { 3020 unsigned SrcSize = getRegSizeInBits(*SrcRC); 3021 unsigned DstSize = getRegSizeInBits(*DstRC); 3022 unsigned NewSize = getRegSizeInBits(*NewRC); 3023 3024 // Do not increase size of registers beyond dword, we would need to allocate 3025 // adjacent registers and constraint regalloc more than needed. 3026 3027 // Always allow dword coalescing. 3028 if (SrcSize <= 32 || DstSize <= 32) 3029 return true; 3030 3031 return NewSize <= DstSize || NewSize <= SrcSize; 3032 } 3033 3034 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, 3035 MachineFunction &MF) const { 3036 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 3037 3038 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), 3039 MF.getFunction()); 3040 switch (RC->getID()) { 3041 default: 3042 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); 3043 case AMDGPU::VGPR_32RegClassID: 3044 case AMDGPU::VGPR_LO16RegClassID: 3045 case AMDGPU::VGPR_HI16RegClassID: 3046 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); 3047 case AMDGPU::SGPR_32RegClassID: 3048 case AMDGPU::SGPR_LO16RegClassID: 3049 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); 3050 } 3051 } 3052 3053 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, 3054 unsigned Idx) const { 3055 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || 3056 Idx == AMDGPU::RegisterPressureSets::AGPR_32) 3057 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 3058 const_cast<MachineFunction &>(MF)); 3059 3060 if (Idx == AMDGPU::RegisterPressureSets::SReg_32) 3061 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, 3062 const_cast<MachineFunction &>(MF)); 3063 3064 llvm_unreachable("Unexpected register pressure set!"); 3065 } 3066 3067 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { 3068 static const int Empty[] = { -1 }; 3069 3070 if (RegPressureIgnoredUnits[RegUnit]) 3071 return Empty; 3072 3073 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); 3074 } 3075 3076 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { 3077 // Not a callee saved register. 
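  // The return address is passed in s[30:31] and is not preserved across
  // calls, so callers that still need it afterwards must save it themselves.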
3078 return AMDGPU::SGPR30_SGPR31; 3079 } 3080 3081 const TargetRegisterClass * 3082 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 3083 const RegisterBank &RB) const { 3084 switch (RB.getID()) { 3085 case AMDGPU::VGPRRegBankID: 3086 return getVGPRClassForBitWidth( 3087 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size)); 3088 case AMDGPU::VCCRegBankID: 3089 assert(Size == 1); 3090 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 3091 : &AMDGPU::SReg_64_XEXECRegClass; 3092 case AMDGPU::SGPRRegBankID: 3093 return getSGPRClassForBitWidth(std::max(32u, Size)); 3094 case AMDGPU::AGPRRegBankID: 3095 return getAGPRClassForBitWidth(std::max(32u, Size)); 3096 default: 3097 llvm_unreachable("unknown register bank"); 3098 } 3099 } 3100 3101 const TargetRegisterClass * 3102 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 3103 const MachineRegisterInfo &MRI) const { 3104 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); 3105 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) 3106 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB); 3107 3108 if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>()) 3109 return getAllocatableClass(RC); 3110 3111 return nullptr; 3112 } 3113 3114 MCRegister SIRegisterInfo::getVCC() const { 3115 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 3116 } 3117 3118 MCRegister SIRegisterInfo::getExec() const { 3119 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 3120 } 3121 3122 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { 3123 // VGPR tuples have an alignment requirement on gfx90a variants. 3124 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass 3125 : &AMDGPU::VReg_64RegClass; 3126 } 3127 3128 const TargetRegisterClass * 3129 SIRegisterInfo::getRegClass(unsigned RCID) const { 3130 switch ((int)RCID) { 3131 case AMDGPU::SReg_1RegClassID: 3132 return getBoolRC(); 3133 case AMDGPU::SReg_1_XEXECRegClassID: 3134 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 3135 : &AMDGPU::SReg_64_XEXECRegClass; 3136 case -1: 3137 return nullptr; 3138 default: 3139 return AMDGPUGenRegisterInfo::getRegClass(RCID); 3140 } 3141 } 3142 3143 // Find reaching register definition 3144 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, 3145 MachineInstr &Use, 3146 MachineRegisterInfo &MRI, 3147 LiveIntervals *LIS) const { 3148 auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); 3149 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 3150 SlotIndex DefIdx; 3151 3152 if (Reg.isVirtual()) { 3153 if (!LIS->hasInterval(Reg)) 3154 return nullptr; 3155 LiveInterval &LI = LIS->getInterval(Reg); 3156 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 3157 : MRI.getMaxLaneMaskForVReg(Reg); 3158 VNInfo *V = nullptr; 3159 if (LI.hasSubRanges()) { 3160 for (auto &S : LI.subranges()) { 3161 if ((S.LaneMask & SubLanes) == SubLanes) { 3162 V = S.getVNInfoAt(UseIdx); 3163 break; 3164 } 3165 } 3166 } else { 3167 V = LI.getVNInfoAt(UseIdx); 3168 } 3169 if (!V) 3170 return nullptr; 3171 DefIdx = V->def; 3172 } else { 3173 // Find last def. 
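    // For a physical register there is no single live interval, so walk every
    // register unit and keep the latest reaching def (the candidate dominated
    // by the defs already found); if any unit has no value live at UseIdx,
    // the search fails.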
3174 for (MCRegUnit Unit : regunits(Reg.asMCReg())) { 3175 LiveRange &LR = LIS->getRegUnit(Unit); 3176 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 3177 if (!DefIdx.isValid() || 3178 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 3179 LIS->getInstructionFromIndex(V->def))) 3180 DefIdx = V->def; 3181 } else { 3182 return nullptr; 3183 } 3184 } 3185 } 3186 3187 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 3188 3189 if (!Def || !MDT.dominates(Def, &Use)) 3190 return nullptr; 3191 3192 assert(Def->modifiesRegister(Reg, this)); 3193 3194 return Def; 3195 } 3196 3197 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { 3198 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32); 3199 3200 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, 3201 AMDGPU::SReg_32RegClass, 3202 AMDGPU::AGPR_32RegClass } ) { 3203 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC)) 3204 return Super; 3205 } 3206 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, 3207 &AMDGPU::VGPR_32RegClass)) { 3208 return Super; 3209 } 3210 3211 return AMDGPU::NoRegister; 3212 } 3213 3214 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { 3215 if (!ST.needsAlignedVGPRs()) 3216 return true; 3217 3218 if (isVGPRClass(&RC)) 3219 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); 3220 if (isAGPRClass(&RC)) 3221 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); 3222 if (isVectorSuperClass(&RC)) 3223 return RC.hasSuperClassEq( 3224 getVectorSuperClassForBitWidth(getRegSizeInBits(RC))); 3225 3226 return true; 3227 } 3228 3229 const TargetRegisterClass * 3230 SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { 3231 if (!RC || !ST.needsAlignedVGPRs()) 3232 return RC; 3233 3234 unsigned Size = getRegSizeInBits(*RC); 3235 if (Size <= 32) 3236 return RC; 3237 3238 if (isVGPRClass(RC)) 3239 return getAlignedVGPRClassForBitWidth(Size); 3240 if (isAGPRClass(RC)) 3241 return getAlignedAGPRClassForBitWidth(Size); 3242 if (isVectorSuperClass(RC)) 3243 return getAlignedVectorSuperClassForBitWidth(Size); 3244 3245 return RC; 3246 } 3247 3248 ArrayRef<MCPhysReg> 3249 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { 3250 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); 3251 } 3252 3253 ArrayRef<MCPhysReg> 3254 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { 3255 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2); 3256 } 3257 3258 ArrayRef<MCPhysReg> 3259 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { 3260 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); 3261 } 3262 3263 unsigned 3264 SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, 3265 unsigned SubReg) const { 3266 switch (RC->TSFlags & SIRCFlags::RegKindMask) { 3267 case SIRCFlags::HasSGPR: 3268 return std::min(128u, getSubRegIdxSize(SubReg)); 3269 case SIRCFlags::HasAGPR: 3270 case SIRCFlags::HasVGPR: 3271 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR: 3272 return std::min(32u, getSubRegIdxSize(SubReg)); 3273 default: 3274 break; 3275 } 3276 return 0; 3277 } 3278