//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "SIRegisterInfo.h"
#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"

static cl::opt<bool> EnableSpillSGPRToVGPR(
  "amdgpu-spill-sgpr-to-vgpr",
  cl::desc("Enable spilling SGPRs to VGPRs"),
  cl::ReallyHidden,
  cl::init(true));

std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;

// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};

namespace llvm {

// A temporary struct to spill SGPRs.
// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
// just v_writelane and v_readlane.
//
// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
// is saved to scratch (or the other way around for loads).
// For this, a VGPR is required where the needed lanes can be clobbered. The
// RegScavenger can provide a VGPR where currently active lanes can be
// clobbered, but we still need to save inactive lanes.
// The high-level steps are:
// - Try to scavenge SGPR(s) to save exec
// - Try to scavenge VGPR
// - Save needed, all or inactive lanes of a TmpVGPR
// - Spill/Restore SGPRs using TmpVGPR
// - Restore TmpVGPR
//
// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
// cannot scavenge temporary SGPRs to save exec, we use the following code:
// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
// s_not exec, exec
// buffer_store_dword TmpVGPR ; save inactive lanes
// s_not exec, exec
struct SGPRSpillBuilder {
  struct PerVGPRData {
    unsigned PerVGPR;
    unsigned NumVGPRs;
    int64_t VGPRLanes;
  };

  // The SGPR to save
  Register SuperReg;
  MachineBasicBlock::iterator MI;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  bool IsKill;
  const DebugLoc &DL;

  /* When spilling to stack */
  // The SGPRs are written into this VGPR, which is then written to scratch
  // (or vice versa for loads).
  Register TmpVGPR = AMDGPU::NoRegister;
  // Temporary spill slot to save TmpVGPR to.
  int TmpVGPRIndex = 0;
  // If TmpVGPR is live before the spill or if it is scavenged.
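  // (Note, derived from prepare() below: this is set to false when a VGPR that
  // is dead in the currently active lanes could be scavenged, and to true when
  // we fall back to reusing a potentially live register such as v0, whose used
  // lanes must then be saved and restored around the spill.)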
92 bool TmpVGPRLive = false; 93 // Scavenged SGPR to save EXEC. 94 Register SavedExecReg = AMDGPU::NoRegister; 95 // Stack index to write the SGPRs to. 96 int Index; 97 unsigned EltSize = 4; 98 99 RegScavenger *RS; 100 MachineBasicBlock *MBB; 101 MachineFunction &MF; 102 SIMachineFunctionInfo &MFI; 103 const SIInstrInfo &TII; 104 const SIRegisterInfo &TRI; 105 bool IsWave32; 106 Register ExecReg; 107 unsigned MovOpc; 108 unsigned NotOpc; 109 110 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, 111 bool IsWave32, MachineBasicBlock::iterator MI, int Index, 112 RegScavenger *RS) 113 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(), 114 MI->getOperand(0).isKill(), Index, RS) {} 115 116 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, 117 bool IsWave32, MachineBasicBlock::iterator MI, Register Reg, 118 bool IsKill, int Index, RegScavenger *RS) 119 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()), 120 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()), 121 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), 122 IsWave32(IsWave32) { 123 const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg); 124 SplitParts = TRI.getRegSplitParts(RC, EltSize); 125 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 126 127 if (IsWave32) { 128 ExecReg = AMDGPU::EXEC_LO; 129 MovOpc = AMDGPU::S_MOV_B32; 130 NotOpc = AMDGPU::S_NOT_B32; 131 } else { 132 ExecReg = AMDGPU::EXEC; 133 MovOpc = AMDGPU::S_MOV_B64; 134 NotOpc = AMDGPU::S_NOT_B64; 135 } 136 137 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 138 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && 139 SuperReg != AMDGPU::EXEC && "exec should never spill"); 140 } 141 142 PerVGPRData getPerVGPRData() { 143 PerVGPRData Data; 144 Data.PerVGPR = IsWave32 ? 32 : 64; 145 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR; 146 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL; 147 return Data; 148 } 149 150 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is 151 // free. 152 // Writes these instructions if an SGPR can be scavenged: 153 // s_mov_b64 s[6:7], exec ; Save exec 154 // s_mov_b64 exec, 3 ; Wanted lanemask 155 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot 156 // 157 // Writes these instructions if no SGPR can be scavenged: 158 // buffer_store_dword v0 ; Only if no free VGPR was found 159 // s_not_b64 exec, exec 160 // buffer_store_dword v0 ; Save inactive lanes 161 // ; exec stays inverted, it is flipped back in 162 // ; restore. 163 void prepare() { 164 // Scavenged temporary VGPR to use. It must be scavenged once for any number 165 // of spilled subregs. 166 // FIXME: The liveness analysis is limited and does not tell if a register 167 // is in use in lanes that are currently inactive. We can never be sure if 168 // a register as actually in use in another lane, so we need to save all 169 // used lanes of the chosen VGPR. 170 assert(RS && "Cannot spill SGPR to memory without RegScavenger"); 171 TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false); 172 173 // Reserve temporary stack slot 174 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI); 175 if (TmpVGPR) { 176 // Found a register that is dead in the currently active lanes, we only 177 // need to spill inactive lanes. 178 TmpVGPRLive = false; 179 } else { 180 // Pick v0 because it doesn't make a difference. 
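      // All of its lanes that may be clobbered are saved to the emergency slot
      // below and reloaded again in restore(), so reusing a live register is
      // safe here.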
      TmpVGPR = AMDGPU::VGPR0;
      TmpVGPRLive = true;
    }

    // Try to scavenge SGPRs to save exec
    assert(!SavedExecReg && "Exec is already saved, refuse to save again");
    const TargetRegisterClass &RC =
        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    RS->setRegUsed(SuperReg);
    SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false);

    int64_t VGPRLanes = getPerVGPRData().VGPRLanes;

    if (SavedExecReg) {
      RS->setRegUsed(SavedExecReg);
      // Set exec to needed lanes
      BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
      auto I =
          BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    } else {
      // Spill active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
                                    /*IsKill*/ false);
      // Spill inactive lanes
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    }
  }

  // Writes these instructions if an SGPR can be scavenged:
  // buffer_load_dword v1      ; Reload scavenged VGPR from emergency slot
  // s_waitcnt vmcnt(0)        ; If a free VGPR was found
  // s_mov_b64 exec, s[6:7]    ; Restore exec
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_load_dword v0      ; Restore inactive lanes
  // s_waitcnt vmcnt(0)        ; If a free VGPR was found
  // s_not_b64 exec, exec
  // buffer_load_dword v0      ; Only if no free VGPR was found
  void restore() {
    if (SavedExecReg) {
      // Restore used lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      // Restore exec
      auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
                   .addReg(SavedExecReg, RegState::Kill);
      // Add an implicit use of the load so it is not dead.
      // FIXME: This inserts an unnecessary waitcnt
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      }
    } else {
      // Restore inactive lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      }
      // Restore active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
    }
  }

  // Write TmpVGPR to memory or read TmpVGPR from memory.
255 // Either using a single buffer_load/store if exec is set to the needed mask 256 // or using 257 // buffer_load 258 // s_not exec, exec 259 // buffer_load 260 // s_not exec, exec 261 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) { 262 if (SavedExecReg) { 263 // Spill needed lanes 264 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); 265 } else { 266 // Spill active lanes 267 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad, 268 /*IsKill*/ false); 269 // Spill inactive lanes 270 BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 271 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); 272 BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 273 } 274 } 275 276 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) { 277 assert(MBB->getParent() == &MF); 278 MI = NewMI; 279 MBB = NewMBB; 280 } 281 }; 282 283 } // namespace llvm 284 285 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) 286 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST), 287 SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { 288 289 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 && 290 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) && 291 (getSubRegIndexLaneMask(AMDGPU::lo16) | 292 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() == 293 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() && 294 "getNumCoveredRegs() will not work with generated subreg masks!"); 295 296 RegPressureIgnoredUnits.resize(getNumRegUnits()); 297 RegPressureIgnoredUnits.set( 298 *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this)); 299 for (auto Reg : AMDGPU::VGPR_HI16RegClass) 300 RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this)); 301 302 // HACK: Until this is fully tablegen'd. 303 static llvm::once_flag InitializeRegSplitPartsFlag; 304 305 static auto InitializeRegSplitPartsOnce = [this]() { 306 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) { 307 unsigned Size = getSubRegIdxSize(Idx); 308 if (Size & 31) 309 continue; 310 std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1]; 311 unsigned Pos = getSubRegIdxOffset(Idx); 312 if (Pos % Size) 313 continue; 314 Pos /= Size; 315 if (Vec.empty()) { 316 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits. 
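        // For example, assuming the generated 64-bit sub-register indices use
        // the usual sub0_sub1, sub2_sub3, ... naming, Size == 64 gives 16
        // parts here, and the loop below fills positions 0, 1, 2, ... with
        // those indices.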
317 Vec.resize(MaxNumParts); 318 } 319 Vec[Pos] = Idx; 320 } 321 }; 322 323 static llvm::once_flag InitializeSubRegFromChannelTableFlag; 324 325 static auto InitializeSubRegFromChannelTableOnce = [this]() { 326 for (auto &Row : SubRegFromChannelTable) 327 Row.fill(AMDGPU::NoSubRegister); 328 for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) { 329 unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32; 330 unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32; 331 assert(Width < SubRegFromChannelTableWidthMap.size()); 332 Width = SubRegFromChannelTableWidthMap[Width]; 333 if (Width == 0) 334 continue; 335 unsigned TableIdx = Width - 1; 336 assert(TableIdx < SubRegFromChannelTable.size()); 337 assert(Offset < SubRegFromChannelTable[TableIdx].size()); 338 SubRegFromChannelTable[TableIdx][Offset] = Idx; 339 } 340 }; 341 342 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce); 343 llvm::call_once(InitializeSubRegFromChannelTableFlag, 344 InitializeSubRegFromChannelTableOnce); 345 } 346 347 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, 348 MCRegister Reg) const { 349 MCRegAliasIterator R(Reg, this, true); 350 351 for (; R.isValid(); ++R) 352 Reserved.set(*R); 353 } 354 355 // Forced to be here by one .inc 356 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( 357 const MachineFunction *MF) const { 358 CallingConv::ID CC = MF->getFunction().getCallingConv(); 359 switch (CC) { 360 case CallingConv::C: 361 case CallingConv::Fast: 362 case CallingConv::Cold: 363 return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts() 364 ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList 365 : CSR_AMDGPU_HighRegs_SaveList; 366 case CallingConv::AMDGPU_Gfx: 367 return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts() 368 ? CSR_AMDGPU_SI_Gfx_With_AGPRs_SaveList 369 : CSR_AMDGPU_SI_Gfx_SaveList; 370 default: { 371 // Dummy to not crash RegisterClassInfo. 372 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; 373 return &NoCalleeSavedReg; 374 } 375 } 376 } 377 378 const MCPhysReg * 379 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { 380 return nullptr; 381 } 382 383 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, 384 CallingConv::ID CC) const { 385 switch (CC) { 386 case CallingConv::C: 387 case CallingConv::Fast: 388 case CallingConv::Cold: 389 return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts() 390 ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask 391 : CSR_AMDGPU_HighRegs_RegMask; 392 case CallingConv::AMDGPU_Gfx: 393 return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts() 394 ? CSR_AMDGPU_SI_Gfx_With_AGPRs_RegMask 395 : CSR_AMDGPU_SI_Gfx_RegMask; 396 default: 397 return nullptr; 398 } 399 } 400 401 const uint32_t *SIRegisterInfo::getNoPreservedMask() const { 402 return CSR_AMDGPU_NoRegs_RegMask; 403 } 404 405 const TargetRegisterClass * 406 SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, 407 const MachineFunction &MF) const { 408 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the 409 // equivalent AV class. If used one, the verifier will crash after 410 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given 411 // until Instruction selection. 
412 if (MF.getSubtarget<GCNSubtarget>().hasMAIInsts() && 413 (isVGPRClass(RC) || isAGPRClass(RC))) { 414 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass) 415 return &AMDGPU::AV_32RegClass; 416 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass) 417 return &AMDGPU::AV_64RegClass; 418 if (RC == &AMDGPU::VReg_64_Align2RegClass || 419 RC == &AMDGPU::AReg_64_Align2RegClass) 420 return &AMDGPU::AV_64_Align2RegClass; 421 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass) 422 return &AMDGPU::AV_96RegClass; 423 if (RC == &AMDGPU::VReg_96_Align2RegClass || 424 RC == &AMDGPU::AReg_96_Align2RegClass) 425 return &AMDGPU::AV_96_Align2RegClass; 426 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass) 427 return &AMDGPU::AV_128RegClass; 428 if (RC == &AMDGPU::VReg_128_Align2RegClass || 429 RC == &AMDGPU::AReg_128_Align2RegClass) 430 return &AMDGPU::AV_128_Align2RegClass; 431 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass) 432 return &AMDGPU::AV_160RegClass; 433 if (RC == &AMDGPU::VReg_160_Align2RegClass || 434 RC == &AMDGPU::AReg_160_Align2RegClass) 435 return &AMDGPU::AV_160_Align2RegClass; 436 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass) 437 return &AMDGPU::AV_192RegClass; 438 if (RC == &AMDGPU::VReg_192_Align2RegClass || 439 RC == &AMDGPU::AReg_192_Align2RegClass) 440 return &AMDGPU::AV_192_Align2RegClass; 441 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass) 442 return &AMDGPU::AV_256RegClass; 443 if (RC == &AMDGPU::VReg_256_Align2RegClass || 444 RC == &AMDGPU::AReg_256_Align2RegClass) 445 return &AMDGPU::AV_256_Align2RegClass; 446 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass) 447 return &AMDGPU::AV_512RegClass; 448 if (RC == &AMDGPU::VReg_512_Align2RegClass || 449 RC == &AMDGPU::AReg_512_Align2RegClass) 450 return &AMDGPU::AV_512_Align2RegClass; 451 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass) 452 return &AMDGPU::AV_1024RegClass; 453 if (RC == &AMDGPU::VReg_1024_Align2RegClass || 454 RC == &AMDGPU::AReg_1024_Align2RegClass) 455 return &AMDGPU::AV_1024_Align2RegClass; 456 } 457 458 return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF); 459 } 460 461 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { 462 const SIFrameLowering *TFI = 463 MF.getSubtarget<GCNSubtarget>().getFrameLowering(); 464 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 465 // During ISel lowering we always reserve the stack pointer in entry 466 // functions, but never actually want to reference it when accessing our own 467 // frame. If we need a frame pointer we use it, but otherwise we can just use 468 // an immediate "0" which we represent by returning NoRegister. 469 if (FuncInfo->isEntryFunction()) { 470 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register(); 471 } 472 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() 473 : FuncInfo->getStackPtrOffsetReg(); 474 } 475 476 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { 477 // When we need stack realignment, we can't reference off of the 478 // stack pointer, so we reserve a base pointer. 
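  // Fixed (incoming) stack objects keep their offsets relative to the
  // unrealigned incoming stack, so once the stack is realigned they can no
  // longer be reached from the stack pointer; the base pointer is used for
  // them instead.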
479 const MachineFrameInfo &MFI = MF.getFrameInfo(); 480 return MFI.getNumFixedObjects() && shouldRealignStack(MF); 481 } 482 483 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } 484 485 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { 486 return CSR_AMDGPU_AllVGPRs_RegMask; 487 } 488 489 const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const { 490 return CSR_AMDGPU_AllAGPRs_RegMask; 491 } 492 493 const uint32_t *SIRegisterInfo::getAllVectorRegMask() const { 494 return CSR_AMDGPU_AllVectorRegs_RegMask; 495 } 496 497 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { 498 return CSR_AMDGPU_AllAllocatableSRegs_RegMask; 499 } 500 501 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel, 502 unsigned NumRegs) { 503 assert(NumRegs < SubRegFromChannelTableWidthMap.size()); 504 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs]; 505 assert(NumRegIndex && "Not implemented"); 506 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size()); 507 return SubRegFromChannelTable[NumRegIndex - 1][Channel]; 508 } 509 510 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( 511 const MachineFunction &MF) const { 512 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; 513 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); 514 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); 515 } 516 517 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { 518 BitVector Reserved(getNumRegs()); 519 Reserved.set(AMDGPU::MODE); 520 521 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but 522 // this seems likely to result in bugs, so I'm marking them as reserved. 523 reserveRegisterTuples(Reserved, AMDGPU::EXEC); 524 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); 525 526 // M0 has to be reserved so that llvm accepts it as a live-in into a block. 527 reserveRegisterTuples(Reserved, AMDGPU::M0); 528 529 // Reserve src_vccz, src_execz, src_scc. 530 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ); 531 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ); 532 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC); 533 534 // Reserve the memory aperture registers. 535 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); 536 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); 537 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); 538 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); 539 540 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen. 541 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID); 542 543 // Reserve xnack_mask registers - support is not implemented in Codegen. 544 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); 545 546 // Reserve lds_direct register - support is not implemented in Codegen. 547 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); 548 549 // Reserve Trap Handler registers - support is not implemented in Codegen. 
550 reserveRegisterTuples(Reserved, AMDGPU::TBA); 551 reserveRegisterTuples(Reserved, AMDGPU::TMA); 552 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); 553 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); 554 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); 555 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); 556 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); 557 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); 558 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); 559 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); 560 561 // Reserve null register - it shall never be allocated 562 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL); 563 564 // Disallow vcc_hi allocation in wave32. It may be allocated but most likely 565 // will result in bugs. 566 if (isWave32) { 567 Reserved.set(AMDGPU::VCC); 568 Reserved.set(AMDGPU::VCC_HI); 569 } 570 571 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 572 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 573 for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { 574 unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); 575 reserveRegisterTuples(Reserved, Reg); 576 } 577 578 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 579 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); 580 unsigned MaxNumAGPRs = MaxNumVGPRs; 581 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 582 583 if (ST.hasGFX90AInsts()) { 584 // In an entry function without calls and AGPRs used it is possible to use 585 // the whole register budget for VGPRs. 586 587 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and 588 // split register file accordingly. 589 if (MFI->usesAGPRs(MF)) { 590 MaxNumVGPRs /= 2; 591 MaxNumAGPRs = MaxNumVGPRs; 592 } else { 593 if (MaxNumVGPRs > TotalNumVGPRs) { 594 MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs; 595 MaxNumVGPRs = TotalNumVGPRs; 596 } else 597 MaxNumAGPRs = 0; 598 } 599 } 600 601 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { 602 unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); 603 reserveRegisterTuples(Reserved, Reg); 604 } 605 606 for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) { 607 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 608 reserveRegisterTuples(Reserved, Reg); 609 } 610 611 for (auto Reg : AMDGPU::SReg_32RegClass) { 612 Reserved.set(getSubReg(Reg, AMDGPU::hi16)); 613 Register Low = getSubReg(Reg, AMDGPU::lo16); 614 // This is to prevent BB vcc liveness errors. 615 if (!AMDGPU::SGPR_LO16RegClass.contains(Low)) 616 Reserved.set(Low); 617 } 618 619 for (auto Reg : AMDGPU::AGPR_32RegClass) { 620 Reserved.set(getSubReg(Reg, AMDGPU::hi16)); 621 } 622 623 // Reserve all the rest AGPRs if there are no instructions to use it. 624 if (!ST.hasMAIInsts()) { 625 for (unsigned i = 0; i < MaxNumVGPRs; ++i) { 626 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 627 reserveRegisterTuples(Reserved, Reg); 628 } 629 } 630 631 Register ScratchRSrcReg = MFI->getScratchRSrcReg(); 632 if (ScratchRSrcReg != AMDGPU::NoRegister) { 633 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need 634 // to spill. 635 // TODO: May need to reserve a VGPR if doing LDS spilling. 636 reserveRegisterTuples(Reserved, ScratchRSrcReg); 637 } 638 639 // We have to assume the SP is needed in case there are calls in the function, 640 // which is detected after the function is lowered. If we aren't really going 641 // to need SP, don't bother reserving it. 
642 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg(); 643 644 if (StackPtrReg) { 645 reserveRegisterTuples(Reserved, StackPtrReg); 646 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); 647 } 648 649 MCRegister FrameReg = MFI->getFrameOffsetReg(); 650 if (FrameReg) { 651 reserveRegisterTuples(Reserved, FrameReg); 652 assert(!isSubRegister(ScratchRSrcReg, FrameReg)); 653 } 654 655 if (hasBasePointer(MF)) { 656 MCRegister BasePtrReg = getBaseRegister(); 657 reserveRegisterTuples(Reserved, BasePtrReg); 658 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); 659 } 660 661 for (auto Reg : MFI->WWMReservedRegs) { 662 reserveRegisterTuples(Reserved, Reg.first); 663 } 664 665 // Reserve VGPRs used for SGPR spilling. 666 // Note we treat freezeReservedRegs unusually because we run register 667 // allocation in two phases. It's OK to re-freeze with new registers for the 668 // second run. 669 #if 0 670 for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) { 671 for (auto &SpilledVGPR : SpilledFI.second) 672 reserveRegisterTuples(Reserved, SpilledVGPR.VGPR); 673 } 674 #endif 675 676 // FIXME: Stop using reserved registers for this. 677 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) 678 reserveRegisterTuples(Reserved, Reg); 679 680 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) 681 reserveRegisterTuples(Reserved, Reg); 682 683 for (auto SSpill : MFI->getSGPRSpillVGPRs()) 684 reserveRegisterTuples(Reserved, SSpill.VGPR); 685 686 return Reserved; 687 } 688 689 bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const { 690 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 691 // On entry, the base address is 0, so it can't possibly need any more 692 // alignment. 693 694 // FIXME: Should be able to specify the entry frame alignment per calling 695 // convention instead. 696 if (Info->isEntryFunction()) 697 return false; 698 699 return TargetRegisterInfo::shouldRealignStack(MF); 700 } 701 702 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { 703 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); 704 if (Info->isEntryFunction()) { 705 const MachineFrameInfo &MFI = Fn.getFrameInfo(); 706 return MFI.hasStackObjects() || MFI.hasCalls(); 707 } 708 709 // May need scavenger for dealing with callee saved registers. 710 return true; 711 } 712 713 bool SIRegisterInfo::requiresFrameIndexScavenging( 714 const MachineFunction &MF) const { 715 // Do not use frame virtual registers. They used to be used for SGPRs, but 716 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the 717 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a 718 // spill. 719 return false; 720 } 721 722 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( 723 const MachineFunction &MF) const { 724 const MachineFrameInfo &MFI = MF.getFrameInfo(); 725 return MFI.hasStackObjects(); 726 } 727 728 bool SIRegisterInfo::requiresVirtualBaseRegisters( 729 const MachineFunction &) const { 730 // There are no special dedicated stack or frame pointers. 
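  // Returning true lets the generic frame-index code create virtual base
  // registers (see needsFrameBaseReg(), materializeFrameBaseRegister() and
  // resolveFrameIndex() below) when a scratch offset would not fit in an
  // instruction's immediate field.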
731 return true; 732 } 733 734 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const { 735 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI)); 736 737 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 738 AMDGPU::OpName::offset); 739 return MI->getOperand(OffIdx).getImm(); 740 } 741 742 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, 743 int Idx) const { 744 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 745 return 0; 746 747 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 748 AMDGPU::OpName::vaddr) || 749 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 750 AMDGPU::OpName::saddr))) && 751 "Should never see frame index on non-address operand"); 752 753 return getScratchInstrOffset(MI); 754 } 755 756 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { 757 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 758 return false; 759 760 int64_t FullOffset = Offset + getScratchInstrOffset(MI); 761 762 if (SIInstrInfo::isMUBUF(*MI)) 763 return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset); 764 765 const SIInstrInfo *TII = ST.getInstrInfo(); 766 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, 767 SIInstrFlags::FlatScratch); 768 } 769 770 Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, 771 int FrameIdx, 772 int64_t Offset) const { 773 MachineBasicBlock::iterator Ins = MBB->begin(); 774 DebugLoc DL; // Defaults to "unknown" 775 776 if (Ins != MBB->end()) 777 DL = Ins->getDebugLoc(); 778 779 MachineFunction *MF = MBB->getParent(); 780 const SIInstrInfo *TII = ST.getInstrInfo(); 781 MachineRegisterInfo &MRI = MF->getRegInfo(); 782 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32 783 : AMDGPU::V_MOV_B32_e32; 784 785 Register BaseReg = MRI.createVirtualRegister( 786 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass 787 : &AMDGPU::VGPR_32RegClass); 788 789 if (Offset == 0) { 790 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg) 791 .addFrameIndex(FrameIdx); 792 return BaseReg; 793 } 794 795 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 796 797 Register FIReg = MRI.createVirtualRegister( 798 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass 799 : &AMDGPU::VGPR_32RegClass); 800 801 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) 802 .addImm(Offset); 803 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg) 804 .addFrameIndex(FrameIdx); 805 806 if (ST.enableFlatScratch() ) { 807 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg) 808 .addReg(OffsetReg, RegState::Kill) 809 .addReg(FIReg); 810 return BaseReg; 811 } 812 813 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) 814 .addReg(OffsetReg, RegState::Kill) 815 .addReg(FIReg) 816 .addImm(0); // clamp bit 817 818 return BaseReg; 819 } 820 821 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, 822 int64_t Offset) const { 823 const SIInstrInfo *TII = ST.getInstrInfo(); 824 bool IsFlat = TII->isFLATScratch(MI); 825 826 #ifndef NDEBUG 827 // FIXME: Is it possible to be storing a frame index to itself? 828 bool SeenFI = false; 829 for (const MachineOperand &MO: MI.operands()) { 830 if (MO.isFI()) { 831 if (SeenFI) 832 llvm_unreachable("should not see multiple frame indices"); 833 834 SeenFI = true; 835 } 836 } 837 #endif 838 839 MachineOperand *FIOp = 840 TII->getNamedOperand(MI, IsFlat ? 
AMDGPU::OpName::saddr 841 : AMDGPU::OpName::vaddr); 842 843 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); 844 int64_t NewOffset = OffsetOp->getImm() + Offset; 845 846 assert(FIOp && FIOp->isFI() && "frame index must be address operand"); 847 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI)); 848 849 if (IsFlat) { 850 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 851 SIInstrFlags::FlatScratch) && 852 "offset should be legal"); 853 FIOp->ChangeToRegister(BaseReg, false); 854 OffsetOp->setImm(NewOffset); 855 return; 856 } 857 858 #ifndef NDEBUG 859 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset); 860 assert(SOffset->isImm() && SOffset->getImm() == 0); 861 #endif 862 863 assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) && 864 "offset should be legal"); 865 866 FIOp->ChangeToRegister(BaseReg, false); 867 OffsetOp->setImm(NewOffset); 868 } 869 870 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, 871 Register BaseReg, 872 int64_t Offset) const { 873 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 874 return false; 875 876 int64_t NewOffset = Offset + getScratchInstrOffset(MI); 877 878 if (SIInstrInfo::isMUBUF(*MI)) 879 return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset); 880 881 const SIInstrInfo *TII = ST.getInstrInfo(); 882 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 883 SIInstrFlags::FlatScratch); 884 } 885 886 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( 887 const MachineFunction &MF, unsigned Kind) const { 888 // This is inaccurate. It depends on the instruction and address space. The 889 // only place where we should hit this is for dealing with frame indexes / 890 // private accesses, so this is correct in that case. 
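  // Frame index / private (scratch) accesses compute a per-lane address, so
  // the pointer lives in a VGPR.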
891 return &AMDGPU::VGPR_32RegClass; 892 } 893 894 const TargetRegisterClass * 895 SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { 896 if (isAGPRClass(RC) && !ST.hasGFX90AInsts()) 897 return getEquivalentVGPRClass(RC); 898 899 return RC; 900 } 901 902 static unsigned getNumSubRegsForSpillOp(unsigned Op) { 903 904 switch (Op) { 905 case AMDGPU::SI_SPILL_S1024_SAVE: 906 case AMDGPU::SI_SPILL_S1024_RESTORE: 907 case AMDGPU::SI_SPILL_V1024_SAVE: 908 case AMDGPU::SI_SPILL_V1024_RESTORE: 909 case AMDGPU::SI_SPILL_A1024_SAVE: 910 case AMDGPU::SI_SPILL_A1024_RESTORE: 911 return 32; 912 case AMDGPU::SI_SPILL_S512_SAVE: 913 case AMDGPU::SI_SPILL_S512_RESTORE: 914 case AMDGPU::SI_SPILL_V512_SAVE: 915 case AMDGPU::SI_SPILL_V512_RESTORE: 916 case AMDGPU::SI_SPILL_A512_SAVE: 917 case AMDGPU::SI_SPILL_A512_RESTORE: 918 return 16; 919 case AMDGPU::SI_SPILL_S256_SAVE: 920 case AMDGPU::SI_SPILL_S256_RESTORE: 921 case AMDGPU::SI_SPILL_V256_SAVE: 922 case AMDGPU::SI_SPILL_V256_RESTORE: 923 case AMDGPU::SI_SPILL_A256_SAVE: 924 case AMDGPU::SI_SPILL_A256_RESTORE: 925 return 8; 926 case AMDGPU::SI_SPILL_S224_SAVE: 927 case AMDGPU::SI_SPILL_S224_RESTORE: 928 case AMDGPU::SI_SPILL_V224_SAVE: 929 case AMDGPU::SI_SPILL_V224_RESTORE: 930 case AMDGPU::SI_SPILL_A224_SAVE: 931 case AMDGPU::SI_SPILL_A224_RESTORE: 932 return 7; 933 case AMDGPU::SI_SPILL_S192_SAVE: 934 case AMDGPU::SI_SPILL_S192_RESTORE: 935 case AMDGPU::SI_SPILL_V192_SAVE: 936 case AMDGPU::SI_SPILL_V192_RESTORE: 937 case AMDGPU::SI_SPILL_A192_SAVE: 938 case AMDGPU::SI_SPILL_A192_RESTORE: 939 return 6; 940 case AMDGPU::SI_SPILL_S160_SAVE: 941 case AMDGPU::SI_SPILL_S160_RESTORE: 942 case AMDGPU::SI_SPILL_V160_SAVE: 943 case AMDGPU::SI_SPILL_V160_RESTORE: 944 case AMDGPU::SI_SPILL_A160_SAVE: 945 case AMDGPU::SI_SPILL_A160_RESTORE: 946 return 5; 947 case AMDGPU::SI_SPILL_S128_SAVE: 948 case AMDGPU::SI_SPILL_S128_RESTORE: 949 case AMDGPU::SI_SPILL_V128_SAVE: 950 case AMDGPU::SI_SPILL_V128_RESTORE: 951 case AMDGPU::SI_SPILL_A128_SAVE: 952 case AMDGPU::SI_SPILL_A128_RESTORE: 953 return 4; 954 case AMDGPU::SI_SPILL_S96_SAVE: 955 case AMDGPU::SI_SPILL_S96_RESTORE: 956 case AMDGPU::SI_SPILL_V96_SAVE: 957 case AMDGPU::SI_SPILL_V96_RESTORE: 958 case AMDGPU::SI_SPILL_A96_SAVE: 959 case AMDGPU::SI_SPILL_A96_RESTORE: 960 return 3; 961 case AMDGPU::SI_SPILL_S64_SAVE: 962 case AMDGPU::SI_SPILL_S64_RESTORE: 963 case AMDGPU::SI_SPILL_V64_SAVE: 964 case AMDGPU::SI_SPILL_V64_RESTORE: 965 case AMDGPU::SI_SPILL_A64_SAVE: 966 case AMDGPU::SI_SPILL_A64_RESTORE: 967 return 2; 968 case AMDGPU::SI_SPILL_S32_SAVE: 969 case AMDGPU::SI_SPILL_S32_RESTORE: 970 case AMDGPU::SI_SPILL_V32_SAVE: 971 case AMDGPU::SI_SPILL_V32_RESTORE: 972 case AMDGPU::SI_SPILL_A32_SAVE: 973 case AMDGPU::SI_SPILL_A32_RESTORE: 974 return 1; 975 default: llvm_unreachable("Invalid spill opcode"); 976 } 977 } 978 979 static int getOffsetMUBUFStore(unsigned Opc) { 980 switch (Opc) { 981 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 982 return AMDGPU::BUFFER_STORE_DWORD_OFFSET; 983 case AMDGPU::BUFFER_STORE_BYTE_OFFEN: 984 return AMDGPU::BUFFER_STORE_BYTE_OFFSET; 985 case AMDGPU::BUFFER_STORE_SHORT_OFFEN: 986 return AMDGPU::BUFFER_STORE_SHORT_OFFSET; 987 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: 988 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; 989 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: 990 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; 991 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: 992 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET; 993 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN: 994 return 
AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET; 995 default: 996 return -1; 997 } 998 } 999 1000 static int getOffsetMUBUFLoad(unsigned Opc) { 1001 switch (Opc) { 1002 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 1003 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1004 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN: 1005 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET; 1006 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN: 1007 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET; 1008 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: 1009 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET; 1010 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN: 1011 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; 1012 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: 1013 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; 1014 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: 1015 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; 1016 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: 1017 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET; 1018 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN: 1019 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET; 1020 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN: 1021 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET; 1022 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: 1023 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET; 1024 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN: 1025 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET; 1026 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN: 1027 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET; 1028 default: 1029 return -1; 1030 } 1031 } 1032 1033 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, 1034 MachineBasicBlock &MBB, 1035 MachineBasicBlock::iterator MI, 1036 int Index, unsigned Lane, 1037 unsigned ValueReg, bool IsKill) { 1038 MachineFunction *MF = MBB.getParent(); 1039 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1040 const SIInstrInfo *TII = ST.getInstrInfo(); 1041 1042 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); 1043 1044 if (Reg == AMDGPU::NoRegister) 1045 return MachineInstrBuilder(); 1046 1047 bool IsStore = MI->mayStore(); 1048 MachineRegisterInfo &MRI = MF->getRegInfo(); 1049 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 1050 1051 unsigned Dst = IsStore ? Reg : ValueReg; 1052 unsigned Src = IsStore ? ValueReg : Reg; 1053 bool IsVGPR = TRI->isVGPR(MRI, Reg); 1054 DebugLoc DL = MI->getDebugLoc(); 1055 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) { 1056 // Spiller during regalloc may restore a spilled register to its superclass. 1057 // It could result in AGPR spills restored to VGPRs or the other way around, 1058 // making the src and dst with identical regclasses at this point. It just 1059 // needs a copy in such cases. 1060 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst) 1061 .addReg(Src, getKillRegState(IsKill)); 1062 CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1063 return CopyMIB; 1064 } 1065 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 1066 : AMDGPU::V_ACCVGPR_READ_B32_e64; 1067 1068 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst) 1069 .addReg(Src, getKillRegState(IsKill)); 1070 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1071 return MIB; 1072 } 1073 1074 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not 1075 // need to handle the case where an SGPR may need to be spilled while spilling. 
1076 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, 1077 MachineFrameInfo &MFI, 1078 MachineBasicBlock::iterator MI, 1079 int Index, 1080 int64_t Offset) { 1081 const SIInstrInfo *TII = ST.getInstrInfo(); 1082 MachineBasicBlock *MBB = MI->getParent(); 1083 const DebugLoc &DL = MI->getDebugLoc(); 1084 bool IsStore = MI->mayStore(); 1085 1086 unsigned Opc = MI->getOpcode(); 1087 int LoadStoreOp = IsStore ? 1088 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc); 1089 if (LoadStoreOp == -1) 1090 return false; 1091 1092 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); 1093 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr()) 1094 return true; 1095 1096 MachineInstrBuilder NewMI = 1097 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) 1098 .add(*Reg) 1099 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) 1100 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) 1101 .addImm(Offset) 1102 .addImm(0) // cpol 1103 .addImm(0) // tfe 1104 .addImm(0) // swz 1105 .cloneMemRefs(*MI); 1106 1107 const MachineOperand *VDataIn = TII->getNamedOperand(*MI, 1108 AMDGPU::OpName::vdata_in); 1109 if (VDataIn) 1110 NewMI.add(*VDataIn); 1111 return true; 1112 } 1113 1114 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, 1115 unsigned LoadStoreOp, 1116 unsigned EltSize) { 1117 bool IsStore = TII->get(LoadStoreOp).mayStore(); 1118 bool UseST = 1119 AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 && 1120 AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0; 1121 1122 switch (EltSize) { 1123 case 4: 1124 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1125 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR; 1126 break; 1127 case 8: 1128 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR 1129 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR; 1130 break; 1131 case 12: 1132 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR 1133 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR; 1134 break; 1135 case 16: 1136 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR 1137 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR; 1138 break; 1139 default: 1140 llvm_unreachable("Unexpected spill load/store size!"); 1141 } 1142 1143 if (UseST) 1144 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 1145 1146 return LoadStoreOp; 1147 } 1148 1149 void SIRegisterInfo::buildSpillLoadStore( 1150 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, 1151 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill, 1152 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO, 1153 RegScavenger *RS, LivePhysRegs *LiveRegs) const { 1154 assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both"); 1155 1156 MachineFunction *MF = MBB.getParent(); 1157 const SIInstrInfo *TII = ST.getInstrInfo(); 1158 const MachineFrameInfo &MFI = MF->getFrameInfo(); 1159 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); 1160 1161 const MCInstrDesc *Desc = &TII->get(LoadStoreOp); 1162 bool IsStore = Desc->mayStore(); 1163 bool IsFlat = TII->isFLATScratch(LoadStoreOp); 1164 1165 bool Scavenged = false; 1166 MCRegister SOffset = ScratchOffsetReg; 1167 1168 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); 1169 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores. 
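  // IsAGPR is therefore only set when the value has to be bounced through a
  // temporary VGPR (see the V_ACCVGPR_READ/WRITE copies emitted further down).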
1170 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC); 1171 const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8; 1172 1173 // Always use 4 byte operations for AGPRs because we need to scavenge 1174 // a temporary VGPR. 1175 unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u; 1176 unsigned NumSubRegs = RegWidth / EltSize; 1177 unsigned Size = NumSubRegs * EltSize; 1178 unsigned RemSize = RegWidth - Size; 1179 unsigned NumRemSubRegs = RemSize ? 1 : 0; 1180 int64_t Offset = InstOffset + MFI.getObjectOffset(Index); 1181 int64_t MaxOffset = Offset + Size + RemSize - EltSize; 1182 int64_t ScratchOffsetRegDelta = 0; 1183 1184 if (IsFlat && EltSize > 4) { 1185 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); 1186 Desc = &TII->get(LoadStoreOp); 1187 } 1188 1189 Align Alignment = MFI.getObjectAlign(Index); 1190 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); 1191 1192 assert((IsFlat || ((Offset % EltSize) == 0)) && 1193 "unexpected VGPR spill offset"); 1194 1195 bool IsOffsetLegal = 1196 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, 1197 SIInstrFlags::FlatScratch) 1198 : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset); 1199 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) { 1200 SOffset = MCRegister(); 1201 1202 // We currently only support spilling VGPRs to EltSize boundaries, meaning 1203 // we can simplify the adjustment of Offset here to just scale with 1204 // WavefrontSize. 1205 if (!IsFlat) 1206 Offset *= ST.getWavefrontSize(); 1207 1208 // We don't have access to the register scavenger if this function is called 1209 // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case. 1210 if (RS) { 1211 SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); 1212 } else if (LiveRegs) { 1213 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) { 1214 if (LiveRegs->available(MF->getRegInfo(), Reg)) { 1215 SOffset = Reg; 1216 break; 1217 } 1218 } 1219 } 1220 1221 if (!SOffset) { 1222 // There are no free SGPRs, and since we are in the process of spilling 1223 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true 1224 // on SI/CI and on VI it is true until we implement spilling using scalar 1225 // stores), we have no way to free up an SGPR. Our solution here is to 1226 // add the offset directly to the ScratchOffset or StackPtrOffset 1227 // register, and then subtract the offset after the spill to return the 1228 // register to it's original value. 
1229 if (!ScratchOffsetReg) 1230 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg(); 1231 SOffset = ScratchOffsetReg; 1232 ScratchOffsetRegDelta = Offset; 1233 } else { 1234 Scavenged = true; 1235 } 1236 1237 if (!SOffset) 1238 report_fatal_error("could not scavenge SGPR to spill in entry function"); 1239 1240 if (ScratchOffsetReg == AMDGPU::NoRegister) { 1241 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset); 1242 } else { 1243 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1244 .addReg(ScratchOffsetReg) 1245 .addImm(Offset); 1246 } 1247 1248 Offset = 0; 1249 } 1250 1251 if (IsFlat && SOffset == AMDGPU::NoRegister) { 1252 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 1253 && "Unexpected vaddr for flat scratch with a FI operand"); 1254 1255 assert(ST.hasFlatScratchSTMode()); 1256 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 1257 Desc = &TII->get(LoadStoreOp); 1258 } 1259 1260 Register TmpReg; 1261 1262 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e; 1263 ++i, RegOffset += EltSize) { 1264 if (i == NumSubRegs) { 1265 EltSize = RemSize; 1266 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); 1267 } 1268 Desc = &TII->get(LoadStoreOp); 1269 1270 unsigned NumRegs = EltSize / 4; 1271 Register SubReg = e == 1 1272 ? ValueReg 1273 : Register(getSubReg(ValueReg, 1274 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1275 1276 unsigned SOffsetRegState = 0; 1277 unsigned SrcDstRegState = getDefRegState(!IsStore); 1278 if (i + 1 == e) { 1279 SOffsetRegState |= getKillRegState(Scavenged); 1280 // The last implicit use carries the "Kill" flag. 1281 SrcDstRegState |= getKillRegState(IsKill); 1282 } 1283 1284 // Make sure the whole register is defined if there are undef components by 1285 // adding an implicit def of the super-reg on the first instruction. 1286 bool NeedSuperRegDef = e > 1 && IsStore && i == 0; 1287 bool NeedSuperRegImpOperand = e > 1; 1288 1289 // Remaining element size to spill into memory after some parts of it 1290 // spilled into either AGPRs or VGPRs. 1291 unsigned RemEltSize = EltSize; 1292 1293 // AGPRs to spill VGPRs and vice versa are allocated in a reverse order, 1294 // starting from the last lane. In case if a register cannot be completely 1295 // spilled into another register that will ensure its alignment does not 1296 // change. For targets with VGPR alignment requirement this is important 1297 // in case of flat scratch usage as we might get a scratch_load or 1298 // scratch_store of an unaligned register otherwise. 1299 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS, 1300 LaneE = RegOffset / 4; 1301 Lane >= LaneE; --Lane) { 1302 bool IsSubReg = e > 1 || EltSize > 4; 1303 Register Sub = IsSubReg 1304 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane))) 1305 : ValueReg; 1306 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill); 1307 if (!MIB.getInstr()) 1308 break; 1309 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && !i)) { 1310 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1311 NeedSuperRegDef = false; 1312 } 1313 if (IsSubReg || NeedSuperRegImpOperand) { 1314 NeedSuperRegImpOperand = true; 1315 unsigned State = SrcDstRegState; 1316 if (Lane != LaneE) 1317 State &= ~RegState::Kill; 1318 MIB.addReg(ValueReg, RegState::Implicit | State); 1319 } 1320 RemEltSize -= 4; 1321 } 1322 1323 if (!RemEltSize) // Fully spilled into AGPRs. 
1324 continue; 1325 1326 if (RemEltSize != EltSize) { // Partially spilled to AGPRs 1327 assert(IsFlat && EltSize > 4); 1328 1329 unsigned NumRegs = RemEltSize / 4; 1330 SubReg = Register(getSubReg(ValueReg, 1331 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1332 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize); 1333 Desc = &TII->get(Opc); 1334 } 1335 1336 unsigned FinalReg = SubReg; 1337 1338 if (IsAGPR) { 1339 assert(EltSize == 4); 1340 1341 if (!TmpReg) { 1342 assert(RS && "Needs to have RegScavenger to spill an AGPR!"); 1343 // FIXME: change to scavengeRegisterBackwards() 1344 TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1345 RS->setRegUsed(TmpReg); 1346 } 1347 if (IsStore) { 1348 auto AccRead = BuildMI(MBB, MI, DL, 1349 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg) 1350 .addReg(SubReg, getKillRegState(IsKill)); 1351 if (NeedSuperRegDef) 1352 AccRead.addReg(ValueReg, RegState::ImplicitDefine); 1353 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1354 } 1355 SubReg = TmpReg; 1356 } 1357 1358 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); 1359 MachineMemOperand *NewMMO = 1360 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, 1361 commonAlignment(Alignment, RegOffset)); 1362 1363 auto MIB = 1364 BuildMI(MBB, MI, DL, *Desc) 1365 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)); 1366 if (!IsFlat) 1367 MIB.addReg(FuncInfo->getScratchRSrcReg()); 1368 1369 if (SOffset == AMDGPU::NoRegister) { 1370 if (!IsFlat) 1371 MIB.addImm(0); 1372 } else { 1373 MIB.addReg(SOffset, SOffsetRegState); 1374 } 1375 MIB.addImm(Offset + RegOffset) 1376 .addImm(0); // cpol 1377 if (!IsFlat) 1378 MIB.addImm(0) // tfe 1379 .addImm(0); // swz 1380 MIB.addMemOperand(NewMMO); 1381 1382 if (!IsAGPR && NeedSuperRegDef) 1383 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1384 1385 if (!IsStore && TmpReg != AMDGPU::NoRegister) { 1386 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), 1387 FinalReg) 1388 .addReg(TmpReg, RegState::Kill); 1389 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1390 } 1391 1392 if (NeedSuperRegImpOperand) 1393 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 1394 } 1395 1396 if (ScratchOffsetRegDelta != 0) { 1397 // Subtract the offset we added to the ScratchOffset register. 1398 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1399 .addReg(SOffset) 1400 .addImm(-ScratchOffsetRegDelta); 1401 } 1402 } 1403 1404 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, 1405 int Offset, bool IsLoad, 1406 bool IsKill) const { 1407 // Load/store VGPR 1408 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo(); 1409 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); 1410 1411 Register FrameReg = 1412 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF) 1413 ? getBaseRegister() 1414 : getFrameRegister(SB.MF); 1415 1416 Align Alignment = FrameInfo.getObjectAlign(Index); 1417 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index); 1418 MachineMemOperand *MMO = SB.MF.getMachineMemOperand( 1419 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, 1420 SB.EltSize, Alignment); 1421 1422 if (IsLoad) { 1423 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1424 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1425 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false, 1426 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1427 } else { 1428 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1429 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1430 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill, 1431 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1432 // This only ever adds one VGPR spill 1433 SB.MFI.addToSpilledVGPRs(1); 1434 } 1435 } 1436 1437 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, 1438 int Index, 1439 RegScavenger *RS, 1440 LiveIntervals *LIS, 1441 bool OnlyToVGPR) const { 1442 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1443 1444 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = 1445 SB.MFI.getSGPRToVGPRSpills(Index); 1446 bool SpillToVGPR = !VGPRSpills.empty(); 1447 if (OnlyToVGPR && !SpillToVGPR) 1448 return false; 1449 1450 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() && 1451 SB.SuperReg != SB.MFI.getFrameOffsetReg())); 1452 1453 if (SpillToVGPR) { 1454 1455 assert(SB.NumSubRegs == VGPRSpills.size() && 1456 "Num of VGPR lanes should be equal to num of SGPRs spilled"); 1457 1458 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1459 Register SubReg = 1460 SB.NumSubRegs == 1 1461 ? SB.SuperReg 1462 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1463 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 1464 1465 bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1; 1466 1467 // Mark the "old value of vgpr" input undef only if this is the first sgpr 1468 // spill to this specific vgpr in the first basic block. 1469 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1470 SB.TII.get(AMDGPU::V_WRITELANE_B32), Spill.VGPR) 1471 .addReg(SubReg, getKillRegState(UseKill)) 1472 .addImm(Spill.Lane) 1473 .addReg(Spill.VGPR); 1474 if (LIS) { 1475 if (i == 0) 1476 LIS->ReplaceMachineInstrInMaps(*MI, *MIB); 1477 else 1478 LIS->InsertMachineInstrInMaps(*MIB); 1479 } 1480 1481 if (i == 0 && SB.NumSubRegs > 1) { 1482 // We may be spilling a super-register which is only partially defined, 1483 // and need to ensure later spills think the value is defined. 1484 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1485 } 1486 1487 if (SB.NumSubRegs > 1) 1488 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit); 1489 1490 // FIXME: Since this spills to another register instead of an actual 1491 // frame index, we should delete the frame index when all references to 1492 // it are fixed. 1493 } 1494 } else { 1495 SB.prepare(); 1496 1497 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. 1498 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1499 1500 // Per VGPR helper data 1501 auto PVD = SB.getPerVGPRData(); 1502 1503 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1504 unsigned TmpVGPRFlags = RegState::Undef; 1505 1506 // Write sub registers into the VGPR 1507 for (unsigned i = Offset * PVD.PerVGPR, 1508 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1509 i < e; ++i) { 1510 Register SubReg = 1511 SB.NumSubRegs == 1 1512 ? 
SB.SuperReg 1513 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1514 1515 MachineInstrBuilder WriteLane = 1516 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1517 SB.TmpVGPR) 1518 .addReg(SubReg, SubKillState) 1519 .addImm(i % PVD.PerVGPR) 1520 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1521 TmpVGPRFlags = 0; 1522 1523 if (LIS) { 1524 if (i == 0) 1525 LIS->ReplaceMachineInstrInMaps(*MI, *WriteLane); 1526 else 1527 LIS->InsertMachineInstrInMaps(*WriteLane); 1528 } 1529 1530 // There could be undef components of a spilled super register. 1531 // TODO: Can we detect this and skip the spill? 1532 if (SB.NumSubRegs > 1) { 1533 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1534 unsigned SuperKillState = 0; 1535 if (i + 1 == SB.NumSubRegs) 1536 SuperKillState |= getKillRegState(SB.IsKill); 1537 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1538 } 1539 } 1540 1541 // Write out VGPR 1542 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false); 1543 } 1544 1545 SB.restore(); 1546 } 1547 1548 MI->eraseFromParent(); 1549 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1550 1551 if (LIS) 1552 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1553 1554 return true; 1555 } 1556 1557 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, 1558 int Index, 1559 RegScavenger *RS, 1560 LiveIntervals *LIS, 1561 bool OnlyToVGPR) const { 1562 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1563 1564 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = 1565 SB.MFI.getSGPRToVGPRSpills(Index); 1566 bool SpillToVGPR = !VGPRSpills.empty(); 1567 if (OnlyToVGPR && !SpillToVGPR) 1568 return false; 1569 1570 if (SpillToVGPR) { 1571 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1572 Register SubReg = 1573 SB.NumSubRegs == 1 1574 ? SB.SuperReg 1575 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1576 1577 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 1578 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 1579 SubReg) 1580 .addReg(Spill.VGPR) 1581 .addImm(Spill.Lane); 1582 if (SB.NumSubRegs > 1 && i == 0) 1583 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1584 if (LIS) { 1585 if (i == e - 1) 1586 LIS->ReplaceMachineInstrInMaps(*MI, *MIB); 1587 else 1588 LIS->InsertMachineInstrInMaps(*MIB); 1589 } 1590 1591 } 1592 } else { 1593 SB.prepare(); 1594 1595 // Per VGPR helper data 1596 auto PVD = SB.getPerVGPRData(); 1597 1598 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1599 // Load in VGPR data 1600 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true); 1601 1602 // Unpack lanes 1603 for (unsigned i = Offset * PVD.PerVGPR, 1604 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1605 i < e; ++i) { 1606 Register SubReg = 1607 SB.NumSubRegs == 1 1608 ? 
    for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
      unsigned TmpVGPRFlags = RegState::Undef;

      // Write sub registers into the VGPR
      for (unsigned i = Offset * PVD.PerVGPR,
                    e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
           i < e; ++i) {
        Register SubReg =
            SB.NumSubRegs == 1
                ? SB.SuperReg
                : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

        MachineInstrBuilder WriteLane =
            BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
                    SB.TmpVGPR)
                .addReg(SubReg, SubKillState)
                .addImm(i % PVD.PerVGPR)
                .addReg(SB.TmpVGPR, TmpVGPRFlags);
        TmpVGPRFlags = 0;

        if (LIS) {
          if (i == 0)
            LIS->ReplaceMachineInstrInMaps(*MI, *WriteLane);
          else
            LIS->InsertMachineInstrInMaps(*WriteLane);
        }

        // There could be undef components of a spilled super register.
        // TODO: Can we detect this and skip the spill?
        if (SB.NumSubRegs > 1) {
          // The last implicit use of the SB.SuperReg carries the "Kill" flag.
          unsigned SuperKillState = 0;
          if (i + 1 == SB.NumSubRegs)
            SuperKillState |= getKillRegState(SB.IsKill);
          WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
        }
      }

      // Write out VGPR
      SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
    }

    SB.restore();
  }

  MI->eraseFromParent();
  SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);

  if (LIS)
    LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);

  return true;
}

bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS,
                                 LiveIntervals *LIS,
                                 bool OnlyToVGPR) const {
  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
      SB.MFI.getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  if (SpillToVGPR) {
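    // Restoring from VGPR lanes is one lane read per 32-bit piece, e.g.
    // (illustrative registers):
    //   v_readlane_b32 s4, v63, 0
    //   v_readlane_b32 s5, v63, 1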
    for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
      auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
                         SubReg)
                     .addReg(Spill.VGPR)
                     .addImm(Spill.Lane);
      if (SB.NumSubRegs > 1 && i == 0)
        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
      if (LIS) {
        if (i == e - 1)
          LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
        else
          LIS->InsertMachineInstrInMaps(*MIB);
      }
    }
  } else {
    SB.prepare();

    // Per VGPR helper data
    auto PVD = SB.getPerVGPRData();

    for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
      // Load in VGPR data
      SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);

      // Unpack lanes
      for (unsigned i = Offset * PVD.PerVGPR,
                    e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
           i < e; ++i) {
        Register SubReg =
            SB.NumSubRegs == 1
                ? SB.SuperReg
                : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

        bool LastSubReg = (i + 1 == e);
        auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
                           SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
                       .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
                       .addImm(i);
        if (SB.NumSubRegs > 1 && i == 0)
          MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
        if (LIS) {
          if (i == e - 1)
            LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
          else
            LIS->InsertMachineInstrInMaps(*MIB);
        }
      }
    }

    SB.restore();
  }

  MI->eraseFromParent();

  if (LIS)
    LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);

  return true;
}

bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
                                        MachineBasicBlock &RestoreMBB,
                                        Register SGPR, RegScavenger *RS) const {
  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
                      RS);
  SB.prepare();
  // Generate the spill of SGPR to SB.TmpVGPR.
  unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
  auto PVD = SB.getPerVGPRData();
  for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
    unsigned TmpVGPRFlags = RegState::Undef;
    // Write sub registers into the VGPR
    for (unsigned i = Offset * PVD.PerVGPR,
                  e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
         i < e; ++i) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

      MachineInstrBuilder WriteLane =
          BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
                  SB.TmpVGPR)
              .addReg(SubReg, SubKillState)
              .addImm(i % PVD.PerVGPR)
              .addReg(SB.TmpVGPR, TmpVGPRFlags);
      TmpVGPRFlags = 0;
      // There could be undef components of a spilled super register.
      // TODO: Can we detect this and skip the spill?
      if (SB.NumSubRegs > 1) {
        // The last implicit use of the SB.SuperReg carries the "Kill" flag.
        unsigned SuperKillState = 0;
        if (i + 1 == SB.NumSubRegs)
          SuperKillState |= getKillRegState(SB.IsKill);
        WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
      }
    }
    // Don't need to write VGPR out.
  }

  // Restore clobbered registers in the specified restore block.
  MI = RestoreMBB.end();
  SB.setMI(&RestoreMBB, MI);
  // Generate the restore of SGPR from SB.TmpVGPR.
  for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
    // Don't need to load VGPR in.
    // Unpack lanes
    for (unsigned i = Offset * PVD.PerVGPR,
                  e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
         i < e; ++i) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
      bool LastSubReg = (i + 1 == e);
      auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
                         SubReg)
                     .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
                     .addImm(i);
      if (SB.NumSubRegs > 1 && i == 0)
        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
    }
  }
  SB.restore();

  SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
  return false;
}

/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
/// a VGPR and the stack slot can be safely eliminated when all other users are
/// handled.
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
    MachineBasicBlock::iterator MI,
    int FI,
    RegScavenger *RS,
    LiveIntervals *LIS) const {
  switch (MI->getOpcode()) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE:
    return spillSGPR(MI, FI, RS, LIS, true);
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
    return restoreSGPR(MI, FI, RS, LIS, true);
  default:
    llvm_unreachable("not an SGPR spill instruction");
  }
}

void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                         int SPAdj, unsigned FIOperandNum,
                                         RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
                          ? getBaseRegister()
                          : getFrameRegister(*MF);

  switch (MI->getOpcode()) {
  // SGPR register spill
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE: {
    spillSGPR(MI, Index, RS);
    break;
  }

  // SGPR register restore
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE: {
    restoreSGPR(MI, Index, RS);
    break;
  }

  // VGPR register spill
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V224_SAVE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A224_SAVE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A32_SAVE: {
    const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata);
    assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
           MFI->getStackPtrOffsetReg());

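    // A single-dword spill becomes one scratch/buffer store addressed from the
    // frame register, e.g. roughly (operands are illustrative):
    //   buffer_store_dword v5, off, s[0:3], s32 offset:16
    // Wider tuples are split into per-dword accesses by buildSpillLoadStore.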
    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                          : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
    auto *MBB = MI->getParent();
    buildSpillLoadStore(
        *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
        TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
        *MI->memoperands_begin(), RS);
    MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_V224_RESTORE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
  case AMDGPU::SI_SPILL_A224_RESTORE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_A1024_RESTORE: {
    const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata);
    assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
           MFI->getStackPtrOffsetReg());

    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                          : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
    auto *MBB = MI->getParent();
    buildSpillLoadStore(
        *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
        TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
        *MI->memoperands_begin(), RS);
    MI->eraseFromParent();
    break;
  }

  default: {
    // Other access to frame index
    const DebugLoc &DL = MI->getDebugLoc();

    int64_t Offset = FrameInfo.getObjectOffset(Index);
    if (ST.enableFlatScratch()) {
      if (TII->isFLATScratch(*MI)) {
        assert((int16_t)FIOperandNum ==
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::saddr));

        // The offset is always swizzled, just replace it
        if (FrameReg)
          FIOp.ChangeToRegister(FrameReg, false);

        if (!Offset)
          return;

        MachineOperand *OffsetOp =
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
        int64_t NewOffset = Offset + OffsetOp->getImm();
        if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                   SIInstrFlags::FlatScratch)) {
          OffsetOp->setImm(NewOffset);
          if (FrameReg)
            return;
          Offset = 0;
        }

        assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) &&
               "Unexpected vaddr for flat scratch with a FI operand");

        // On GFX10 we have ST mode to use no registers for an address.
        // Otherwise we need to materialize 0 into an SGPR.
        if (!Offset && ST.hasFlatScratchSTMode()) {
          unsigned Opc = MI->getOpcode();
          unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
          MI->RemoveOperand(
              AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
          MI->setDesc(TII->get(NewOpc));
          return;
        }
      }

      if (!FrameReg) {
        FIOp.ChangeToImmediate(Offset);
        if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
          return;
      }

      // We need to use a register here. Check if we can use an SGPR or need
      // a VGPR.
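      // M0 below is only a placeholder used to query operand legality; it is
      // replaced with a scavenged register or the frame register afterwards.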
      FIOp.ChangeToRegister(AMDGPU::M0, false);
      bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);

      if (!Offset && FrameReg && UseSGPR) {
        FIOp.setReg(FrameReg);
        return;
      }

      const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
                                              : &AMDGPU::VGPR_32RegClass;

      Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
      FIOp.setReg(TmpReg);
      FIOp.setIsKill(true);

      if ((!FrameReg || !Offset) && TmpReg) {
        unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
        auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
        if (FrameReg)
          MIB.addReg(FrameReg);
        else
          MIB.addImm(Offset);

        return;
      }

      Register TmpSReg =
          UseSGPR ? TmpReg
                  : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
                                         !UseSGPR);

      // TODO: for flat scratch another attempt can be made with a VGPR index
      //       if no SGPRs can be scavenged.
      if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
        report_fatal_error("Cannot scavenge register in FI elimination!");

      if (!TmpSReg) {
        // Use frame register and restore it after.
        TmpSReg = FrameReg;
        FIOp.setReg(FrameReg);
        FIOp.setIsKill(false);
      }

      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
          .addReg(FrameReg)
          .addImm(Offset);

      if (!UseSGPR)
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
            .addReg(TmpSReg, RegState::Kill);

      if (TmpSReg == FrameReg) {
        // Undo frame register modification.
        BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
                FrameReg)
            .addReg(FrameReg)
            .addImm(-Offset);
      }

      return;
    }

    bool IsMUBUF = TII->isMUBUF(*MI);

    if (!IsMUBUF && !MFI->isEntryFunction()) {
      // Convert to a swizzled stack address by scaling by the wave size.
      //
      // In an entry function/kernel the offset is already swizzled.

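      // The lane-private (swizzled) address built below is effectively
      //   ResultReg = (FrameReg >> log2(wave size)) + Offset.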
      bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
      Register ResultReg =
          IsCopy ? MI->getOperand(0).getReg()
                 : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      if (Offset == 0) {
        // XXX - This never happens because of emergency scavenging slot at 0?
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
            .addImm(ST.getWavefrontSizeLog2())
            .addReg(FrameReg);
      } else {
        if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
          // Reuse ResultReg in intermediate step.
          Register ScaledReg = ResultReg;

          BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
                  ScaledReg)
              .addImm(ST.getWavefrontSizeLog2())
              .addReg(FrameReg);

          const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;

          // TODO: Fold if use instruction is another add of a constant.
          if (IsVOP2 ||
              AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
            // FIXME: This can fail
            MIB.addImm(Offset);
            MIB.addReg(ScaledReg, RegState::Kill);
            if (!IsVOP2)
              MIB.addImm(0); // clamp bit
          } else {
            assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
                   "Need to reuse carry out register");

            // Use scavenged unused carry out as offset register.
            Register ConstOffsetReg;
            if (!isWave32)
              ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
            else
              ConstOffsetReg = MIB.getReg(1);

            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
                .addImm(Offset);
            MIB.addReg(ConstOffsetReg, RegState::Kill);
            MIB.addReg(ScaledReg, RegState::Kill);
            MIB.addImm(0); // clamp bit
          }
        } else {
          // We have to produce a carry out, and there isn't a free SGPR pair
          // for it. We can keep the whole computation on the SALU to avoid
          // clobbering an additional register at the cost of an extra mov.

          // We may have 1 free scratch SGPR even though a carry out is
          // unavailable. Only one additional mov is needed.
          Register TmpScaledReg =
              RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
          Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;

          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
              .addReg(FrameReg)
              .addImm(ST.getWavefrontSizeLog2());
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
              .addReg(ScaledReg, RegState::Kill)
              .addImm(Offset);
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
              .addReg(ScaledReg, RegState::Kill);

          // If there were truly no free SGPRs, we need to undo everything.
          if (!TmpScaledReg.isValid()) {
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
                .addReg(ScaledReg, RegState::Kill)
                .addImm(-Offset);
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
                .addReg(FrameReg)
                .addImm(ST.getWavefrontSizeLog2());
          }
        }
      }

      // Don't introduce an extra copy if we're just materializing in a mov.
      if (IsCopy)
        MI->eraseFromParent();
      else
        FIOp.ChangeToRegister(ResultReg, false, false, true);
      return;
    }

    if (IsMUBUF) {
      // Disable offen so we don't need a 0 vgpr base.
      assert(static_cast<int>(FIOperandNum) ==
             AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                        AMDGPU::OpName::vaddr));

      auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
      assert((SOffset.isImm() && SOffset.getImm() == 0));

      if (FrameReg != AMDGPU::NoRegister)
        SOffset.ChangeToRegister(FrameReg, false);

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      int64_t OldImm
        = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
      int64_t NewOffset = OldImm + Offset;

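      // e.g. a stack object at offset 16 accessed with an existing immediate
      // offset of 8 folds to offset:24 below, provided the sum still fits the
      // MUBUF immediate offset field.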
      if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
          buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
        MI->eraseFromParent();
        return;
      }
    }

    // If the offset is simply too big, don't convert to a scratch wave offset
    // relative index.

    FIOp.ChangeToImmediate(Offset);
    if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
      Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
      FIOp.ChangeToRegister(TmpReg, false, false, true);
    }
  }
  }
}

StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
  return AMDGPUInstPrinter::getRegisterName(Reg);
}

static const TargetRegisterClass *
getAnyVGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::VReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::VReg_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::VReg_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::VReg_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::VReg_192RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::VReg_224RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::VReg_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::VReg_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::VReg_1024RegClass;

  return nullptr;
}

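// Some subtargets (e.g. gfx90a) require 64-bit and wider VGPR tuples to start
// at an even register; the *_Align2 classes used below encode that constraint.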
static const TargetRegisterClass *
getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::VReg_64_Align2RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::VReg_96_Align2RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::VReg_128_Align2RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::VReg_160_Align2RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::VReg_192_Align2RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::VReg_224_Align2RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::VReg_256_Align2RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::VReg_512_Align2RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::VReg_1024_Align2RegClass;

  return nullptr;
}

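// For example, a 96-bit value maps to VReg_96, or to VReg_96_Align2 on
// subtargets that need aligned VGPR tuples.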
const TargetRegisterClass *
SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth == 1)
    return &AMDGPU::VReg_1RegClass;
  if (BitWidth <= 16)
    return &AMDGPU::VGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::VGPR_32RegClass;
  return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
                                : getAnyVGPRClassForBitWidth(BitWidth);
}

static const TargetRegisterClass *
getAnyAGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AReg_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AReg_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AReg_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AReg_192RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::AReg_224RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AReg_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AReg_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AReg_1024RegClass;

  return nullptr;
}

static const TargetRegisterClass *
getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AReg_64_Align2RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AReg_96_Align2RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AReg_128_Align2RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AReg_160_Align2RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AReg_192_Align2RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::AReg_224_Align2RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AReg_256_Align2RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AReg_512_Align2RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AReg_1024_Align2RegClass;

  return nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth <= 16)
    return &AMDGPU::AGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::AGPR_32RegClass;
  return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
                                : getAnyAGPRClassForBitWidth(BitWidth);
}

static const TargetRegisterClass *
getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AV_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AV_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AV_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AV_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AV_192RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::AV_224RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AV_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AV_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AV_1024RegClass;

  return nullptr;
}

static const TargetRegisterClass *
getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AV_64_Align2RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AV_96_Align2RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AV_128_Align2RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AV_160_Align2RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AV_192_Align2RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::AV_224_Align2RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AV_256_Align2RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AV_512_Align2RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AV_1024_Align2RegClass;

  return nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth <= 16)
    return &AMDGPU::VGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::AV_32RegClass;
  return ST.needsAlignedVGPRs()
             ? getAlignedVectorSuperClassForBitWidth(BitWidth)
             : getAnyVectorSuperClassForBitWidth(BitWidth);
}

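// SGPR classes only exist for the widths handled below; e.g. a 40-bit value is
// rounded up to SReg_64, and widths over 1024 bits have no SGPR class.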
const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 16)
    return &AMDGPU::SGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::SReg_32RegClass;
  if (BitWidth <= 64)
    return &AMDGPU::SReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::SGPR_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::SGPR_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::SGPR_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::SGPR_192RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::SGPR_224RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::SGPR_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::SGPR_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::SGPR_1024RegClass;

  return nullptr;
}

// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
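// e.g. getPhysRegClass(AMDGPU::SGPR4_SGPR5) returns SReg_64RegClass, the first
// base class in the list below that contains the register.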
const TargetRegisterClass *
SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
  static const TargetRegisterClass *const BaseClasses[] = {
    &AMDGPU::VGPR_LO16RegClass,
    &AMDGPU::VGPR_HI16RegClass,
    &AMDGPU::SReg_LO16RegClass,
    &AMDGPU::AGPR_LO16RegClass,
    &AMDGPU::VGPR_32RegClass,
    &AMDGPU::SReg_32RegClass,
    &AMDGPU::AGPR_32RegClass,
    &AMDGPU::VReg_64_Align2RegClass,
    &AMDGPU::VReg_64RegClass,
    &AMDGPU::SReg_64RegClass,
    &AMDGPU::AReg_64_Align2RegClass,
    &AMDGPU::AReg_64RegClass,
    &AMDGPU::VReg_96_Align2RegClass,
    &AMDGPU::VReg_96RegClass,
    &AMDGPU::SReg_96RegClass,
    &AMDGPU::AReg_96_Align2RegClass,
    &AMDGPU::AReg_96RegClass,
    &AMDGPU::VReg_128_Align2RegClass,
    &AMDGPU::VReg_128RegClass,
    &AMDGPU::SReg_128RegClass,
    &AMDGPU::AReg_128_Align2RegClass,
    &AMDGPU::AReg_128RegClass,
    &AMDGPU::VReg_160_Align2RegClass,
    &AMDGPU::VReg_160RegClass,
    &AMDGPU::SReg_160RegClass,
    &AMDGPU::AReg_160_Align2RegClass,
    &AMDGPU::AReg_160RegClass,
    &AMDGPU::VReg_192_Align2RegClass,
    &AMDGPU::VReg_192RegClass,
    &AMDGPU::SReg_192RegClass,
    &AMDGPU::AReg_192_Align2RegClass,
    &AMDGPU::AReg_192RegClass,
    &AMDGPU::VReg_224_Align2RegClass,
    &AMDGPU::VReg_224RegClass,
    &AMDGPU::SReg_224RegClass,
    &AMDGPU::AReg_224_Align2RegClass,
    &AMDGPU::AReg_224RegClass,
    &AMDGPU::VReg_256_Align2RegClass,
    &AMDGPU::VReg_256RegClass,
    &AMDGPU::SReg_256RegClass,
    &AMDGPU::AReg_256_Align2RegClass,
    &AMDGPU::AReg_256RegClass,
    &AMDGPU::VReg_512_Align2RegClass,
    &AMDGPU::VReg_512RegClass,
    &AMDGPU::SReg_512RegClass,
    &AMDGPU::AReg_512_Align2RegClass,
    &AMDGPU::AReg_512RegClass,
    &AMDGPU::SReg_1024RegClass,
    &AMDGPU::VReg_1024_Align2RegClass,
    &AMDGPU::VReg_1024RegClass,
    &AMDGPU::AReg_1024_Align2RegClass,
    &AMDGPU::AReg_1024RegClass,
    &AMDGPU::SCC_CLASSRegClass,
    &AMDGPU::Pseudo_SReg_32RegClass,
    &AMDGPU::Pseudo_SReg_128RegClass,
  };

  for (const TargetRegisterClass *BaseClass : BaseClasses) {
    if (BaseClass->contains(Reg)) {
      return BaseClass;
    }
  }
  return nullptr;
}

bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
                               Register Reg) const {
  const TargetRegisterClass *RC;
  if (Reg.isVirtual())
    RC = MRI.getRegClass(Reg);
  else
    RC = getPhysRegClass(Reg);
  return isSGPRClass(RC);
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
  assert(VRC && "Invalid register class size");
  return VRC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
  assert(ARC && "Invalid register class size");
  return ARC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
  unsigned Size = getRegSizeInBits(*VRC);
  if (Size == 32)
    return &AMDGPU::SGPR_32RegClass;
  const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
  assert(SRC && "Invalid register class size");
  return SRC;
}

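// e.g. for the two-channel sub-register index sub0_sub1 of a 128-bit VGPR
// tuple, getSubRegClass returns a 64-bit VGPR class.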
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
    const TargetRegisterClass *RC, unsigned SubIdx) const {
  if (SubIdx == AMDGPU::NoSubRegister)
    return RC;

  // We can assume that each lane corresponds to one 32-bit register.
  unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32;
  if (isAGPRClass(RC)) {
    RC = getAGPRClassForBitWidth(Size);
  } else if (isVGPRClass(RC)) {
    RC = getVGPRClassForBitWidth(Size);
  } else if (isVectorSuperClass(RC)) {
    RC = getVectorSuperClassForBitWidth(Size);
  } else {
    RC = getSGPRClassForBitWidth(Size);
  }
  assert(RC && "Invalid sub-register class size");
  return RC;
}

const TargetRegisterClass *
SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
                                         const TargetRegisterClass *SubRC,
                                         unsigned SubIdx) const {
  // Ensure this subregister index is aligned in the super register.
  const TargetRegisterClass *MatchRC =
      getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
  return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
}

bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
  if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
      OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
    return !ST.hasMFMAInlineLiteralBug();

  return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
         OpType <= AMDGPU::OPERAND_SRC_LAST;
}

bool SIRegisterInfo::shouldRewriteCopySrc(
  const TargetRegisterClass *DefRC,
  unsigned DefSubReg,
  const TargetRegisterClass *SrcRC,
  unsigned SrcSubReg) const {
  // We want to prefer the smallest register class possible, so we don't want to
  // stop and rewrite on anything that looks like a subregister
  // extract. Operations mostly don't care about the super register class, so we
  // only want to stop on the most basic of copies between the same register
  // class.
  //
  // e.g. if we have something like
  // %0 = ...
  // %1 = ...
  // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
  // %3 = COPY %2, sub0
  //
  // We want to look through the COPY to find:
  // => %3 = COPY %0

  // Plain copy.
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
}

bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
  // TODO: 64-bit operands have extending behavior from 32-bit literal.
  return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
         OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
}

/// Returns the lowest register that is not used at any point in the function.
/// If all registers are used, then this function will return
/// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return the
/// highest unused register.
MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
                                              const TargetRegisterClass *RC,
                                              const MachineFunction &MF,
                                              bool ReserveHighestVGPR) const {
  if (ReserveHighestVGPR) {
    for (MCRegister Reg : reverse(*RC))
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  } else {
    for (MCRegister Reg : *RC)
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  }
  return MCRegister();
}

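// e.g. splitting a 128-bit register class with EltSize == 4 yields the four
// 32-bit sub-register indices sub0, sub1, sub2 and sub3.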
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                   unsigned EltSize) const {
  const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
  assert(RegBitWidth >= 32 && RegBitWidth <= 1024);

  const unsigned RegDWORDs = RegBitWidth / 32;
  const unsigned EltDWORDs = EltSize / 4;
  assert(RegSplitParts.size() + 1 >= EltDWORDs);

  const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
  const unsigned NumParts = RegDWORDs / EltDWORDs;

  return makeArrayRef(Parts.data(), NumParts);
}

const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
                                  Register Reg) const {
  return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
}

bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && isVGPRClass(RC);
}

bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);

  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && isAGPRClass(RC);
}

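// e.g. folding a 32-bit copy into a wider tuple is always allowed below, while
// merging two 64-bit registers into a fresh 128-bit tuple is rejected because
// the result would be wider than both inputs.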
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
                                    const TargetRegisterClass *SrcRC,
                                    unsigned SubReg,
                                    const TargetRegisterClass *DstRC,
                                    unsigned DstSubReg,
                                    const TargetRegisterClass *NewRC,
                                    LiveIntervals &LIS) const {
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
  unsigned DstSize = getRegSizeInBits(*DstRC);
  unsigned NewSize = getRegSizeInBits(*NewRC);

  // Do not increase size of registers beyond dword, we would need to allocate
  // adjacent registers and constrain regalloc more than needed.

  // Always allow dword coalescing.
  if (SrcSize <= 32 || DstSize <= 32)
    return true;

  return NewSize <= DstSize || NewSize <= SrcSize;
}

unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                             MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
                                                       MF.getFunction());
  switch (RC->getID()) {
  default:
    return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
  case AMDGPU::VGPR_32RegClassID:
  case AMDGPU::VGPR_LO16RegClassID:
  case AMDGPU::VGPR_HI16RegClassID:
    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
  case AMDGPU::SGPR_32RegClassID:
  case AMDGPU::SGPR_LO16RegClassID:
    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
  }
}

unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                unsigned Idx) const {
  if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
      Idx == AMDGPU::RegisterPressureSets::AGPR_32)
    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  llvm_unreachable("Unexpected register pressure set!");
}

const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
  static const int Empty[] = { -1 };

  if (RegPressureIgnoredUnits[RegUnit])
    return Empty;

  return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}

MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
  // Not a callee saved register.
  return AMDGPU::SGPR30_SGPR31;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
                                         const RegisterBank &RB,
                                         const MachineRegisterInfo &MRI) const {
  switch (RB.getID()) {
  case AMDGPU::VGPRRegBankID:
    return getVGPRClassForBitWidth(std::max(32u, Size));
  case AMDGPU::VCCRegBankID:
    assert(Size == 1);
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case AMDGPU::SGPRRegBankID:
    return getSGPRClassForBitWidth(std::max(32u, Size));
  case AMDGPU::AGPRRegBankID:
    return getAGPRClassForBitWidth(std::max(32u, Size));
  default:
    llvm_unreachable("unknown register bank");
  }
}

const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                         const MachineRegisterInfo &MRI) const {
  const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
  if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);

  if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>())
    return getAllocatableClass(RC);

  return nullptr;
}

MCRegister SIRegisterInfo::getVCC() const {
  return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}

const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
  // VGPR tuples have an alignment requirement on gfx90a variants.
  return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
                                : &AMDGPU::VReg_64RegClass;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClass(unsigned RCID) const {
  switch ((int)RCID) {
  case AMDGPU::SReg_1RegClassID:
    return getBoolRC();
  case AMDGPU::SReg_1_XEXECRegClassID:
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case -1:
    return nullptr;
  default:
    return AMDGPUGenRegisterInfo::getRegClass(RCID);
  }
}

// Find reaching register definition
MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
                                              MachineInstr &Use,
                                              MachineRegisterInfo &MRI,
                                              LiveIntervals *LIS) const {
  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
  SlotIndex DefIdx;

  if (Reg.isVirtual()) {
    if (!LIS->hasInterval(Reg))
      return nullptr;
    LiveInterval &LI = LIS->getInterval(Reg);
    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
                                  : MRI.getMaxLaneMaskForVReg(Reg);
    VNInfo *V = nullptr;
    if (LI.hasSubRanges()) {
      for (auto &S : LI.subranges()) {
        if ((S.LaneMask & SubLanes) == SubLanes) {
          V = S.getVNInfoAt(UseIdx);
          break;
        }
      }
    } else {
      V = LI.getVNInfoAt(UseIdx);
    }
    if (!V)
      return nullptr;
    DefIdx = V->def;
  } else {
    // Find last def.
    for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid();
         ++Units) {
      LiveRange &LR = LIS->getRegUnit(*Units);
      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
        if (!DefIdx.isValid() ||
            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
                          LIS->getInstructionFromIndex(V->def)))
          DefIdx = V->def;
      } else {
        return nullptr;
      }
    }
  }

  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);

  if (!Def || !MDT.dominates(Def, &Use))
    return nullptr;

  assert(Def->modifiesRegister(Reg, this));

  return Def;
}

MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
  assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32);

  for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
                                         AMDGPU::SReg_32RegClass,
                                         AMDGPU::AGPR_32RegClass } ) {
    if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
      return Super;
  }
  if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
                                            &AMDGPU::VGPR_32RegClass)) {
    return Super;
  }

  return AMDGPU::NoRegister;
}

bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
  if (!ST.needsAlignedVGPRs())
    return true;

  if (isVGPRClass(&RC))
    return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
  if (isAGPRClass(&RC))
    return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
  if (isVectorSuperClass(&RC))
    return RC.hasSuperClassEq(
        getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));

  return true;
}

bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
  switch (PhysReg) {
  case AMDGPU::SGPR_NULL:
  case AMDGPU::SRC_SHARED_BASE:
  case AMDGPU::SRC_PRIVATE_BASE:
  case AMDGPU::SRC_SHARED_LIMIT:
  case AMDGPU::SRC_PRIVATE_LIMIT:
    return true;
  default:
    return false;
  }
}

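// e.g. with ST.getMaxNumSGPRs(MF) == 104, getAllSGPR128 returns the first 26
// four-SGPR tuples, getAllSGPR64 the first 52 pairs, and getAllSGPR32 all 104
// SGPRs.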
ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 2);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}