1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI implementation of the TargetRegisterInfo class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPU.h" 15 #include "AMDGPURegisterBankInfo.h" 16 #include "GCNSubtarget.h" 17 #include "MCTargetDesc/AMDGPUInstPrinter.h" 18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "SIRegisterInfo.h" 21 #include "llvm/CodeGen/LiveIntervals.h" 22 #include "llvm/CodeGen/LiveRegUnits.h" 23 #include "llvm/CodeGen/MachineDominators.h" 24 #include "llvm/CodeGen/MachineFrameInfo.h" 25 #include "llvm/CodeGen/RegisterScavenging.h" 26 27 using namespace llvm; 28 29 #define GET_REGINFO_TARGET_DESC 30 #include "AMDGPUGenRegisterInfo.inc" 31 32 static cl::opt<bool> EnableSpillSGPRToVGPR( 33 "amdgpu-spill-sgpr-to-vgpr", 34 cl::desc("Enable spilling SGPRs to VGPRs"), 35 cl::ReallyHidden, 36 cl::init(true)); 37 38 std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts; 39 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable; 40 41 // Map numbers of DWORDs to indexes in SubRegFromChannelTable. 42 // Valid indexes are shifted 1, such that a 0 mapping means unsupported. 43 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8, 44 // meaning index 7 in SubRegFromChannelTable. 45 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = { 46 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9}; 47 48 static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI, 49 const Twine &ErrMsg) { 50 Fn.getContext().diagnose( 51 DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc())); 52 } 53 54 namespace llvm { 55 56 // A temporary struct to spill SGPRs. 57 // This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits 58 // just v_writelane and v_readlane. 59 // 60 // When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR 61 // is saved to scratch (or the other way around for loads). 62 // For this, a VGPR is required where the needed lanes can be clobbered. The 63 // RegScavenger can provide a VGPR where currently active lanes can be 64 // clobbered, but we still need to save inactive lanes. 65 // The high-level steps are: 66 // - Try to scavenge SGPR(s) to save exec 67 // - Try to scavenge VGPR 68 // - Save needed, all or inactive lanes of a TmpVGPR 69 // - Spill/Restore SGPRs using TmpVGPR 70 // - Restore TmpVGPR 71 // 72 // To save all lanes of TmpVGPR, exec needs to be saved and modified. 
If we 73 // cannot scavenge temporary SGPRs to save exec, we use the following code: 74 // buffer_store_dword TmpVGPR ; only if active lanes need to be saved 75 // s_not exec, exec 76 // buffer_store_dword TmpVGPR ; save inactive lanes 77 // s_not exec, exec 78 struct SGPRSpillBuilder { 79 struct PerVGPRData { 80 unsigned PerVGPR; 81 unsigned NumVGPRs; 82 int64_t VGPRLanes; 83 }; 84 85 // The SGPR to save 86 Register SuperReg; 87 MachineBasicBlock::iterator MI; 88 ArrayRef<int16_t> SplitParts; 89 unsigned NumSubRegs; 90 bool IsKill; 91 const DebugLoc &DL; 92 93 /* When spilling to stack */ 94 // The SGPRs are written into this VGPR, which is then written to scratch 95 // (or vice versa for loads). 96 Register TmpVGPR = AMDGPU::NoRegister; 97 // Temporary spill slot to save TmpVGPR to. 98 int TmpVGPRIndex = 0; 99 // If TmpVGPR is live before the spill or if it is scavenged. 100 bool TmpVGPRLive = false; 101 // Scavenged SGPR to save EXEC. 102 Register SavedExecReg = AMDGPU::NoRegister; 103 // Stack index to write the SGPRs to. 104 int Index; 105 unsigned EltSize = 4; 106 107 RegScavenger *RS; 108 MachineBasicBlock *MBB; 109 MachineFunction &MF; 110 SIMachineFunctionInfo &MFI; 111 const SIInstrInfo &TII; 112 const SIRegisterInfo &TRI; 113 bool IsWave32; 114 Register ExecReg; 115 unsigned MovOpc; 116 unsigned NotOpc; 117 118 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, 119 bool IsWave32, MachineBasicBlock::iterator MI, int Index, 120 RegScavenger *RS) 121 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(), 122 MI->getOperand(0).isKill(), Index, RS) {} 123 124 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, 125 bool IsWave32, MachineBasicBlock::iterator MI, Register Reg, 126 bool IsKill, int Index, RegScavenger *RS) 127 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()), 128 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()), 129 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), 130 IsWave32(IsWave32) { 131 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg); 132 SplitParts = TRI.getRegSplitParts(RC, EltSize); 133 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 134 135 if (IsWave32) { 136 ExecReg = AMDGPU::EXEC_LO; 137 MovOpc = AMDGPU::S_MOV_B32; 138 NotOpc = AMDGPU::S_NOT_B32; 139 } else { 140 ExecReg = AMDGPU::EXEC; 141 MovOpc = AMDGPU::S_MOV_B64; 142 NotOpc = AMDGPU::S_NOT_B64; 143 } 144 145 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 146 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && 147 SuperReg != AMDGPU::EXEC && "exec should never spill"); 148 } 149 150 PerVGPRData getPerVGPRData() { 151 PerVGPRData Data; 152 Data.PerVGPR = IsWave32 ? 32 : 64; 153 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR; 154 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL; 155 return Data; 156 } 157 158 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is 159 // free. 160 // Writes these instructions if an SGPR can be scavenged: 161 // s_mov_b64 s[6:7], exec ; Save exec 162 // s_mov_b64 exec, 3 ; Wanted lanemask 163 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot 164 // 165 // Writes these instructions if no SGPR can be scavenged: 166 // buffer_store_dword v0 ; Only if no free VGPR was found 167 // s_not_b64 exec, exec 168 // buffer_store_dword v0 ; Save inactive lanes 169 // ; exec stays inverted, it is flipped back in 170 // ; restore. 
171 void prepare() { 172 // Scavenged temporary VGPR to use. It must be scavenged once for any number 173 // of spilled subregs. 174 // FIXME: The liveness analysis is limited and does not tell if a register 175 // is in use in lanes that are currently inactive. We can never be sure if 176 // a register is actually in use in another lane, so we need to save all 177 // used lanes of the chosen VGPR. 178 assert(RS && "Cannot spill SGPR to memory without RegScavenger"); 179 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 180 0, false); 181 182 // Reserve temporary stack slot 183 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI); 184 if (TmpVGPR) { 185 // Found a register that is dead in the currently active lanes; we only 186 // need to spill the inactive lanes. 187 TmpVGPRLive = false; 188 } else { 189 // Pick v0 because it doesn't make a difference. 190 TmpVGPR = AMDGPU::VGPR0; 191 TmpVGPRLive = true; 192 } 193 194 if (TmpVGPRLive) { 195 // We need to inform the scavenger that this index is already in use until 196 // we're done with the custom emergency spill. 197 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR); 198 } 199 200 // We may end up recursively calling the scavenger, and don't want to re-use 201 // the same register. 202 RS->setRegUsed(TmpVGPR); 203 204 // Try to scavenge SGPRs to save exec 205 assert(!SavedExecReg && "Exec is already saved, refuse to save again"); 206 const TargetRegisterClass &RC = 207 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass; 208 RS->setRegUsed(SuperReg); 209 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false); 210 211 int64_t VGPRLanes = getPerVGPRData().VGPRLanes; 212 213 if (SavedExecReg) { 214 RS->setRegUsed(SavedExecReg); 215 // Set exec to needed lanes 216 BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg); 217 auto I = 218 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes); 219 if (!TmpVGPRLive) 220 I.addReg(TmpVGPR, RegState::ImplicitDefine); 221 // Spill needed lanes 222 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); 223 } else { 224 // The modify and restore of exec clobber SCC, which we would have to save 225 // and restore. FIXME: We probably would need to reserve a register for 226 // this. 227 if (RS->isRegUsed(AMDGPU::SCC)) 228 emitUnsupportedError(MF.getFunction(), *MI, 229 "unhandled SGPR spill to memory"); 230 231 // Spill active lanes 232 if (TmpVGPRLive) 233 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false, 234 /*IsKill*/ false); 235 // Spill inactive lanes 236 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 237 if (!TmpVGPRLive) 238 I.addReg(TmpVGPR, RegState::ImplicitDefine); 239 I->getOperand(2).setIsDead(); // Mark SCC as dead.
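// With exec inverted, this store saves the previously inactive lanes of TmpVGPR. Exec stays inverted here and is flipped back in restore().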
240 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); 241 } 242 } 243 244 // Writes these instructions if an SGPR can be scavenged: 245 // buffer_load_dword v1 ; Read scavenged VGPR back from emergency slot 246 // s_waitcnt vmcnt(0) ; If a free VGPR was found 247 // s_mov_b64 exec, s[6:7] ; Restore exec 248 // 249 // Writes these instructions if no SGPR can be scavenged: 250 // buffer_load_dword v0 ; Restore inactive lanes 251 // s_waitcnt vmcnt(0) ; If a free VGPR was found 252 // s_not_b64 exec, exec 253 // buffer_load_dword v0 ; Only if no free VGPR was found 254 void restore() { 255 if (SavedExecReg) { 256 // Restore used lanes 257 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, 258 /*IsKill*/ false); 259 // Restore exec 260 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg) 261 .addReg(SavedExecReg, RegState::Kill); 262 // Add an implicit use of the load so it is not dead. 263 // FIXME: This inserts an unnecessary waitcnt. 264 if (!TmpVGPRLive) { 265 I.addReg(TmpVGPR, RegState::ImplicitKill); 266 } 267 } else { 268 // Restore inactive lanes 269 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, 270 /*IsKill*/ false); 271 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 272 if (!TmpVGPRLive) 273 I.addReg(TmpVGPR, RegState::ImplicitKill); 274 I->getOperand(2).setIsDead(); // Mark SCC as dead. 275 276 // Restore active lanes 277 if (TmpVGPRLive) 278 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true); 279 } 280 281 // Inform the scavenger where we're releasing our custom scavenged register. 282 if (TmpVGPRLive) { 283 MachineBasicBlock::iterator RestorePt = std::prev(MI); 284 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt); 285 } 286 } 287 288 // Write TmpVGPR to memory or read TmpVGPR from memory. 289 // Either using a single buffer_load/store if exec is set to the needed mask 290 // or using 291 // buffer_load 292 // s_not exec, exec 293 // buffer_load 294 // s_not exec, exec 295 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) { 296 if (SavedExecReg) { 297 // Spill needed lanes 298 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); 299 } else { 300 // The modify and restore of exec clobber SCC, which we would have to save 301 // and restore. FIXME: We probably would need to reserve a register for 302 // this. 303 if (RS->isRegUsed(AMDGPU::SCC)) 304 emitUnsupportedError(MF.getFunction(), *MI, 305 "unhandled SGPR spill to memory"); 306 307 // Spill active lanes 308 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad, 309 /*IsKill*/ false); 310 // Spill inactive lanes 311 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 312 Not0->getOperand(2).setIsDead(); // Mark SCC as dead. 313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); 314 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 315 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
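// At this point exec has been inverted twice and is back to its original value.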
316 } 317 } 318 319 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) { 320 assert(MBB->getParent() == &MF); 321 MI = NewMI; 322 MBB = NewMBB; 323 } 324 }; 325 326 } // namespace llvm 327 328 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) 329 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(), 330 ST.getAMDGPUDwarfFlavour(), 331 /*PC=*/0, ST.getHwMode()), 332 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { 333 334 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 && 335 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) && 336 (getSubRegIndexLaneMask(AMDGPU::lo16) | 337 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() == 338 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() && 339 "getNumCoveredRegs() will not work with generated subreg masks!"); 340 341 RegPressureIgnoredUnits.resize(getNumRegUnits()); 342 RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin()); 343 for (auto Reg : AMDGPU::VGPR_16RegClass) { 344 if (AMDGPU::isHi16Reg(Reg, *this)) 345 RegPressureIgnoredUnits.set(*regunits(Reg).begin()); 346 } 347 348 // HACK: Until this is fully tablegen'd. 349 static llvm::once_flag InitializeRegSplitPartsFlag; 350 351 static auto InitializeRegSplitPartsOnce = [this]() { 352 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) { 353 unsigned Size = getSubRegIdxSize(Idx); 354 if (Size & 15) 355 continue; 356 std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1]; 357 unsigned Pos = getSubRegIdxOffset(Idx); 358 if (Pos % Size) 359 continue; 360 Pos /= Size; 361 if (Vec.empty()) { 362 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits. 363 Vec.resize(MaxNumParts); 364 } 365 Vec[Pos] = Idx; 366 } 367 }; 368 369 static llvm::once_flag InitializeSubRegFromChannelTableFlag; 370 371 static auto InitializeSubRegFromChannelTableOnce = [this]() { 372 for (auto &Row : SubRegFromChannelTable) 373 Row.fill(AMDGPU::NoSubRegister); 374 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) { 375 unsigned Width = getSubRegIdxSize(Idx) / 32; 376 unsigned Offset = getSubRegIdxOffset(Idx) / 32; 377 assert(Width < SubRegFromChannelTableWidthMap.size()); 378 Width = SubRegFromChannelTableWidthMap[Width]; 379 if (Width == 0) 380 continue; 381 unsigned TableIdx = Width - 1; 382 assert(TableIdx < SubRegFromChannelTable.size()); 383 assert(Offset < SubRegFromChannelTable[TableIdx].size()); 384 SubRegFromChannelTable[TableIdx][Offset] = Idx; 385 } 386 }; 387 388 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce); 389 llvm::call_once(InitializeSubRegFromChannelTableFlag, 390 InitializeSubRegFromChannelTableOnce); 391 } 392 393 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, 394 MCRegister Reg) const { 395 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R) 396 Reserved.set(*R); 397 } 398 399 // Forced to be here by one .inc 400 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( 401 const MachineFunction *MF) const { 402 CallingConv::ID CC = MF->getFunction().getCallingConv(); 403 switch (CC) { 404 case CallingConv::C: 405 case CallingConv::Fast: 406 case CallingConv::Cold: 407 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList 408 : CSR_AMDGPU_SaveList; 409 case CallingConv::AMDGPU_Gfx: 410 return ST.hasGFX90AInsts() ? 
CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList 411 : CSR_AMDGPU_SI_Gfx_SaveList; 412 case CallingConv::AMDGPU_CS_ChainPreserve: 413 return CSR_AMDGPU_CS_ChainPreserve_SaveList; 414 default: { 415 // Dummy to not crash RegisterClassInfo. 416 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; 417 return &NoCalleeSavedReg; 418 } 419 } 420 } 421 422 const MCPhysReg * 423 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { 424 return nullptr; 425 } 426 427 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, 428 CallingConv::ID CC) const { 429 switch (CC) { 430 case CallingConv::C: 431 case CallingConv::Fast: 432 case CallingConv::Cold: 433 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask 434 : CSR_AMDGPU_RegMask; 435 case CallingConv::AMDGPU_Gfx: 436 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask 437 : CSR_AMDGPU_SI_Gfx_RegMask; 438 case CallingConv::AMDGPU_CS_Chain: 439 case CallingConv::AMDGPU_CS_ChainPreserve: 440 // Calls to these functions never return, so we can pretend everything is 441 // preserved. 442 return AMDGPU_AllVGPRs_RegMask; 443 default: 444 return nullptr; 445 } 446 } 447 448 const uint32_t *SIRegisterInfo::getNoPreservedMask() const { 449 return CSR_AMDGPU_NoRegs_RegMask; 450 } 451 452 bool SIRegisterInfo::isChainScratchRegister(Register VGPR) { 453 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8; 454 } 455 456 const TargetRegisterClass * 457 SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, 458 const MachineFunction &MF) const { 459 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the 460 // equivalent AV class. If we used one here, the verifier would crash after 461 // RegBankSelect in the GISel flow, because the aligned regclasses are not 462 // fully assigned until instruction selection.
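// Map each VGPR or AGPR base class to the combined AV class of the same width and alignment, so allocation may come from either register file.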
463 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) { 464 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass) 465 return &AMDGPU::AV_32RegClass; 466 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass) 467 return &AMDGPU::AV_64RegClass; 468 if (RC == &AMDGPU::VReg_64_Align2RegClass || 469 RC == &AMDGPU::AReg_64_Align2RegClass) 470 return &AMDGPU::AV_64_Align2RegClass; 471 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass) 472 return &AMDGPU::AV_96RegClass; 473 if (RC == &AMDGPU::VReg_96_Align2RegClass || 474 RC == &AMDGPU::AReg_96_Align2RegClass) 475 return &AMDGPU::AV_96_Align2RegClass; 476 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass) 477 return &AMDGPU::AV_128RegClass; 478 if (RC == &AMDGPU::VReg_128_Align2RegClass || 479 RC == &AMDGPU::AReg_128_Align2RegClass) 480 return &AMDGPU::AV_128_Align2RegClass; 481 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass) 482 return &AMDGPU::AV_160RegClass; 483 if (RC == &AMDGPU::VReg_160_Align2RegClass || 484 RC == &AMDGPU::AReg_160_Align2RegClass) 485 return &AMDGPU::AV_160_Align2RegClass; 486 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass) 487 return &AMDGPU::AV_192RegClass; 488 if (RC == &AMDGPU::VReg_192_Align2RegClass || 489 RC == &AMDGPU::AReg_192_Align2RegClass) 490 return &AMDGPU::AV_192_Align2RegClass; 491 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass) 492 return &AMDGPU::AV_256RegClass; 493 if (RC == &AMDGPU::VReg_256_Align2RegClass || 494 RC == &AMDGPU::AReg_256_Align2RegClass) 495 return &AMDGPU::AV_256_Align2RegClass; 496 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass) 497 return &AMDGPU::AV_512RegClass; 498 if (RC == &AMDGPU::VReg_512_Align2RegClass || 499 RC == &AMDGPU::AReg_512_Align2RegClass) 500 return &AMDGPU::AV_512_Align2RegClass; 501 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass) 502 return &AMDGPU::AV_1024RegClass; 503 if (RC == &AMDGPU::VReg_1024_Align2RegClass || 504 RC == &AMDGPU::AReg_1024_Align2RegClass) 505 return &AMDGPU::AV_1024_Align2RegClass; 506 } 507 508 return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF); 509 } 510 511 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { 512 const SIFrameLowering *TFI = ST.getFrameLowering(); 513 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 514 515 // During ISel lowering we always reserve the stack pointer in entry and chain 516 // functions, but never actually want to reference it when accessing our own 517 // frame. If we need a frame pointer we use it, but otherwise we can just use 518 // an immediate "0" which we represent by returning NoRegister. 519 if (FuncInfo->isBottomOfStack()) { 520 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register(); 521 } 522 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() 523 : FuncInfo->getStackPtrOffsetReg(); 524 } 525 526 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { 527 // When we need stack realignment, we can't reference off of the 528 // stack pointer, so we reserve a base pointer. 
529 return shouldRealignStack(MF); 530 } 531 532 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } 533 534 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { 535 return AMDGPU_AllVGPRs_RegMask; 536 } 537 538 const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const { 539 return AMDGPU_AllAGPRs_RegMask; 540 } 541 542 const uint32_t *SIRegisterInfo::getAllVectorRegMask() const { 543 return AMDGPU_AllVectorRegs_RegMask; 544 } 545 546 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { 547 return AMDGPU_AllAllocatableSRegs_RegMask; 548 } 549 550 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel, 551 unsigned NumRegs) { 552 assert(NumRegs < SubRegFromChannelTableWidthMap.size()); 553 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs]; 554 assert(NumRegIndex && "Not implemented"); 555 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size()); 556 return SubRegFromChannelTable[NumRegIndex - 1][Channel]; 557 } 558 559 MCRegister 560 SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF, 561 const unsigned Align, 562 const TargetRegisterClass *RC) const { 563 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align; 564 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); 565 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC); 566 } 567 568 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( 569 const MachineFunction &MF) const { 570 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass); 571 } 572 573 std::pair<unsigned, unsigned> 574 SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const { 575 const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF); 576 577 unsigned MaxNumVGPRs = MaxVectorRegs; 578 unsigned MaxNumAGPRs = 0; 579 580 // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, 581 // a wave may have up to 512 total vector registers combining together both 582 // VGPRs and AGPRs. Hence, in an entry function without calls and without 583 // AGPRs used within it, it is possible to use the whole vector register 584 // budget for VGPRs. 585 // 586 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split 587 // register file accordingly. 588 if (ST.hasGFX90AInsts()) { 589 unsigned MinNumAGPRs = 0; 590 const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs(); 591 const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 592 593 const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u}; 594 595 // TODO: Move this logic into subtarget on IR function 596 // 597 // TODO: The lower bound should probably force the number of required 598 // registers up, overriding amdgpu-waves-per-eu. 599 std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute( 600 MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR, 601 /*OnlyFirstRequired=*/true); 602 603 if (MinNumAGPRs == DefaultNumAGPR.first) { 604 // Default to splitting half the registers if AGPRs are required. 605 MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2; 606 } else { 607 // Align to accum_offset's allocation granularity. 608 MinNumAGPRs = alignTo(MinNumAGPRs, 4); 609 610 MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs); 611 } 612 613 // Clamp values to be inbounds of our limits, and ensure min <= max. 
614 615 MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs); 616 MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs); 617 618 MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs); 619 MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs); 620 621 assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs && 622 MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs && 623 "invalid register counts"); 624 } else if (ST.hasMAIInsts()) { 625 // On gfx908 the number of AGPRs always equals the number of VGPRs. 626 MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs; 627 } 628 629 return std::pair(MaxNumVGPRs, MaxNumAGPRs); 630 } 631 632 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { 633 BitVector Reserved(getNumRegs()); 634 Reserved.set(AMDGPU::MODE); 635 636 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 637 638 // Reserve special purpose registers. 639 // 640 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but 641 // this seems likely to result in bugs, so I'm marking them as reserved. 642 reserveRegisterTuples(Reserved, AMDGPU::EXEC); 643 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); 644 645 // M0 has to be reserved so that llvm accepts it as a live-in into a block. 646 reserveRegisterTuples(Reserved, AMDGPU::M0); 647 648 // Reserve src_vccz, src_execz, src_scc. 649 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ); 650 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ); 651 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC); 652 653 // Reserve the memory aperture registers 654 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); 655 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); 656 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); 657 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); 658 659 // Reserve async counters pseudo registers 660 reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt); 661 reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt); 662 663 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen. 664 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID); 665 666 // Reserve xnack_mask registers - support is not implemented in Codegen. 667 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); 668 669 // Reserve lds_direct register - support is not implemented in Codegen. 670 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); 671 672 // Reserve Trap Handler registers - support is not implemented in Codegen. 673 reserveRegisterTuples(Reserved, AMDGPU::TBA); 674 reserveRegisterTuples(Reserved, AMDGPU::TMA); 675 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); 676 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); 677 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); 678 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); 679 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); 680 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); 681 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); 682 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); 683 684 // Reserve null register - it shall never be allocated 685 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64); 686 687 // Reserve SGPRs. 
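// SGPRs whose hardware index would place any part of the register tuple beyond this function's SGPR budget (MaxNumSGPRs) are marked reserved below.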
688 // 689 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 690 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 691 for (const TargetRegisterClass *RC : regclasses()) { 692 if (RC->isBaseClass() && isSGPRClass(RC)) { 693 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32); 694 for (MCPhysReg Reg : *RC) { 695 unsigned Index = getHWRegIndex(Reg); 696 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs) 697 Reserved.set(Reg); 698 } 699 } 700 } 701 702 Register ScratchRSrcReg = MFI->getScratchRSrcReg(); 703 if (ScratchRSrcReg != AMDGPU::NoRegister) { 704 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we 705 // need to spill. 706 // TODO: May need to reserve a VGPR if doing LDS spilling. 707 reserveRegisterTuples(Reserved, ScratchRSrcReg); 708 } 709 710 Register LongBranchReservedReg = MFI->getLongBranchReservedReg(); 711 if (LongBranchReservedReg) 712 reserveRegisterTuples(Reserved, LongBranchReservedReg); 713 714 // We have to assume the SP is needed in case there are calls in the function, 715 // which is detected after the function is lowered. If we aren't really going 716 // to need SP, don't bother reserving it. 717 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg(); 718 if (StackPtrReg) { 719 reserveRegisterTuples(Reserved, StackPtrReg); 720 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); 721 } 722 723 MCRegister FrameReg = MFI->getFrameOffsetReg(); 724 if (FrameReg) { 725 reserveRegisterTuples(Reserved, FrameReg); 726 assert(!isSubRegister(ScratchRSrcReg, FrameReg)); 727 } 728 729 if (hasBasePointer(MF)) { 730 MCRegister BasePtrReg = getBaseRegister(); 731 reserveRegisterTuples(Reserved, BasePtrReg); 732 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); 733 } 734 735 // FIXME: Use same reserved register introduced in D149775 736 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions. 737 Register ExecCopyReg = MFI->getSGPRForEXECCopy(); 738 if (ExecCopyReg) 739 reserveRegisterTuples(Reserved, ExecCopyReg); 740 741 // Reserve VGPRs/AGPRs. 742 // 743 auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF); 744 745 for (const TargetRegisterClass *RC : regclasses()) { 746 if (RC->isBaseClass() && isVGPRClass(RC)) { 747 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32); 748 for (MCPhysReg Reg : *RC) { 749 unsigned Index = getHWRegIndex(Reg); 750 if (Index + NumRegs > MaxNumVGPRs) 751 Reserved.set(Reg); 752 } 753 } 754 } 755 756 // Reserve all the AGPRs if there are no instructions to use it. 757 if (!ST.hasMAIInsts()) 758 MaxNumAGPRs = 0; 759 for (const TargetRegisterClass *RC : regclasses()) { 760 if (RC->isBaseClass() && isAGPRClass(RC)) { 761 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32); 762 for (MCPhysReg Reg : *RC) { 763 unsigned Index = getHWRegIndex(Reg); 764 if (Index + NumRegs > MaxNumAGPRs) 765 Reserved.set(Reg); 766 } 767 } 768 } 769 770 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch 771 // VGPR available at all times. 772 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) { 773 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy()); 774 } 775 776 // During wwm-regalloc, reserve the registers for perlane VGPR allocation. The 777 // MFI->getNonWWMRegMask() field will have a valid bitmask only during 778 // wwm-regalloc and it would be empty otherwise. 
779 BitVector NonWWMRegMask = MFI->getNonWWMRegMask(); 780 if (!NonWWMRegMask.empty()) { 781 for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs; 782 RegI < RegE; ++RegI) { 783 if (NonWWMRegMask.test(RegI)) 784 reserveRegisterTuples(Reserved, RegI); 785 } 786 } 787 788 for (Register Reg : MFI->getWWMReservedRegs()) 789 reserveRegisterTuples(Reserved, Reg); 790 791 // FIXME: Stop using reserved registers for this. 792 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) 793 reserveRegisterTuples(Reserved, Reg); 794 795 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) 796 reserveRegisterTuples(Reserved, Reg); 797 798 return Reserved; 799 } 800 801 bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF, 802 MCRegister PhysReg) const { 803 return !MF.getRegInfo().isReserved(PhysReg); 804 } 805 806 bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const { 807 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 808 // On entry or in chain functions, the base address is 0, so it can't possibly 809 // need any more alignment. 810 811 // FIXME: Should be able to specify the entry frame alignment per calling 812 // convention instead. 813 if (Info->isBottomOfStack()) 814 return false; 815 816 return TargetRegisterInfo::shouldRealignStack(MF); 817 } 818 819 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { 820 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); 821 if (Info->isEntryFunction()) { 822 const MachineFrameInfo &MFI = Fn.getFrameInfo(); 823 return MFI.hasStackObjects() || MFI.hasCalls(); 824 } 825 826 // May need scavenger for dealing with callee saved registers. 827 return true; 828 } 829 830 bool SIRegisterInfo::requiresFrameIndexScavenging( 831 const MachineFunction &MF) const { 832 // Do not use frame virtual registers. They used to be used for SGPRs, but 833 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the 834 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a 835 // spill. 836 return false; 837 } 838 839 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( 840 const MachineFunction &MF) const { 841 const MachineFrameInfo &MFI = MF.getFrameInfo(); 842 return MFI.hasStackObjects(); 843 } 844 845 bool SIRegisterInfo::requiresVirtualBaseRegisters( 846 const MachineFunction &) const { 847 // There are no special dedicated stack or frame pointers. 848 return true; 849 } 850 851 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const { 852 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI)); 853 854 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 855 AMDGPU::OpName::offset); 856 return MI->getOperand(OffIdx).getImm(); 857 } 858 859 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, 860 int Idx) const { 861 switch (MI->getOpcode()) { 862 case AMDGPU::V_ADD_U32_e32: 863 case AMDGPU::V_ADD_U32_e64: 864 case AMDGPU::V_ADD_CO_U32_e32: { 865 int OtherIdx = Idx == 1 ? 2 : 1; 866 const MachineOperand &OtherOp = MI->getOperand(OtherIdx); 867 return OtherOp.isImm() ? OtherOp.getImm() : 0; 868 } 869 case AMDGPU::V_ADD_CO_U32_e64: { 870 int OtherIdx = Idx == 2 ? 3 : 2; 871 const MachineOperand &OtherOp = MI->getOperand(OtherIdx); 872 return OtherOp.isImm() ? 
OtherOp.getImm() : 0; 873 } 874 default: 875 break; 876 } 877 878 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 879 return 0; 880 881 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 882 AMDGPU::OpName::vaddr) || 883 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 884 AMDGPU::OpName::saddr))) && 885 "Should never see frame index on non-address operand"); 886 887 return getScratchInstrOffset(MI); 888 } 889 890 static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI, 891 const MachineInstr &MI) { 892 assert(MI.getDesc().isAdd()); 893 const MachineOperand &Src0 = MI.getOperand(1); 894 const MachineOperand &Src1 = MI.getOperand(2); 895 896 if (Src0.isFI()) { 897 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(), 898 Src1.getReg())); 899 } 900 901 if (Src1.isFI()) { 902 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(), 903 Src0.getReg())); 904 } 905 906 return false; 907 } 908 909 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { 910 // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes. 911 switch (MI->getOpcode()) { 912 case AMDGPU::V_ADD_U32_e32: { 913 // TODO: We could handle this but it requires work to avoid violating 914 // operand restrictions. 915 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 && 916 !isFIPlusImmOrVGPR(*this, *MI)) 917 return false; 918 [[fallthrough]]; 919 } 920 case AMDGPU::V_ADD_U32_e64: 921 // FIXME: This optimization is barely profitable enableFlatScratch as-is. 922 // 923 // Much of the benefit with the MUBUF handling is we avoid duplicating the 924 // shift of the frame register, which isn't needed with scratch. 925 // 926 // materializeFrameBaseRegister doesn't know the register classes of the 927 // uses, and unconditionally uses an s_add_i32, which will end up using a 928 // copy for the vector uses. 929 return !ST.enableFlatScratch(); 930 case AMDGPU::V_ADD_CO_U32_e32: 931 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 && 932 !isFIPlusImmOrVGPR(*this, *MI)) 933 return false; 934 // We can't deal with the case where the carry out has a use (though this 935 // should never happen) 936 return MI->getOperand(3).isDead(); 937 case AMDGPU::V_ADD_CO_U32_e64: 938 // TODO: Should we check use_empty instead? 939 return MI->getOperand(1).isDead(); 940 default: 941 break; 942 } 943 944 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 945 return false; 946 947 int64_t FullOffset = Offset + getScratchInstrOffset(MI); 948 949 const SIInstrInfo *TII = ST.getInstrInfo(); 950 if (SIInstrInfo::isMUBUF(*MI)) 951 return !TII->isLegalMUBUFImmOffset(FullOffset); 952 953 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, 954 SIInstrFlags::FlatScratch); 955 } 956 957 Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, 958 int FrameIdx, 959 int64_t Offset) const { 960 MachineBasicBlock::iterator Ins = MBB->begin(); 961 DebugLoc DL; // Defaults to "unknown" 962 963 if (Ins != MBB->end()) 964 DL = Ins->getDebugLoc(); 965 966 MachineFunction *MF = MBB->getParent(); 967 const SIInstrInfo *TII = ST.getInstrInfo(); 968 MachineRegisterInfo &MRI = MF->getRegInfo(); 969 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32 970 : AMDGPU::V_MOV_B32_e32; 971 972 Register BaseReg = MRI.createVirtualRegister( 973 ST.enableFlatScratch() ? 
&AMDGPU::SReg_32_XEXEC_HIRegClass 974 : &AMDGPU::VGPR_32RegClass); 975 976 if (Offset == 0) { 977 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg) 978 .addFrameIndex(FrameIdx); 979 return BaseReg; 980 } 981 982 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 983 984 Register FIReg = MRI.createVirtualRegister( 985 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass 986 : &AMDGPU::VGPR_32RegClass); 987 988 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) 989 .addImm(Offset); 990 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg) 991 .addFrameIndex(FrameIdx); 992 993 if (ST.enableFlatScratch() ) { 994 // FIXME: Make sure scc isn't live in. 995 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg) 996 .addReg(OffsetReg, RegState::Kill) 997 .addReg(FIReg) 998 .setOperandDead(3); // scc 999 return BaseReg; 1000 } 1001 1002 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) 1003 .addReg(OffsetReg, RegState::Kill) 1004 .addReg(FIReg) 1005 .addImm(0); // clamp bit 1006 1007 return BaseReg; 1008 } 1009 1010 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, 1011 int64_t Offset) const { 1012 const SIInstrInfo *TII = ST.getInstrInfo(); 1013 1014 switch (MI.getOpcode()) { 1015 case AMDGPU::V_ADD_U32_e32: 1016 case AMDGPU::V_ADD_CO_U32_e32: { 1017 MachineOperand *FIOp = &MI.getOperand(2); 1018 MachineOperand *ImmOp = &MI.getOperand(1); 1019 if (!FIOp->isFI()) 1020 std::swap(FIOp, ImmOp); 1021 1022 if (!ImmOp->isImm()) { 1023 assert(Offset == 0); 1024 FIOp->ChangeToRegister(BaseReg, false); 1025 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI); 1026 return; 1027 } 1028 1029 int64_t TotalOffset = ImmOp->getImm() + Offset; 1030 if (TotalOffset == 0) { 1031 MI.setDesc(TII->get(AMDGPU::COPY)); 1032 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I) 1033 MI.removeOperand(I); 1034 1035 MI.getOperand(1).ChangeToRegister(BaseReg, false); 1036 return; 1037 } 1038 1039 ImmOp->setImm(TotalOffset); 1040 1041 MachineBasicBlock *MBB = MI.getParent(); 1042 MachineFunction *MF = MBB->getParent(); 1043 MachineRegisterInfo &MRI = MF->getRegInfo(); 1044 1045 // FIXME: materializeFrameBaseRegister does not know the register class of 1046 // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit 1047 // a copy so we have a legal operand and hope the register coalescer can 1048 // clean it up. 
1049 if (isSGPRReg(MRI, BaseReg)) { 1050 Register BaseRegVGPR = 1051 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1052 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR) 1053 .addReg(BaseReg); 1054 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false); 1055 } else { 1056 MI.getOperand(2).ChangeToRegister(BaseReg, false); 1057 } 1058 return; 1059 } 1060 case AMDGPU::V_ADD_U32_e64: 1061 case AMDGPU::V_ADD_CO_U32_e64: { 1062 int Src0Idx = MI.getNumExplicitDefs(); 1063 MachineOperand *FIOp = &MI.getOperand(Src0Idx); 1064 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1); 1065 if (!FIOp->isFI()) 1066 std::swap(FIOp, ImmOp); 1067 1068 if (!ImmOp->isImm()) { 1069 FIOp->ChangeToRegister(BaseReg, false); 1070 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI); 1071 return; 1072 } 1073 1074 int64_t TotalOffset = ImmOp->getImm() + Offset; 1075 if (TotalOffset == 0) { 1076 MI.setDesc(TII->get(AMDGPU::COPY)); 1077 1078 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I) 1079 MI.removeOperand(I); 1080 1081 MI.getOperand(1).ChangeToRegister(BaseReg, false); 1082 } else { 1083 FIOp->ChangeToRegister(BaseReg, false); 1084 ImmOp->setImm(TotalOffset); 1085 } 1086 1087 return; 1088 } 1089 default: 1090 break; 1091 } 1092 1093 bool IsFlat = TII->isFLATScratch(MI); 1094 1095 #ifndef NDEBUG 1096 // FIXME: Is it possible to be storing a frame index to itself? 1097 bool SeenFI = false; 1098 for (const MachineOperand &MO: MI.operands()) { 1099 if (MO.isFI()) { 1100 if (SeenFI) 1101 llvm_unreachable("should not see multiple frame indices"); 1102 1103 SeenFI = true; 1104 } 1105 } 1106 #endif 1107 1108 MachineOperand *FIOp = 1109 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr 1110 : AMDGPU::OpName::vaddr); 1111 1112 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); 1113 int64_t NewOffset = OffsetOp->getImm() + Offset; 1114 1115 assert(FIOp && FIOp->isFI() && "frame index must be address operand"); 1116 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI)); 1117 1118 if (IsFlat) { 1119 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 1120 SIInstrFlags::FlatScratch) && 1121 "offset should be legal"); 1122 FIOp->ChangeToRegister(BaseReg, false); 1123 OffsetOp->setImm(NewOffset); 1124 return; 1125 } 1126 1127 #ifndef NDEBUG 1128 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset); 1129 assert(SOffset->isImm() && SOffset->getImm() == 0); 1130 #endif 1131 1132 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal"); 1133 1134 FIOp->ChangeToRegister(BaseReg, false); 1135 OffsetOp->setImm(NewOffset); 1136 } 1137 1138 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, 1139 Register BaseReg, 1140 int64_t Offset) const { 1141 1142 switch (MI->getOpcode()) { 1143 case AMDGPU::V_ADD_U32_e32: 1144 case AMDGPU::V_ADD_CO_U32_e32: 1145 return true; 1146 case AMDGPU::V_ADD_U32_e64: 1147 case AMDGPU::V_ADD_CO_U32_e64: 1148 return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset); 1149 default: 1150 break; 1151 } 1152 1153 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 1154 return false; 1155 1156 int64_t NewOffset = Offset + getScratchInstrOffset(MI); 1157 1158 const SIInstrInfo *TII = ST.getInstrInfo(); 1159 if (SIInstrInfo::isMUBUF(*MI)) 1160 return TII->isLegalMUBUFImmOffset(NewOffset); 1161 1162 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 1163 SIInstrFlags::FlatScratch); 1164 } 1165 1166 const TargetRegisterClass 
*SIRegisterInfo::getPointerRegClass( 1167 const MachineFunction &MF, unsigned Kind) const { 1168 // This is inaccurate. It depends on the instruction and address space. The 1169 // only place where we should hit this is for dealing with frame indexes / 1170 // private accesses, so this is correct in that case. 1171 return &AMDGPU::VGPR_32RegClass; 1172 } 1173 1174 const TargetRegisterClass * 1175 SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { 1176 if (isAGPRClass(RC) && !ST.hasGFX90AInsts()) 1177 return getEquivalentVGPRClass(RC); 1178 if (RC == &AMDGPU::SCC_CLASSRegClass) 1179 return getWaveMaskRegClass(); 1180 1181 return RC; 1182 } 1183 1184 static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI, 1185 const SIInstrInfo *TII) { 1186 1187 unsigned Op = MI.getOpcode(); 1188 switch (Op) { 1189 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: 1190 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: 1191 // FIXME: This assumes the mask is statically known and not computed at 1192 // runtime. However, some ABIs may want to compute the mask dynamically and 1193 // this will need to be updated. 1194 return llvm::popcount( 1195 (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm()); 1196 case AMDGPU::SI_SPILL_S1024_SAVE: 1197 case AMDGPU::SI_SPILL_S1024_RESTORE: 1198 case AMDGPU::SI_SPILL_V1024_SAVE: 1199 case AMDGPU::SI_SPILL_V1024_RESTORE: 1200 case AMDGPU::SI_SPILL_A1024_SAVE: 1201 case AMDGPU::SI_SPILL_A1024_RESTORE: 1202 case AMDGPU::SI_SPILL_AV1024_SAVE: 1203 case AMDGPU::SI_SPILL_AV1024_RESTORE: 1204 return 32; 1205 case AMDGPU::SI_SPILL_S512_SAVE: 1206 case AMDGPU::SI_SPILL_S512_RESTORE: 1207 case AMDGPU::SI_SPILL_V512_SAVE: 1208 case AMDGPU::SI_SPILL_V512_RESTORE: 1209 case AMDGPU::SI_SPILL_A512_SAVE: 1210 case AMDGPU::SI_SPILL_A512_RESTORE: 1211 case AMDGPU::SI_SPILL_AV512_SAVE: 1212 case AMDGPU::SI_SPILL_AV512_RESTORE: 1213 return 16; 1214 case AMDGPU::SI_SPILL_S384_SAVE: 1215 case AMDGPU::SI_SPILL_S384_RESTORE: 1216 case AMDGPU::SI_SPILL_V384_SAVE: 1217 case AMDGPU::SI_SPILL_V384_RESTORE: 1218 case AMDGPU::SI_SPILL_A384_SAVE: 1219 case AMDGPU::SI_SPILL_A384_RESTORE: 1220 case AMDGPU::SI_SPILL_AV384_SAVE: 1221 case AMDGPU::SI_SPILL_AV384_RESTORE: 1222 return 12; 1223 case AMDGPU::SI_SPILL_S352_SAVE: 1224 case AMDGPU::SI_SPILL_S352_RESTORE: 1225 case AMDGPU::SI_SPILL_V352_SAVE: 1226 case AMDGPU::SI_SPILL_V352_RESTORE: 1227 case AMDGPU::SI_SPILL_A352_SAVE: 1228 case AMDGPU::SI_SPILL_A352_RESTORE: 1229 case AMDGPU::SI_SPILL_AV352_SAVE: 1230 case AMDGPU::SI_SPILL_AV352_RESTORE: 1231 return 11; 1232 case AMDGPU::SI_SPILL_S320_SAVE: 1233 case AMDGPU::SI_SPILL_S320_RESTORE: 1234 case AMDGPU::SI_SPILL_V320_SAVE: 1235 case AMDGPU::SI_SPILL_V320_RESTORE: 1236 case AMDGPU::SI_SPILL_A320_SAVE: 1237 case AMDGPU::SI_SPILL_A320_RESTORE: 1238 case AMDGPU::SI_SPILL_AV320_SAVE: 1239 case AMDGPU::SI_SPILL_AV320_RESTORE: 1240 return 10; 1241 case AMDGPU::SI_SPILL_S288_SAVE: 1242 case AMDGPU::SI_SPILL_S288_RESTORE: 1243 case AMDGPU::SI_SPILL_V288_SAVE: 1244 case AMDGPU::SI_SPILL_V288_RESTORE: 1245 case AMDGPU::SI_SPILL_A288_SAVE: 1246 case AMDGPU::SI_SPILL_A288_RESTORE: 1247 case AMDGPU::SI_SPILL_AV288_SAVE: 1248 case AMDGPU::SI_SPILL_AV288_RESTORE: 1249 return 9; 1250 case AMDGPU::SI_SPILL_S256_SAVE: 1251 case AMDGPU::SI_SPILL_S256_RESTORE: 1252 case AMDGPU::SI_SPILL_V256_SAVE: 1253 case AMDGPU::SI_SPILL_V256_RESTORE: 1254 case AMDGPU::SI_SPILL_A256_SAVE: 1255 case AMDGPU::SI_SPILL_A256_RESTORE: 1256 case AMDGPU::SI_SPILL_AV256_SAVE: 1257 case 
AMDGPU::SI_SPILL_AV256_RESTORE: 1258 return 8; 1259 case AMDGPU::SI_SPILL_S224_SAVE: 1260 case AMDGPU::SI_SPILL_S224_RESTORE: 1261 case AMDGPU::SI_SPILL_V224_SAVE: 1262 case AMDGPU::SI_SPILL_V224_RESTORE: 1263 case AMDGPU::SI_SPILL_A224_SAVE: 1264 case AMDGPU::SI_SPILL_A224_RESTORE: 1265 case AMDGPU::SI_SPILL_AV224_SAVE: 1266 case AMDGPU::SI_SPILL_AV224_RESTORE: 1267 return 7; 1268 case AMDGPU::SI_SPILL_S192_SAVE: 1269 case AMDGPU::SI_SPILL_S192_RESTORE: 1270 case AMDGPU::SI_SPILL_V192_SAVE: 1271 case AMDGPU::SI_SPILL_V192_RESTORE: 1272 case AMDGPU::SI_SPILL_A192_SAVE: 1273 case AMDGPU::SI_SPILL_A192_RESTORE: 1274 case AMDGPU::SI_SPILL_AV192_SAVE: 1275 case AMDGPU::SI_SPILL_AV192_RESTORE: 1276 return 6; 1277 case AMDGPU::SI_SPILL_S160_SAVE: 1278 case AMDGPU::SI_SPILL_S160_RESTORE: 1279 case AMDGPU::SI_SPILL_V160_SAVE: 1280 case AMDGPU::SI_SPILL_V160_RESTORE: 1281 case AMDGPU::SI_SPILL_A160_SAVE: 1282 case AMDGPU::SI_SPILL_A160_RESTORE: 1283 case AMDGPU::SI_SPILL_AV160_SAVE: 1284 case AMDGPU::SI_SPILL_AV160_RESTORE: 1285 return 5; 1286 case AMDGPU::SI_SPILL_S128_SAVE: 1287 case AMDGPU::SI_SPILL_S128_RESTORE: 1288 case AMDGPU::SI_SPILL_V128_SAVE: 1289 case AMDGPU::SI_SPILL_V128_RESTORE: 1290 case AMDGPU::SI_SPILL_A128_SAVE: 1291 case AMDGPU::SI_SPILL_A128_RESTORE: 1292 case AMDGPU::SI_SPILL_AV128_SAVE: 1293 case AMDGPU::SI_SPILL_AV128_RESTORE: 1294 return 4; 1295 case AMDGPU::SI_SPILL_S96_SAVE: 1296 case AMDGPU::SI_SPILL_S96_RESTORE: 1297 case AMDGPU::SI_SPILL_V96_SAVE: 1298 case AMDGPU::SI_SPILL_V96_RESTORE: 1299 case AMDGPU::SI_SPILL_A96_SAVE: 1300 case AMDGPU::SI_SPILL_A96_RESTORE: 1301 case AMDGPU::SI_SPILL_AV96_SAVE: 1302 case AMDGPU::SI_SPILL_AV96_RESTORE: 1303 return 3; 1304 case AMDGPU::SI_SPILL_S64_SAVE: 1305 case AMDGPU::SI_SPILL_S64_RESTORE: 1306 case AMDGPU::SI_SPILL_V64_SAVE: 1307 case AMDGPU::SI_SPILL_V64_RESTORE: 1308 case AMDGPU::SI_SPILL_A64_SAVE: 1309 case AMDGPU::SI_SPILL_A64_RESTORE: 1310 case AMDGPU::SI_SPILL_AV64_SAVE: 1311 case AMDGPU::SI_SPILL_AV64_RESTORE: 1312 return 2; 1313 case AMDGPU::SI_SPILL_S32_SAVE: 1314 case AMDGPU::SI_SPILL_S32_RESTORE: 1315 case AMDGPU::SI_SPILL_V32_SAVE: 1316 case AMDGPU::SI_SPILL_V32_RESTORE: 1317 case AMDGPU::SI_SPILL_A32_SAVE: 1318 case AMDGPU::SI_SPILL_A32_RESTORE: 1319 case AMDGPU::SI_SPILL_AV32_SAVE: 1320 case AMDGPU::SI_SPILL_AV32_RESTORE: 1321 case AMDGPU::SI_SPILL_WWM_V32_SAVE: 1322 case AMDGPU::SI_SPILL_WWM_V32_RESTORE: 1323 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: 1324 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: 1325 case AMDGPU::SI_SPILL_V16_SAVE: 1326 case AMDGPU::SI_SPILL_V16_RESTORE: 1327 return 1; 1328 default: llvm_unreachable("Invalid spill opcode"); 1329 } 1330 } 1331 1332 static int getOffsetMUBUFStore(unsigned Opc) { 1333 switch (Opc) { 1334 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 1335 return AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1336 case AMDGPU::BUFFER_STORE_BYTE_OFFEN: 1337 return AMDGPU::BUFFER_STORE_BYTE_OFFSET; 1338 case AMDGPU::BUFFER_STORE_SHORT_OFFEN: 1339 return AMDGPU::BUFFER_STORE_SHORT_OFFSET; 1340 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: 1341 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; 1342 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN: 1343 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET; 1344 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: 1345 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; 1346 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: 1347 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET; 1348 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN: 1349 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET; 1350 default: 1351 return -1; 1352 } 
1353 } 1354 1355 static int getOffsetMUBUFLoad(unsigned Opc) { 1356 switch (Opc) { 1357 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 1358 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1359 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN: 1360 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET; 1361 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN: 1362 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET; 1363 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: 1364 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET; 1365 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN: 1366 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; 1367 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: 1368 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; 1369 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN: 1370 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET; 1371 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: 1372 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; 1373 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: 1374 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET; 1375 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN: 1376 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET; 1377 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN: 1378 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET; 1379 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: 1380 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET; 1381 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN: 1382 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET; 1383 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN: 1384 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET; 1385 default: 1386 return -1; 1387 } 1388 } 1389 1390 static int getOffenMUBUFStore(unsigned Opc) { 1391 switch (Opc) { 1392 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 1393 return AMDGPU::BUFFER_STORE_DWORD_OFFEN; 1394 case AMDGPU::BUFFER_STORE_BYTE_OFFSET: 1395 return AMDGPU::BUFFER_STORE_BYTE_OFFEN; 1396 case AMDGPU::BUFFER_STORE_SHORT_OFFSET: 1397 return AMDGPU::BUFFER_STORE_SHORT_OFFEN; 1398 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: 1399 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; 1400 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET: 1401 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN; 1402 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET: 1403 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; 1404 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET: 1405 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN; 1406 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET: 1407 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN; 1408 default: 1409 return -1; 1410 } 1411 } 1412 1413 static int getOffenMUBUFLoad(unsigned Opc) { 1414 switch (Opc) { 1415 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 1416 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN; 1417 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET: 1418 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN; 1419 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET: 1420 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN; 1421 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET: 1422 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN; 1423 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET: 1424 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN; 1425 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET: 1426 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; 1427 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET: 1428 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN; 1429 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET: 1430 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN; 1431 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET: 1432 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN; 1433 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET: 1434 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN; 1435 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET: 1436 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN; 1437 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET: 1438 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN; 1439 case 
AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET: 1440 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN; 1441 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET: 1442 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN; 1443 default: 1444 return -1; 1445 } 1446 } 1447 1448 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, 1449 MachineBasicBlock &MBB, 1450 MachineBasicBlock::iterator MI, 1451 int Index, unsigned Lane, 1452 unsigned ValueReg, bool IsKill) { 1453 MachineFunction *MF = MBB.getParent(); 1454 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1455 const SIInstrInfo *TII = ST.getInstrInfo(); 1456 1457 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); 1458 1459 if (Reg == AMDGPU::NoRegister) 1460 return MachineInstrBuilder(); 1461 1462 bool IsStore = MI->mayStore(); 1463 MachineRegisterInfo &MRI = MF->getRegInfo(); 1464 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 1465 1466 unsigned Dst = IsStore ? Reg : ValueReg; 1467 unsigned Src = IsStore ? ValueReg : Reg; 1468 bool IsVGPR = TRI->isVGPR(MRI, Reg); 1469 DebugLoc DL = MI->getDebugLoc(); 1470 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) { 1471 // Spiller during regalloc may restore a spilled register to its superclass. 1472 // It could result in AGPR spills restored to VGPRs or the other way around, 1473 // making the src and dst with identical regclasses at this point. It just 1474 // needs a copy in such cases. 1475 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst) 1476 .addReg(Src, getKillRegState(IsKill)); 1477 CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1478 return CopyMIB; 1479 } 1480 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 1481 : AMDGPU::V_ACCVGPR_READ_B32_e64; 1482 1483 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst) 1484 .addReg(Src, getKillRegState(IsKill)); 1485 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1486 return MIB; 1487 } 1488 1489 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not 1490 // need to handle the case where an SGPR may need to be spilled while spilling. 1491 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, 1492 MachineFrameInfo &MFI, 1493 MachineBasicBlock::iterator MI, 1494 int Index, 1495 int64_t Offset) { 1496 const SIInstrInfo *TII = ST.getInstrInfo(); 1497 MachineBasicBlock *MBB = MI->getParent(); 1498 const DebugLoc &DL = MI->getDebugLoc(); 1499 bool IsStore = MI->mayStore(); 1500 1501 unsigned Opc = MI->getOpcode(); 1502 int LoadStoreOp = IsStore ? 
1503 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc); 1504 if (LoadStoreOp == -1) 1505 return false; 1506 1507 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); 1508 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr()) 1509 return true; 1510 1511 MachineInstrBuilder NewMI = 1512 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) 1513 .add(*Reg) 1514 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) 1515 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) 1516 .addImm(Offset) 1517 .addImm(0) // cpol 1518 .addImm(0) // swz 1519 .cloneMemRefs(*MI); 1520 1521 const MachineOperand *VDataIn = TII->getNamedOperand(*MI, 1522 AMDGPU::OpName::vdata_in); 1523 if (VDataIn) 1524 NewMI.add(*VDataIn); 1525 return true; 1526 } 1527 1528 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, 1529 unsigned LoadStoreOp, 1530 unsigned EltSize) { 1531 bool IsStore = TII->get(LoadStoreOp).mayStore(); 1532 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr); 1533 bool UseST = 1534 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr); 1535 1536 // Handle block load/store first. 1537 if (TII->isBlockLoadStore(LoadStoreOp)) 1538 return LoadStoreOp; 1539 1540 switch (EltSize) { 1541 case 4: 1542 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1543 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR; 1544 break; 1545 case 8: 1546 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR 1547 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR; 1548 break; 1549 case 12: 1550 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR 1551 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR; 1552 break; 1553 case 16: 1554 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR 1555 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR; 1556 break; 1557 default: 1558 llvm_unreachable("Unexpected spill load/store size!"); 1559 } 1560 1561 if (HasVAddr) 1562 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp); 1563 else if (UseST) 1564 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 1565 1566 return LoadStoreOp; 1567 } 1568 1569 void SIRegisterInfo::buildSpillLoadStore( 1570 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, 1571 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill, 1572 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO, 1573 RegScavenger *RS, LiveRegUnits *LiveUnits) const { 1574 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both"); 1575 1576 MachineFunction *MF = MBB.getParent(); 1577 const SIInstrInfo *TII = ST.getInstrInfo(); 1578 const MachineFrameInfo &MFI = MF->getFrameInfo(); 1579 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); 1580 1581 const MCInstrDesc *Desc = &TII->get(LoadStoreOp); 1582 bool IsStore = Desc->mayStore(); 1583 bool IsFlat = TII->isFLATScratch(LoadStoreOp); 1584 bool IsBlock = TII->isBlockLoadStore(LoadStoreOp); 1585 1586 bool CanClobberSCC = false; 1587 bool Scavenged = false; 1588 MCRegister SOffset = ScratchOffsetReg; 1589 1590 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); 1591 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores. 1592 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC); 1593 const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8; 1594 1595 // Always use 4 byte operations for AGPRs because we need to scavenge 1596 // a temporary VGPR. 
1597   // If we're using a block operation, the element should be the whole block.
1598   unsigned EltSize = IsBlock               ? RegWidth
1599                      : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
1600                                            : 4u;
1601   unsigned NumSubRegs = RegWidth / EltSize;
1602   unsigned Size = NumSubRegs * EltSize;
1603   unsigned RemSize = RegWidth - Size;
1604   unsigned NumRemSubRegs = RemSize ? 1 : 0;
1605   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1606   int64_t MaterializedOffset = Offset;
1607
1608   int64_t MaxOffset = Offset + Size + RemSize - EltSize;
1609   int64_t ScratchOffsetRegDelta = 0;
1610
1611   if (IsFlat && EltSize > 4) {
1612     LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1613     Desc = &TII->get(LoadStoreOp);
1614   }
1615
1616   Align Alignment = MFI.getObjectAlign(Index);
1617   const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1618
1619   assert((IsFlat || ((Offset % EltSize) == 0)) &&
1620          "unexpected VGPR spill offset");
1621
1622   // Track a VGPR to use for a constant offset we need to materialize.
1623   Register TmpOffsetVGPR;
1624
1625   // Track a VGPR to use as an intermediate value.
1626   Register TmpIntermediateVGPR;
1627   bool UseVGPROffset = false;
1628
1629   // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1630   // combination.
1631   auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1632                                 int64_t VOffset) {
1633     // We are using a VGPR offset
1634     if (IsFlat && SGPRBase) {
1635       // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1636       // SGPR, so perform the add as vector.
1637       // We don't need a base SGPR in the kernel.
1638
1639       if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1640         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1641             .addReg(SGPRBase)
1642             .addImm(VOffset)
1643             .addImm(0); // clamp
1644       } else {
1645         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1646             .addReg(SGPRBase);
1647         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1648             .addImm(VOffset)
1649             .addReg(TmpOffsetVGPR);
1650       }
1651     } else {
1652       assert(TmpOffsetVGPR);
1653       BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1654           .addImm(VOffset);
1655     }
1656   };
1657
1658   bool IsOffsetLegal =
1659       IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1660                                       SIInstrFlags::FlatScratch)
1661              : TII->isLegalMUBUFImmOffset(MaxOffset);
1662   if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1663     SOffset = MCRegister();
1664
1665     // We don't have access to the register scavenger if this function is called
1666     // during PEI::scavengeFrameVirtualRegs(), so use LiveUnits in this case.
1667     // TODO: Clobbering SCC is not necessary for scratch instructions in the
1668     // entry.
1669     if (RS) {
1670       SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1671
1672       // Piggy back on the liveness scan we just did to see if SCC is dead.
1673       CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1674     } else if (LiveUnits) {
1675       CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1676       for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1677         if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1678           SOffset = Reg;
1679           break;
1680         }
1681       }
1682     }
1683
1684     if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1685       SOffset = Register();
1686
1687     if (!SOffset) {
1688       UseVGPROffset = true;
1689
1690       if (RS) {
1691         TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1692       } else {
1693         assert(LiveUnits);
1694         for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1695           if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1696             TmpOffsetVGPR = Reg;
1697             break;
1698           }
1699         }
1700       }
1701
1702       assert(TmpOffsetVGPR);
1703     } else if (!SOffset && CanClobberSCC) {
1704       // There are no free SGPRs, and we are in the process of spilling VGPRs
1705       // too. Since we need a VGPR in order to spill SGPRs (this is true on
1706       // SI/CI, and on VI it is true until we implement spilling using scalar
1707       // stores), we have no way to free up an SGPR. Our solution here is to
1708       // add the offset directly to the ScratchOffset or StackPtrOffset
1709       // register, and then subtract the offset after the spill to return the
1710       // register to its original value.
1711
1712       // TODO: If we don't have to do an emergency stack slot spill, converting
1713       // to use the VGPR offset is fewer instructions.
1714       if (!ScratchOffsetReg)
1715         ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1716       SOffset = ScratchOffsetReg;
1717       ScratchOffsetRegDelta = Offset;
1718     } else {
1719       Scavenged = true;
1720     }
1721
1722     // We currently only support spilling VGPRs to EltSize boundaries, meaning
1723     // we can simplify the adjustment of Offset here to just scale with
1724     // WavefrontSize.
1725     if (!IsFlat && !UseVGPROffset)
1726       Offset *= ST.getWavefrontSize();
1727
1728     if (!UseVGPROffset && !SOffset)
1729       report_fatal_error("could not scavenge SGPR to spill in entry function");
1730
1731     if (UseVGPROffset) {
1732       // We are using a VGPR offset
1733       MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1734     } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1735       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1736     } else {
1737       assert(Offset != 0);
1738       auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1739                      .addReg(ScratchOffsetReg)
1740                      .addImm(Offset);
1741       Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1742     }
1743
1744     Offset = 0;
1745   }
1746
1747   if (IsFlat && SOffset == AMDGPU::NoRegister) {
1748     assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1749            && "Unexpected vaddr for flat scratch with a FI operand");
1750
1751     if (UseVGPROffset) {
1752       LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1753     } else {
1754       assert(ST.hasFlatScratchSTMode());
1755       assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1756       LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1757     }
1758
1759     Desc = &TII->get(LoadStoreOp);
1760   }
1761
1762   for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1763        ++i, RegOffset += EltSize) {
1764     if (i == NumSubRegs) {
1765       EltSize = RemSize;
1766       LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1767     }
1768     Desc = &TII->get(LoadStoreOp);
1769
1770     if (!IsFlat && UseVGPROffset) {
1771       int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1772                                    : getOffenMUBUFLoad(LoadStoreOp);
1773       Desc = &TII->get(NewLoadStoreOp);
1774     }
1775
1776     if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1777       // If we are spilling an AGPR beyond the range of the memory instruction
1778       // offset and need to use a VGPR offset, we ideally have at least 2
1779       // scratch VGPRs. If we don't have a second free VGPR without spilling,
1780       // recycle the VGPR used for the offset, which requires resetting it
1781       // after each subregister.
1782
1783       MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1784     }
1785
1786     unsigned NumRegs = EltSize / 4;
1787     Register SubReg = e == 1
1788                           ? ValueReg
1789                           : Register(getSubReg(ValueReg,
1790                                                getSubRegFromChannel(RegOffset / 4, NumRegs)));
1791
1792     unsigned SOffsetRegState = 0;
1793     unsigned SrcDstRegState = getDefRegState(!IsStore);
1794     const bool IsLastSubReg = i + 1 == e;
1795     const bool IsFirstSubReg = i == 0;
1796     if (IsLastSubReg) {
1797       SOffsetRegState |= getKillRegState(Scavenged);
1798       // The last implicit use carries the "Kill" flag.
1799       SrcDstRegState |= getKillRegState(IsKill);
1800     }
1801
1802     // Make sure the whole register is defined if there are undef components by
1803     // adding an implicit def of the super-reg on the first instruction.
1804     bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1805     bool NeedSuperRegImpOperand = e > 1;
1806
1807     // Remaining element size to spill into memory after some parts of it have
1808     // been spilled into either AGPRs or VGPRs.
1809     unsigned RemEltSize = EltSize;
1810
1811     // AGPRs used to spill VGPRs (and vice versa) are allocated in reverse
1812     // order, starting from the last lane. If a register cannot be completely
1813     // spilled into another register, this ensures its alignment does not
1814     // change. For targets with a VGPR alignment requirement this is important
1815     // when flat scratch is used, as we might otherwise get a scratch_load or
1816     // scratch_store of an unaligned register.
1817     for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1818              LaneE = RegOffset / 4;
1819          Lane >= LaneE; --Lane) {
1820       bool IsSubReg = e > 1 || EltSize > 4;
1821       Register Sub = IsSubReg
1822                          ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1823                          : ValueReg;
1824       auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1825       if (!MIB.getInstr())
1826         break;
1827       if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1828         MIB.addReg(ValueReg, RegState::ImplicitDefine);
1829         NeedSuperRegDef = false;
1830       }
1831       if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1832         NeedSuperRegImpOperand = true;
1833         unsigned State = SrcDstRegState;
1834         if (!IsLastSubReg || (Lane != LaneE))
1835           State &= ~RegState::Kill;
1836         if (!IsFirstSubReg || (Lane != LaneS))
1837           State &= ~RegState::Define;
1838         MIB.addReg(ValueReg, RegState::Implicit | State);
1839       }
1840       RemEltSize -= 4;
1841     }
1842
1843     if (!RemEltSize) // Fully spilled into AGPRs.
1844 continue; 1845 1846 if (RemEltSize != EltSize) { // Partially spilled to AGPRs 1847 assert(IsFlat && EltSize > 4); 1848 1849 unsigned NumRegs = RemEltSize / 4; 1850 SubReg = Register(getSubReg(ValueReg, 1851 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1852 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize); 1853 Desc = &TII->get(Opc); 1854 } 1855 1856 unsigned FinalReg = SubReg; 1857 1858 if (IsAGPR) { 1859 assert(EltSize == 4); 1860 1861 if (!TmpIntermediateVGPR) { 1862 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy(); 1863 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR)); 1864 } 1865 if (IsStore) { 1866 auto AccRead = BuildMI(MBB, MI, DL, 1867 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), 1868 TmpIntermediateVGPR) 1869 .addReg(SubReg, getKillRegState(IsKill)); 1870 if (NeedSuperRegDef) 1871 AccRead.addReg(ValueReg, RegState::ImplicitDefine); 1872 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg)) 1873 AccRead.addReg(ValueReg, RegState::Implicit); 1874 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1875 } 1876 SubReg = TmpIntermediateVGPR; 1877 } else if (UseVGPROffset) { 1878 if (!TmpOffsetVGPR) { 1879 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, 1880 MI, false, 0); 1881 RS->setRegUsed(TmpOffsetVGPR); 1882 } 1883 } 1884 1885 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); 1886 MachineMemOperand *NewMMO = 1887 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, 1888 commonAlignment(Alignment, RegOffset)); 1889 1890 auto MIB = 1891 BuildMI(MBB, MI, DL, *Desc) 1892 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)); 1893 1894 if (UseVGPROffset) { 1895 // For an AGPR spill, we reuse the same temp VGPR for the offset and the 1896 // intermediate accvgpr_write. 1897 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR)); 1898 } 1899 1900 if (!IsFlat) 1901 MIB.addReg(FuncInfo->getScratchRSrcReg()); 1902 1903 if (SOffset == AMDGPU::NoRegister) { 1904 if (!IsFlat) { 1905 if (UseVGPROffset && ScratchOffsetReg) { 1906 MIB.addReg(ScratchOffsetReg); 1907 } else { 1908 assert(FuncInfo->isBottomOfStack()); 1909 MIB.addImm(0); 1910 } 1911 } 1912 } else { 1913 MIB.addReg(SOffset, SOffsetRegState); 1914 } 1915 1916 MIB.addImm(Offset + RegOffset); 1917 1918 bool LastUse = MMO->getFlags() & MOLastUse; 1919 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol 1920 1921 if (!IsFlat) 1922 MIB.addImm(0); // swz 1923 MIB.addMemOperand(NewMMO); 1924 1925 if (!IsAGPR && NeedSuperRegDef) 1926 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1927 1928 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) { 1929 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), 1930 FinalReg) 1931 .addReg(TmpIntermediateVGPR, RegState::Kill); 1932 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1933 } 1934 1935 bool IsSrcDstDef = SrcDstRegState & RegState::Define; 1936 if (NeedSuperRegImpOperand && 1937 (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) 1938 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 1939 1940 // The epilog restore of a wwm-scratch register can cause undesired 1941 // optimization during machine-cp post PrologEpilogInserter if the same 1942 // register was assigned for return value ABI lowering with a COPY 1943 // instruction. As given below, with the epilog reload, the earlier COPY 1944 // appeared to be dead during machine-cp. 1945 // ... 1946 // v0 in WWM operation, needs the WWM spill at prolog/epilog. 
1947 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0 1948 // ... 1949 // Epilog block: 1950 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0 1951 // ... 1952 // WWM spill restore to preserve the inactive lanes of v0. 1953 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1 1954 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0 1955 // $exec = S_MOV_B64 killed $sgpr4_sgpr5 1956 // ... 1957 // SI_RETURN implicit $vgpr0 1958 // ... 1959 // To fix it, mark the same reg as a tied op for such restore instructions 1960 // so that it marks a usage for the preceding COPY. 1961 if (!IsStore && MI != MBB.end() && MI->isReturn() && 1962 MI->readsRegister(SubReg, this)) { 1963 MIB.addReg(SubReg, RegState::Implicit); 1964 MIB->tieOperands(0, MIB->getNumOperands() - 1); 1965 } 1966 1967 // If we're building a block load, we should add artificial uses for the 1968 // CSR VGPRs that are *not* being transferred. This is because liveness 1969 // analysis is not aware of the mask, so we need to somehow inform it that 1970 // those registers are not available before the load and they should not be 1971 // scavenged. 1972 if (!IsStore && TII->isBlockLoadStore(LoadStoreOp)) 1973 addImplicitUsesForBlockCSRLoad(MIB, ValueReg); 1974 } 1975 1976 if (ScratchOffsetRegDelta != 0) { 1977 // Subtract the offset we added to the ScratchOffset register. 1978 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1979 .addReg(SOffset) 1980 .addImm(-ScratchOffsetRegDelta); 1981 } 1982 } 1983 1984 void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, 1985 Register BlockReg) const { 1986 const MachineFunction *MF = MIB->getParent()->getParent(); 1987 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); 1988 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg); 1989 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0); 1990 for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset) 1991 if (!(Mask & (1 << RegOffset)) && 1992 isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF)) 1993 MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit); 1994 } 1995 1996 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, 1997 int Offset, bool IsLoad, 1998 bool IsKill) const { 1999 // Load/store VGPR 2000 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo(); 2001 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); 2002 2003 Register FrameReg = 2004 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF) 2005 ? getBaseRegister() 2006 : getFrameRegister(SB.MF); 2007 2008 Align Alignment = FrameInfo.getObjectAlign(Index); 2009 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index); 2010 MachineMemOperand *MMO = SB.MF.getMachineMemOperand( 2011 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, 2012 SB.EltSize, Alignment); 2013 2014 if (IsLoad) { 2015 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR 2016 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 2017 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false, 2018 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS); 2019 } else { 2020 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR
2021                                    : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2022     buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
2023                         FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
2024     // This only ever adds one VGPR spill
2025     SB.MFI.addToSpilledVGPRs(1);
2026   }
2027 }
2028
2029 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
2030                                RegScavenger *RS, SlotIndexes *Indexes,
2031                                LiveIntervals *LIS, bool OnlyToVGPR,
2032                                bool SpillToPhysVGPRLane) const {
2033   assert(!MI->getOperand(0).isUndef() &&
2034          "undef spill should have been deleted earlier");
2035
2036   SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2037
2038   ArrayRef<SpilledReg> VGPRSpills =
2039       SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2040                           : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2041   bool SpillToVGPR = !VGPRSpills.empty();
2042   if (OnlyToVGPR && !SpillToVGPR)
2043     return false;
2044
2045   assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
2046                          SB.SuperReg != SB.MFI.getFrameOffsetReg()));
2047
2048   if (SpillToVGPR) {
2049
2050     // Since the stack slot coloring pass tries to optimize SGPR spills, VGPR
2051     // lanes (mapped from the spill stack slot) may be shared by SGPR spills
2052     // of different sizes, so the number of VGPR lanes allotted corresponds to
2053     // the largest SGPR being spilled into them.
2054     assert(SB.NumSubRegs <= VGPRSpills.size() &&
2055            "Num of SGPRs spilled should be less than or equal to num of "
2056            "the VGPR lanes.");
2057
2058     for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2059       Register SubReg =
2060           SB.NumSubRegs == 1
2061               ? SB.SuperReg
2062               : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2063       SpilledReg Spill = VGPRSpills[i];
2064
2065       bool IsFirstSubreg = i == 0;
2066       bool IsLastSubreg = i == SB.NumSubRegs - 1;
2067       bool UseKill = SB.IsKill && IsLastSubreg;
2068
2069
2070       // Mark the "old value of vgpr" input undef only if this is the first sgpr
2071       // spill to this specific vgpr in the first basic block.
2072       auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2073                          SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
2074                      .addReg(SubReg, getKillRegState(UseKill))
2075                      .addImm(Spill.Lane)
2076                      .addReg(Spill.VGPR);
2077       if (Indexes) {
2078         if (IsFirstSubreg)
2079           Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2080         else
2081           Indexes->insertMachineInstrInMaps(*MIB);
2082       }
2083
2084       if (IsFirstSubreg && SB.NumSubRegs > 1) {
2085         // We may be spilling a super-register which is only partially defined,
2086         // and need to ensure later spills think the value is defined.
2087         MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2088       }
2089
2090       if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2091         MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
2092
2093       // FIXME: Since this spills to another register instead of an actual
2094       // frame index, we should delete the frame index when all references to
2095       // it are fixed.
2096     }
2097   } else {
2098     SB.prepare();
2099
2100     // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
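    // Rough shape of the memory-spill path that follows, for a wave64 spill of
    // s[4:7] (illustrative only; the SI_SPILL_S32_TO_VGPR pseudos later lower
    // to v_writelane_b32, and the temporary VGPR name is made up):
    //   v_writelane_b32 vTmp, s4, 0
    //   v_writelane_b32 vTmp, s5, 1
    //   v_writelane_b32 vTmp, s6, 2
    //   v_writelane_b32 vTmp, s7, 3
    //   ; vTmp is then written to the stack slot via readWriteTmpVGPR().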
2101 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 2102 2103 // Per VGPR helper data 2104 auto PVD = SB.getPerVGPRData(); 2105 2106 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 2107 unsigned TmpVGPRFlags = RegState::Undef; 2108 2109 // Write sub registers into the VGPR 2110 for (unsigned i = Offset * PVD.PerVGPR, 2111 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 2112 i < e; ++i) { 2113 Register SubReg = 2114 SB.NumSubRegs == 1 2115 ? SB.SuperReg 2116 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 2117 2118 MachineInstrBuilder WriteLane = 2119 BuildMI(*SB.MBB, MI, SB.DL, 2120 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR) 2121 .addReg(SubReg, SubKillState) 2122 .addImm(i % PVD.PerVGPR) 2123 .addReg(SB.TmpVGPR, TmpVGPRFlags); 2124 TmpVGPRFlags = 0; 2125 2126 if (Indexes) { 2127 if (i == 0) 2128 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane); 2129 else 2130 Indexes->insertMachineInstrInMaps(*WriteLane); 2131 } 2132 2133 // There could be undef components of a spilled super register. 2134 // TODO: Can we detect this and skip the spill? 2135 if (SB.NumSubRegs > 1) { 2136 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 2137 unsigned SuperKillState = 0; 2138 if (i + 1 == SB.NumSubRegs) 2139 SuperKillState |= getKillRegState(SB.IsKill); 2140 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 2141 } 2142 } 2143 2144 // Write out VGPR 2145 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false); 2146 } 2147 2148 SB.restore(); 2149 } 2150 2151 MI->eraseFromParent(); 2152 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 2153 2154 if (LIS) 2155 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 2156 2157 return true; 2158 } 2159 2160 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, 2161 RegScavenger *RS, SlotIndexes *Indexes, 2162 LiveIntervals *LIS, bool OnlyToVGPR, 2163 bool SpillToPhysVGPRLane) const { 2164 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 2165 2166 ArrayRef<SpilledReg> VGPRSpills = 2167 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) 2168 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); 2169 bool SpillToVGPR = !VGPRSpills.empty(); 2170 if (OnlyToVGPR && !SpillToVGPR) 2171 return false; 2172 2173 if (SpillToVGPR) { 2174 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 2175 Register SubReg = 2176 SB.NumSubRegs == 1 2177 ? SB.SuperReg 2178 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 2179 2180 SpilledReg Spill = VGPRSpills[i]; 2181 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 2182 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) 2183 .addReg(Spill.VGPR) 2184 .addImm(Spill.Lane); 2185 if (SB.NumSubRegs > 1 && i == 0) 2186 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 2187 if (Indexes) { 2188 if (i == e - 1) 2189 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 2190 else 2191 Indexes->insertMachineInstrInMaps(*MIB); 2192 } 2193 } 2194 } else { 2195 SB.prepare(); 2196 2197 // Per VGPR helper data 2198 auto PVD = SB.getPerVGPRData(); 2199 2200 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 2201 // Load in VGPR data 2202 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true); 2203 2204 // Unpack lanes 2205 for (unsigned i = Offset * PVD.PerVGPR, 2206 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 2207 i < e; ++i) { 2208 Register SubReg = 2209 SB.NumSubRegs == 1 2210 ? 
SB.SuperReg 2211 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 2212 2213 bool LastSubReg = (i + 1 == e); 2214 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 2215 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) 2216 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 2217 .addImm(i); 2218 if (SB.NumSubRegs > 1 && i == 0) 2219 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 2220 if (Indexes) { 2221 if (i == e - 1) 2222 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 2223 else 2224 Indexes->insertMachineInstrInMaps(*MIB); 2225 } 2226 } 2227 } 2228 2229 SB.restore(); 2230 } 2231 2232 MI->eraseFromParent(); 2233 2234 if (LIS) 2235 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 2236 2237 return true; 2238 } 2239 2240 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, 2241 MachineBasicBlock &RestoreMBB, 2242 Register SGPR, RegScavenger *RS) const { 2243 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0, 2244 RS); 2245 SB.prepare(); 2246 // Generate the spill of SGPR to SB.TmpVGPR. 2247 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 2248 auto PVD = SB.getPerVGPRData(); 2249 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 2250 unsigned TmpVGPRFlags = RegState::Undef; 2251 // Write sub registers into the VGPR 2252 for (unsigned i = Offset * PVD.PerVGPR, 2253 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 2254 i < e; ++i) { 2255 Register SubReg = 2256 SB.NumSubRegs == 1 2257 ? SB.SuperReg 2258 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 2259 2260 MachineInstrBuilder WriteLane = 2261 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 2262 SB.TmpVGPR) 2263 .addReg(SubReg, SubKillState) 2264 .addImm(i % PVD.PerVGPR) 2265 .addReg(SB.TmpVGPR, TmpVGPRFlags); 2266 TmpVGPRFlags = 0; 2267 // There could be undef components of a spilled super register. 2268 // TODO: Can we detect this and skip the spill? 2269 if (SB.NumSubRegs > 1) { 2270 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 2271 unsigned SuperKillState = 0; 2272 if (i + 1 == SB.NumSubRegs) 2273 SuperKillState |= getKillRegState(SB.IsKill); 2274 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 2275 } 2276 } 2277 // Don't need to write VGPR out. 2278 } 2279 2280 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); 2281 2282 // Restore clobbered registers in the specified restore block. 2283 MI = RestoreMBB.end(); 2284 SB.setMI(&RestoreMBB, MI); 2285 // Generate the restore of SGPR from SB.TmpVGPR. 2286 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 2287 // Don't need to load VGPR in. 2288 // Unpack lanes 2289 for (unsigned i = Offset * PVD.PerVGPR, 2290 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 2291 i < e; ++i) { 2292 Register SubReg = 2293 SB.NumSubRegs == 1 2294 ? SB.SuperReg 2295 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 2296 MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass); 2297 bool LastSubReg = (i + 1 == e); 2298 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 2299 SubReg) 2300 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 2301 .addImm(i); 2302 if (SB.NumSubRegs > 1 && i == 0) 2303 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 2304 } 2305 } 2306 SB.restore(); 2307 2308 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 2309 return false; 2310 } 2311 2312 /// Special case of eliminateFrameIndex. 
Returns true if the SGPR was spilled to 2313 /// a VGPR and the stack slot can be safely eliminated when all other users are 2314 /// handled. 2315 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 2316 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, 2317 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const { 2318 switch (MI->getOpcode()) { 2319 case AMDGPU::SI_SPILL_S1024_SAVE: 2320 case AMDGPU::SI_SPILL_S512_SAVE: 2321 case AMDGPU::SI_SPILL_S384_SAVE: 2322 case AMDGPU::SI_SPILL_S352_SAVE: 2323 case AMDGPU::SI_SPILL_S320_SAVE: 2324 case AMDGPU::SI_SPILL_S288_SAVE: 2325 case AMDGPU::SI_SPILL_S256_SAVE: 2326 case AMDGPU::SI_SPILL_S224_SAVE: 2327 case AMDGPU::SI_SPILL_S192_SAVE: 2328 case AMDGPU::SI_SPILL_S160_SAVE: 2329 case AMDGPU::SI_SPILL_S128_SAVE: 2330 case AMDGPU::SI_SPILL_S96_SAVE: 2331 case AMDGPU::SI_SPILL_S64_SAVE: 2332 case AMDGPU::SI_SPILL_S32_SAVE: 2333 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); 2334 case AMDGPU::SI_SPILL_S1024_RESTORE: 2335 case AMDGPU::SI_SPILL_S512_RESTORE: 2336 case AMDGPU::SI_SPILL_S384_RESTORE: 2337 case AMDGPU::SI_SPILL_S352_RESTORE: 2338 case AMDGPU::SI_SPILL_S320_RESTORE: 2339 case AMDGPU::SI_SPILL_S288_RESTORE: 2340 case AMDGPU::SI_SPILL_S256_RESTORE: 2341 case AMDGPU::SI_SPILL_S224_RESTORE: 2342 case AMDGPU::SI_SPILL_S192_RESTORE: 2343 case AMDGPU::SI_SPILL_S160_RESTORE: 2344 case AMDGPU::SI_SPILL_S128_RESTORE: 2345 case AMDGPU::SI_SPILL_S96_RESTORE: 2346 case AMDGPU::SI_SPILL_S64_RESTORE: 2347 case AMDGPU::SI_SPILL_S32_RESTORE: 2348 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); 2349 default: 2350 llvm_unreachable("not an SGPR spill instruction"); 2351 } 2352 } 2353 2354 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 2355 int SPAdj, unsigned FIOperandNum, 2356 RegScavenger *RS) const { 2357 MachineFunction *MF = MI->getParent()->getParent(); 2358 MachineBasicBlock *MBB = MI->getParent(); 2359 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 2360 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 2361 const SIInstrInfo *TII = ST.getInstrInfo(); 2362 const DebugLoc &DL = MI->getDebugLoc(); 2363 2364 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 2365 2366 assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) && 2367 "unreserved scratch RSRC register"); 2368 2369 MachineOperand *FIOp = &MI->getOperand(FIOperandNum); 2370 int Index = MI->getOperand(FIOperandNum).getIndex(); 2371 2372 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) 2373 ? 
getBaseRegister() 2374 : getFrameRegister(*MF); 2375 2376 switch (MI->getOpcode()) { 2377 // SGPR register spill 2378 case AMDGPU::SI_SPILL_S1024_SAVE: 2379 case AMDGPU::SI_SPILL_S512_SAVE: 2380 case AMDGPU::SI_SPILL_S384_SAVE: 2381 case AMDGPU::SI_SPILL_S352_SAVE: 2382 case AMDGPU::SI_SPILL_S320_SAVE: 2383 case AMDGPU::SI_SPILL_S288_SAVE: 2384 case AMDGPU::SI_SPILL_S256_SAVE: 2385 case AMDGPU::SI_SPILL_S224_SAVE: 2386 case AMDGPU::SI_SPILL_S192_SAVE: 2387 case AMDGPU::SI_SPILL_S160_SAVE: 2388 case AMDGPU::SI_SPILL_S128_SAVE: 2389 case AMDGPU::SI_SPILL_S96_SAVE: 2390 case AMDGPU::SI_SPILL_S64_SAVE: 2391 case AMDGPU::SI_SPILL_S32_SAVE: { 2392 return spillSGPR(MI, Index, RS); 2393 } 2394 2395 // SGPR register restore 2396 case AMDGPU::SI_SPILL_S1024_RESTORE: 2397 case AMDGPU::SI_SPILL_S512_RESTORE: 2398 case AMDGPU::SI_SPILL_S384_RESTORE: 2399 case AMDGPU::SI_SPILL_S352_RESTORE: 2400 case AMDGPU::SI_SPILL_S320_RESTORE: 2401 case AMDGPU::SI_SPILL_S288_RESTORE: 2402 case AMDGPU::SI_SPILL_S256_RESTORE: 2403 case AMDGPU::SI_SPILL_S224_RESTORE: 2404 case AMDGPU::SI_SPILL_S192_RESTORE: 2405 case AMDGPU::SI_SPILL_S160_RESTORE: 2406 case AMDGPU::SI_SPILL_S128_RESTORE: 2407 case AMDGPU::SI_SPILL_S96_RESTORE: 2408 case AMDGPU::SI_SPILL_S64_RESTORE: 2409 case AMDGPU::SI_SPILL_S32_RESTORE: { 2410 return restoreSGPR(MI, Index, RS); 2411 } 2412 2413 // VGPR register spill 2414 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: { 2415 // Put mask into M0. 2416 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), 2417 AMDGPU::M0) 2418 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask)); 2419 LLVM_FALLTHROUGH; 2420 } 2421 case AMDGPU::SI_SPILL_V1024_SAVE: 2422 case AMDGPU::SI_SPILL_V512_SAVE: 2423 case AMDGPU::SI_SPILL_V384_SAVE: 2424 case AMDGPU::SI_SPILL_V352_SAVE: 2425 case AMDGPU::SI_SPILL_V320_SAVE: 2426 case AMDGPU::SI_SPILL_V288_SAVE: 2427 case AMDGPU::SI_SPILL_V256_SAVE: 2428 case AMDGPU::SI_SPILL_V224_SAVE: 2429 case AMDGPU::SI_SPILL_V192_SAVE: 2430 case AMDGPU::SI_SPILL_V160_SAVE: 2431 case AMDGPU::SI_SPILL_V128_SAVE: 2432 case AMDGPU::SI_SPILL_V96_SAVE: 2433 case AMDGPU::SI_SPILL_V64_SAVE: 2434 case AMDGPU::SI_SPILL_V32_SAVE: 2435 case AMDGPU::SI_SPILL_V16_SAVE: 2436 case AMDGPU::SI_SPILL_A1024_SAVE: 2437 case AMDGPU::SI_SPILL_A512_SAVE: 2438 case AMDGPU::SI_SPILL_A384_SAVE: 2439 case AMDGPU::SI_SPILL_A352_SAVE: 2440 case AMDGPU::SI_SPILL_A320_SAVE: 2441 case AMDGPU::SI_SPILL_A288_SAVE: 2442 case AMDGPU::SI_SPILL_A256_SAVE: 2443 case AMDGPU::SI_SPILL_A224_SAVE: 2444 case AMDGPU::SI_SPILL_A192_SAVE: 2445 case AMDGPU::SI_SPILL_A160_SAVE: 2446 case AMDGPU::SI_SPILL_A128_SAVE: 2447 case AMDGPU::SI_SPILL_A96_SAVE: 2448 case AMDGPU::SI_SPILL_A64_SAVE: 2449 case AMDGPU::SI_SPILL_A32_SAVE: 2450 case AMDGPU::SI_SPILL_AV1024_SAVE: 2451 case AMDGPU::SI_SPILL_AV512_SAVE: 2452 case AMDGPU::SI_SPILL_AV384_SAVE: 2453 case AMDGPU::SI_SPILL_AV352_SAVE: 2454 case AMDGPU::SI_SPILL_AV320_SAVE: 2455 case AMDGPU::SI_SPILL_AV288_SAVE: 2456 case AMDGPU::SI_SPILL_AV256_SAVE: 2457 case AMDGPU::SI_SPILL_AV224_SAVE: 2458 case AMDGPU::SI_SPILL_AV192_SAVE: 2459 case AMDGPU::SI_SPILL_AV160_SAVE: 2460 case AMDGPU::SI_SPILL_AV128_SAVE: 2461 case AMDGPU::SI_SPILL_AV96_SAVE: 2462 case AMDGPU::SI_SPILL_AV64_SAVE: 2463 case AMDGPU::SI_SPILL_AV32_SAVE: 2464 case AMDGPU::SI_SPILL_WWM_V32_SAVE: 2465 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: { 2466 const MachineOperand *VData = TII->getNamedOperand(*MI, 2467 AMDGPU::OpName::vdata); 2468 if (VData->isUndef()) { 2469 MI->eraseFromParent(); 2470 return true; 2471 } 2472 2473 
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2474 MFI->getStackPtrOffsetReg()); 2475 2476 unsigned Opc; 2477 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) { 2478 assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!"); 2479 Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16; 2480 } else { 2481 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE 2482 ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR 2483 : ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 2484 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 2485 } 2486 2487 auto *MBB = MI->getParent(); 2488 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); 2489 if (IsWWMRegSpill) { 2490 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), 2491 RS->isRegUsed(AMDGPU::SCC)); 2492 } 2493 buildSpillLoadStore( 2494 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 2495 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 2496 *MI->memoperands_begin(), RS); 2497 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(*MI, TII)); 2498 if (IsWWMRegSpill) 2499 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); 2500 2501 MI->eraseFromParent(); 2502 return true; 2503 } 2504 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: { 2505 // Put mask into M0. 2506 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), 2507 AMDGPU::M0) 2508 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask)); 2509 LLVM_FALLTHROUGH; 2510 } 2511 case AMDGPU::SI_SPILL_V16_RESTORE: 2512 case AMDGPU::SI_SPILL_V32_RESTORE: 2513 case AMDGPU::SI_SPILL_V64_RESTORE: 2514 case AMDGPU::SI_SPILL_V96_RESTORE: 2515 case AMDGPU::SI_SPILL_V128_RESTORE: 2516 case AMDGPU::SI_SPILL_V160_RESTORE: 2517 case AMDGPU::SI_SPILL_V192_RESTORE: 2518 case AMDGPU::SI_SPILL_V224_RESTORE: 2519 case AMDGPU::SI_SPILL_V256_RESTORE: 2520 case AMDGPU::SI_SPILL_V288_RESTORE: 2521 case AMDGPU::SI_SPILL_V320_RESTORE: 2522 case AMDGPU::SI_SPILL_V352_RESTORE: 2523 case AMDGPU::SI_SPILL_V384_RESTORE: 2524 case AMDGPU::SI_SPILL_V512_RESTORE: 2525 case AMDGPU::SI_SPILL_V1024_RESTORE: 2526 case AMDGPU::SI_SPILL_A32_RESTORE: 2527 case AMDGPU::SI_SPILL_A64_RESTORE: 2528 case AMDGPU::SI_SPILL_A96_RESTORE: 2529 case AMDGPU::SI_SPILL_A128_RESTORE: 2530 case AMDGPU::SI_SPILL_A160_RESTORE: 2531 case AMDGPU::SI_SPILL_A192_RESTORE: 2532 case AMDGPU::SI_SPILL_A224_RESTORE: 2533 case AMDGPU::SI_SPILL_A256_RESTORE: 2534 case AMDGPU::SI_SPILL_A288_RESTORE: 2535 case AMDGPU::SI_SPILL_A320_RESTORE: 2536 case AMDGPU::SI_SPILL_A352_RESTORE: 2537 case AMDGPU::SI_SPILL_A384_RESTORE: 2538 case AMDGPU::SI_SPILL_A512_RESTORE: 2539 case AMDGPU::SI_SPILL_A1024_RESTORE: 2540 case AMDGPU::SI_SPILL_AV32_RESTORE: 2541 case AMDGPU::SI_SPILL_AV64_RESTORE: 2542 case AMDGPU::SI_SPILL_AV96_RESTORE: 2543 case AMDGPU::SI_SPILL_AV128_RESTORE: 2544 case AMDGPU::SI_SPILL_AV160_RESTORE: 2545 case AMDGPU::SI_SPILL_AV192_RESTORE: 2546 case AMDGPU::SI_SPILL_AV224_RESTORE: 2547 case AMDGPU::SI_SPILL_AV256_RESTORE: 2548 case AMDGPU::SI_SPILL_AV288_RESTORE: 2549 case AMDGPU::SI_SPILL_AV320_RESTORE: 2550 case AMDGPU::SI_SPILL_AV352_RESTORE: 2551 case AMDGPU::SI_SPILL_AV384_RESTORE: 2552 case AMDGPU::SI_SPILL_AV512_RESTORE: 2553 case AMDGPU::SI_SPILL_AV1024_RESTORE: 2554 case AMDGPU::SI_SPILL_WWM_V32_RESTORE: 2555 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: { 2556 const MachineOperand *VData = TII->getNamedOperand(*MI, 2557 AMDGPU::OpName::vdata); 2558 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2559 MFI->getStackPtrOffsetReg()); 2560 2561 unsigned Opc; 
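    // Select the base reload opcode; buildSpillLoadStore() then splits or
    // widens it per subregister as needed. 16-bit restores use the t16 short
    // load, block restores use the block load, and everything else starts
    // from a plain dword scratch/buffer load.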
2562 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) { 2563 assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!"); 2564 Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16; 2565 } else { 2566 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE 2567 ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR 2568 : ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR 2569 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 2570 } 2571 2572 auto *MBB = MI->getParent(); 2573 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); 2574 if (IsWWMRegSpill) { 2575 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), 2576 RS->isRegUsed(AMDGPU::SCC)); 2577 } 2578 2579 buildSpillLoadStore( 2580 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 2581 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 2582 *MI->memoperands_begin(), RS); 2583 2584 if (IsWWMRegSpill) 2585 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); 2586 2587 MI->eraseFromParent(); 2588 return true; 2589 } 2590 case AMDGPU::V_ADD_U32_e32: 2591 case AMDGPU::V_ADD_U32_e64: 2592 case AMDGPU::V_ADD_CO_U32_e32: 2593 case AMDGPU::V_ADD_CO_U32_e64: { 2594 // TODO: Handle sub, and, or. 2595 unsigned NumDefs = MI->getNumExplicitDefs(); 2596 unsigned Src0Idx = NumDefs; 2597 2598 bool HasClamp = false; 2599 MachineOperand *VCCOp = nullptr; 2600 2601 switch (MI->getOpcode()) { 2602 case AMDGPU::V_ADD_U32_e32: 2603 break; 2604 case AMDGPU::V_ADD_U32_e64: 2605 HasClamp = MI->getOperand(3).getImm(); 2606 break; 2607 case AMDGPU::V_ADD_CO_U32_e32: 2608 VCCOp = &MI->getOperand(3); 2609 break; 2610 case AMDGPU::V_ADD_CO_U32_e64: 2611 VCCOp = &MI->getOperand(1); 2612 HasClamp = MI->getOperand(4).getImm(); 2613 break; 2614 default: 2615 break; 2616 } 2617 bool DeadVCC = !VCCOp || VCCOp->isDead(); 2618 MachineOperand &DstOp = MI->getOperand(0); 2619 Register DstReg = DstOp.getReg(); 2620 2621 unsigned OtherOpIdx = 2622 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx; 2623 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx); 2624 2625 unsigned Src1Idx = Src0Idx + 1; 2626 Register MaterializedReg = FrameReg; 2627 Register ScavengedVGPR; 2628 2629 int64_t Offset = FrameInfo.getObjectOffset(Index); 2630 // For the non-immediate case, we could fall through to the default 2631 // handling, but we do an in-place update of the result register here to 2632 // avoid scavenging another register. 2633 if (OtherOp->isImm()) { 2634 int64_t TotalOffset = OtherOp->getImm() + Offset; 2635 2636 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) && 2637 !AMDGPU::isInlinableIntLiteral(TotalOffset)) { 2638 // If we can't support a VOP3 literal in the VALU instruction, we 2639 // can't specially fold into the add. 2640 // TODO: Handle VOP3->VOP2 shrink to support the fold. 2641 break; 2642 } 2643 2644 OtherOp->setImm(TotalOffset); 2645 Offset = 0; 2646 } 2647 2648 if (FrameReg && !ST.enableFlatScratch()) { 2649 // We should just do an in-place update of the result register. However, 2650 // the value there may also be used by the add, in which case we need a 2651 // temporary register. 2652 // 2653 // FIXME: The scavenger is not finding the result register in the 2654 // common case where the add does not read the register. 2655 2656 ScavengedVGPR = RS->scavengeRegisterBackwards( 2657 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0); 2658 2659 // TODO: If we have a free SGPR, it's sometimes better to use a scalar 2660 // shift. 
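      // The shift below converts the swizzled (wave-space) frame-register byte
      // offset into a per-lane offset, e.g. for wave64 roughly:
      //   v_lshrrev_b32 vTmp, 6, sFrameReg
      // (register names here are illustrative; vTmp is the scavenged VGPR).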
2661 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64)) 2662 .addDef(ScavengedVGPR, RegState::Renamable) 2663 .addImm(ST.getWavefrontSizeLog2()) 2664 .addReg(FrameReg); 2665 MaterializedReg = ScavengedVGPR; 2666 } 2667 2668 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) { 2669 if (ST.enableFlatScratch() && 2670 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) { 2671 // We didn't need the shift above, so we have an SGPR for the frame 2672 // register, but may have a VGPR only operand. 2673 // 2674 // TODO: On gfx10+, we can easily change the opcode to the e64 version 2675 // and use the higher constant bus restriction to avoid this copy. 2676 2677 if (!ScavengedVGPR) { 2678 ScavengedVGPR = RS->scavengeRegisterBackwards( 2679 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, 2680 /*SPAdj=*/0); 2681 } 2682 2683 assert(ScavengedVGPR != DstReg); 2684 2685 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR) 2686 .addReg(MaterializedReg, 2687 MaterializedReg != FrameReg ? RegState::Kill : 0); 2688 MaterializedReg = ScavengedVGPR; 2689 } 2690 2691 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC 2692 // is not live, we could use a scalar add + vector add instead of 2 2693 // vector adds. 2694 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode())) 2695 .addDef(DstReg, RegState::Renamable); 2696 if (NumDefs == 2) 2697 AddI32.add(MI->getOperand(1)); 2698 2699 unsigned MaterializedRegFlags = 2700 MaterializedReg != FrameReg ? RegState::Kill : 0; 2701 2702 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) { 2703 // If we know we have a VGPR already, it's more likely the other 2704 // operand is a legal vsrc0. 2705 AddI32 2706 .add(*OtherOp) 2707 .addReg(MaterializedReg, MaterializedRegFlags); 2708 } else { 2709 // Commute operands to avoid violating VOP2 restrictions. This will 2710 // typically happen when using scratch. 2711 AddI32 2712 .addReg(MaterializedReg, MaterializedRegFlags) 2713 .add(*OtherOp); 2714 } 2715 2716 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 || 2717 MI->getOpcode() == AMDGPU::V_ADD_U32_e64) 2718 AddI32.addImm(0); // clamp 2719 2720 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32) 2721 AddI32.setOperandDead(3); // Dead vcc 2722 2723 MaterializedReg = DstReg; 2724 2725 OtherOp->ChangeToRegister(MaterializedReg, false); 2726 OtherOp->setIsKill(true); 2727 FIOp->ChangeToImmediate(Offset); 2728 Offset = 0; 2729 } else if (Offset != 0) { 2730 assert(!MaterializedReg); 2731 FIOp->ChangeToImmediate(Offset); 2732 Offset = 0; 2733 } else { 2734 if (DeadVCC && !HasClamp) { 2735 assert(Offset == 0); 2736 2737 // TODO: Losing kills and implicit operands. Just mutate to copy and 2738 // let lowerCopy deal with it? 2739 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) { 2740 // Folded to an identity copy. 2741 MI->eraseFromParent(); 2742 return true; 2743 } 2744 2745 // The immediate value should be in OtherOp 2746 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32)); 2747 MI->removeOperand(FIOperandNum); 2748 2749 unsigned NumOps = MI->getNumOperands(); 2750 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I) 2751 MI->removeOperand(I); 2752 2753 if (NumDefs == 2) 2754 MI->removeOperand(1); 2755 2756 // The code below can't deal with a mov. 2757 return true; 2758 } 2759 2760 // This folded to a constant, but we have to keep the add around for 2761 // pointless implicit defs or clamp modifier. 2762 FIOp->ChangeToImmediate(0); 2763 } 2764 2765 // Try to improve legality by commuting. 
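      // e.g. a VOP2 add only accepts a literal in src0, so if the frame-index
      // immediate ended up in src1, commuting can move it into a legal slot.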
2766 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) { 2767 std::swap(FIOp, OtherOp); 2768 std::swap(FIOperandNum, OtherOpIdx); 2769 } 2770 2771 // We need at most one mov to satisfy the operand constraints. Prefer to 2772 // move the FI operand first, as it may be a literal in a VOP3 2773 // instruction. 2774 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) { 2775 if (!TII->isOperandLegal(*MI, SrcIdx)) { 2776 // If commuting didn't make the operands legal, we need to materialize 2777 // in a register. 2778 // TODO: Can use SGPR on gfx10+ in some cases. 2779 if (!ScavengedVGPR) { 2780 ScavengedVGPR = RS->scavengeRegisterBackwards( 2781 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, 2782 /*SPAdj=*/0); 2783 } 2784 2785 assert(ScavengedVGPR != DstReg); 2786 2787 MachineOperand &Src = MI->getOperand(SrcIdx); 2788 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR) 2789 .add(Src); 2790 2791 Src.ChangeToRegister(ScavengedVGPR, false); 2792 Src.setIsKill(true); 2793 break; 2794 } 2795 } 2796 2797 // Fold out add of 0 case that can appear in kernels. 2798 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) { 2799 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) { 2800 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp); 2801 } 2802 2803 MI->eraseFromParent(); 2804 } 2805 2806 return true; 2807 } 2808 case AMDGPU::S_ADD_I32: 2809 case AMDGPU::S_ADD_U32: { 2810 // TODO: Handle s_or_b32, s_and_b32. 2811 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1; 2812 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx); 2813 2814 assert(FrameReg || MFI->isBottomOfStack()); 2815 2816 MachineOperand &DstOp = MI->getOperand(0); 2817 const DebugLoc &DL = MI->getDebugLoc(); 2818 Register MaterializedReg = FrameReg; 2819 2820 // Defend against live scc, which should never happen in practice. 2821 bool DeadSCC = MI->getOperand(3).isDead(); 2822 2823 Register TmpReg; 2824 2825 // FIXME: Scavenger should figure out that the result register is 2826 // available. Also should do this for the v_add case. 2827 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg()) 2828 TmpReg = DstOp.getReg(); 2829 2830 if (FrameReg && !ST.enableFlatScratch()) { 2831 // FIXME: In the common case where the add does not also read its result 2832 // (i.e. this isn't a reg += fi), it's not finding the dest reg as 2833 // available. 2834 if (!TmpReg) 2835 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, 2836 MI, /*RestoreAfter=*/false, 0, 2837 /*AllowSpill=*/false); 2838 if (TmpReg) { 2839 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32)) 2840 .addDef(TmpReg, RegState::Renamable) 2841 .addReg(FrameReg) 2842 .addImm(ST.getWavefrontSizeLog2()) 2843 .setOperandDead(3); // Set SCC dead 2844 } 2845 MaterializedReg = TmpReg; 2846 } 2847 2848 int64_t Offset = FrameInfo.getObjectOffset(Index); 2849 2850 // For the non-immediate case, we could fall through to the default 2851 // handling, but we do an in-place update of the result register here to 2852 // avoid scavenging another register. 2853 if (OtherOp.isImm()) { 2854 OtherOp.setImm(OtherOp.getImm() + Offset); 2855 Offset = 0; 2856 2857 if (MaterializedReg) 2858 FIOp->ChangeToRegister(MaterializedReg, false); 2859 else 2860 FIOp->ChangeToImmediate(0); 2861 } else if (MaterializedReg) { 2862 // If we can't fold the other operand, do another increment. 
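      // Sketch of the sequence this produces (illustrative register names),
      // assuming a non-flat-scratch target and a register OtherOp:
      //   s_lshr_b32 sTmp, sFrameReg, log2(wave_size)
      //   s_add_i32  sDst, sTmp, sOther
      // The original add is then rewritten to add the remaining immediate
      // offset.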
2863 Register DstReg = DstOp.getReg(); 2864 2865 if (!TmpReg && MaterializedReg == FrameReg) { 2866 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, 2867 MI, /*RestoreAfter=*/false, 0, 2868 /*AllowSpill=*/false); 2869 DstReg = TmpReg; 2870 } 2871 2872 if (TmpReg) { 2873 auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc()) 2874 .addDef(DstReg, RegState::Renamable) 2875 .addReg(MaterializedReg, RegState::Kill) 2876 .add(OtherOp); 2877 if (DeadSCC) 2878 AddI32.setOperandDead(3); 2879 2880 MaterializedReg = DstReg; 2881 2882 OtherOp.ChangeToRegister(MaterializedReg, false); 2883 OtherOp.setIsKill(true); 2884 OtherOp.setIsRenamable(true); 2885 } 2886 FIOp->ChangeToImmediate(Offset); 2887 } else { 2888 // If we don't have any other offset to apply, we can just directly 2889 // interpret the frame index as the offset. 2890 FIOp->ChangeToImmediate(Offset); 2891 } 2892 2893 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) { 2894 assert(Offset == 0); 2895 MI->removeOperand(3); 2896 MI->removeOperand(OtherOpIdx); 2897 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); 2898 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) { 2899 assert(Offset == 0); 2900 MI->removeOperand(3); 2901 MI->removeOperand(FIOperandNum); 2902 MI->setDesc( 2903 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); 2904 } 2905 2906 assert(!FIOp->isFI()); 2907 return true; 2908 } 2909 default: { 2910 break; 2911 } 2912 } 2913 2914 int64_t Offset = FrameInfo.getObjectOffset(Index); 2915 if (ST.enableFlatScratch()) { 2916 if (TII->isFLATScratch(*MI)) { 2917 assert( 2918 (int16_t)FIOperandNum == 2919 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr)); 2920 2921 // The offset is always swizzled, just replace it 2922 if (FrameReg) 2923 FIOp->ChangeToRegister(FrameReg, false); 2924 2925 MachineOperand *OffsetOp = 2926 TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 2927 int64_t NewOffset = Offset + OffsetOp->getImm(); 2928 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 2929 SIInstrFlags::FlatScratch)) { 2930 OffsetOp->setImm(NewOffset); 2931 if (FrameReg) 2932 return false; 2933 Offset = 0; 2934 } 2935 2936 if (!Offset) { 2937 unsigned Opc = MI->getOpcode(); 2938 int NewOpc = -1; 2939 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) { 2940 NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc); 2941 } else if (ST.hasFlatScratchSTMode()) { 2942 // On GFX10 we have ST mode to use no registers for an address. 2943 // Otherwise we need to materialize 0 into an SGPR. 2944 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc); 2945 } 2946 2947 if (NewOpc != -1) { 2948 // removeOperand doesn't fixup tied operand indexes as it goes, so 2949 // it asserts. Untie vdst_in for now and retie them afterwards. 
2950 int VDstIn = 2951 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); 2952 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() && 2953 MI->getOperand(VDstIn).isTied(); 2954 if (TiedVDst) 2955 MI->untieRegOperand(VDstIn); 2956 2957 MI->removeOperand( 2958 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr)); 2959 2960 if (TiedVDst) { 2961 int NewVDst = 2962 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); 2963 int NewVDstIn = 2964 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in); 2965 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!"); 2966 MI->tieOperands(NewVDst, NewVDstIn); 2967 } 2968 MI->setDesc(TII->get(NewOpc)); 2969 return false; 2970 } 2971 } 2972 } 2973 2974 if (!FrameReg) { 2975 FIOp->ChangeToImmediate(Offset); 2976 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) 2977 return false; 2978 } 2979 2980 // We need to use register here. Check if we can use an SGPR or need 2981 // a VGPR. 2982 FIOp->ChangeToRegister(AMDGPU::M0, false); 2983 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp); 2984 2985 if (!Offset && FrameReg && UseSGPR) { 2986 FIOp->setReg(FrameReg); 2987 return false; 2988 } 2989 2990 const TargetRegisterClass *RC = 2991 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass; 2992 2993 Register TmpReg = 2994 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR); 2995 FIOp->setReg(TmpReg); 2996 FIOp->setIsKill(); 2997 2998 if ((!FrameReg || !Offset) && TmpReg) { 2999 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 3000 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg); 3001 if (FrameReg) 3002 MIB.addReg(FrameReg); 3003 else 3004 MIB.addImm(Offset); 3005 3006 return false; 3007 } 3008 3009 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) && 3010 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr); 3011 3012 Register TmpSReg = 3013 UseSGPR ? TmpReg 3014 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, 3015 MI, false, 0, !UseSGPR); 3016 3017 // TODO: for flat scratch another attempt can be made with a VGPR index 3018 // if no SGPRs can be scavenged. 3019 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) 3020 report_fatal_error("Cannot scavenge register in FI elimination!"); 3021 3022 if (!TmpSReg) { 3023 // Use frame register and restore it after. 3024 TmpSReg = FrameReg; 3025 FIOp->setReg(FrameReg); 3026 FIOp->setIsKill(false); 3027 } 3028 3029 if (NeedSaveSCC) { 3030 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!"); 3031 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg) 3032 .addReg(FrameReg) 3033 .addImm(Offset); 3034 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32)) 3035 .addReg(TmpSReg) 3036 .addImm(0); 3037 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg) 3038 .addImm(0) 3039 .addReg(TmpSReg); 3040 } else { 3041 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg) 3042 .addReg(FrameReg) 3043 .addImm(Offset); 3044 } 3045 3046 if (!UseSGPR) 3047 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 3048 .addReg(TmpSReg, RegState::Kill); 3049 3050 if (TmpSReg == FrameReg) { 3051 // Undo frame register modification. 
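    // When SCC is live, the s_addc_u32 / s_bitcmp1_b32 / s_bitset0_b32
    // sequences here (and above) fold the live SCC value into bit 0 of the
    // sum, re-derive SCC from that bit, and then clear it again. This assumes
    // the low bit of the sum would otherwise be zero (the offset is asserted
    // to be even above).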
3052 if (NeedSaveSCC && 3053 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) { 3054 MachineBasicBlock::iterator I = 3055 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32), 3056 TmpSReg) 3057 .addReg(FrameReg) 3058 .addImm(-Offset); 3059 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32)) 3060 .addReg(TmpSReg) 3061 .addImm(0); 3062 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32), 3063 TmpSReg) 3064 .addImm(0) 3065 .addReg(TmpSReg); 3066 } else { 3067 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32), 3068 FrameReg) 3069 .addReg(FrameReg) 3070 .addImm(-Offset); 3071 } 3072 } 3073 3074 return false; 3075 } 3076 3077 bool IsMUBUF = TII->isMUBUF(*MI); 3078 3079 if (!IsMUBUF && !MFI->isBottomOfStack()) { 3080 // Convert to a swizzled stack address by scaling by the wave size. 3081 // In an entry function/kernel the offset is already swizzled. 3082 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); 3083 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) && 3084 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr); 3085 const TargetRegisterClass *RC = IsSALU && !LiveSCC 3086 ? &AMDGPU::SReg_32RegClass 3087 : &AMDGPU::VGPR_32RegClass; 3088 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 || 3089 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 || 3090 MI->getOpcode() == AMDGPU::S_MOV_B32; 3091 Register ResultReg = 3092 IsCopy ? MI->getOperand(0).getReg() 3093 : RS->scavengeRegisterBackwards(*RC, MI, false, 0); 3094 3095 int64_t Offset = FrameInfo.getObjectOffset(Index); 3096 if (Offset == 0) { 3097 unsigned OpCode = 3098 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64; 3099 Register TmpResultReg = ResultReg; 3100 if (IsSALU && LiveSCC) { 3101 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, 3102 MI, false, 0); 3103 } 3104 3105 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg); 3106 if (OpCode == AMDGPU::V_LSHRREV_B32_e64) 3107 // For V_LSHRREV, the operands are reversed (the shift count goes 3108 // first). 3109 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg); 3110 else 3111 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2()); 3112 if (IsSALU && !LiveSCC) 3113 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead. 3114 if (IsSALU && LiveSCC) { 3115 Register NewDest; 3116 if (IsCopy) { 3117 MF->getRegInfo().constrainRegClass(ResultReg, 3118 &AMDGPU::SReg_32_XM0RegClass); 3119 NewDest = ResultReg; 3120 } else { 3121 NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, 3122 Shift, false, 0); 3123 } 3124 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest) 3125 .addReg(TmpResultReg); 3126 ResultReg = NewDest; 3127 } 3128 } else { 3129 MachineInstrBuilder MIB; 3130 if (!IsSALU) { 3131 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) != 3132 nullptr) { 3133 // Reuse ResultReg in intermediate step. 3134 Register ScaledReg = ResultReg; 3135 3136 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 3137 ScaledReg) 3138 .addImm(ST.getWavefrontSizeLog2()) 3139 .addReg(FrameReg); 3140 3141 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; 3142 3143 // TODO: Fold if use instruction is another add of a constant. 
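            // When the scaled offset fits, the result is roughly (wave64,
            // illustrative registers):
            //   v_lshrrev_b32 vDst, 6, sFrameReg
            //   v_add_u32     vDst, <offset>, vDst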
3144 if (IsVOP2 || 3145 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 3146 // FIXME: This can fail 3147 MIB.addImm(Offset); 3148 MIB.addReg(ScaledReg, RegState::Kill); 3149 if (!IsVOP2) 3150 MIB.addImm(0); // clamp bit 3151 } else { 3152 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && 3153 "Need to reuse carry out register"); 3154 3155 // Use scavenged unused carry out as offset register. 3156 Register ConstOffsetReg; 3157 if (!isWave32) 3158 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); 3159 else 3160 ConstOffsetReg = MIB.getReg(1); 3161 3162 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), 3163 ConstOffsetReg) 3164 .addImm(Offset); 3165 MIB.addReg(ConstOffsetReg, RegState::Kill); 3166 MIB.addReg(ScaledReg, RegState::Kill); 3167 MIB.addImm(0); // clamp bit 3168 } 3169 } 3170 } 3171 if (!MIB || IsSALU) { 3172 // We have to produce a carry out, and there isn't a free SGPR pair 3173 // for it. We can keep the whole computation on the SALU to avoid 3174 // clobbering an additional register at the cost of an extra mov. 3175 3176 // We may have 1 free scratch SGPR even though a carry out is 3177 // unavailable. Only one additional mov is needed. 3178 Register TmpScaledReg = IsCopy && IsSALU 3179 ? ResultReg 3180 : RS->scavengeRegisterBackwards( 3181 AMDGPU::SReg_32_XM0RegClass, MI, 3182 false, 0, /*AllowSpill=*/false); 3183 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; 3184 Register TmpResultReg = ScaledReg; 3185 3186 if (!LiveSCC) { 3187 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg) 3188 .addReg(FrameReg) 3189 .addImm(ST.getWavefrontSizeLog2()); 3190 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg) 3191 .addReg(TmpResultReg, RegState::Kill) 3192 .addImm(Offset); 3193 } else { 3194 TmpResultReg = RS->scavengeRegisterBackwards( 3195 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true); 3196 3197 MachineInstrBuilder Add; 3198 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) { 3199 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 3200 TmpResultReg) 3201 .addImm(ST.getWavefrontSizeLog2()) 3202 .addReg(FrameReg); 3203 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) { 3204 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg) 3205 .addImm(Offset); 3206 Add.addReg(ResultReg, RegState::Kill) 3207 .addReg(TmpResultReg, RegState::Kill) 3208 .addImm(0); 3209 } else 3210 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill); 3211 } else { 3212 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) && 3213 "offset is unsafe for v_mad_u32_u24"); 3214 3215 // We start with a frame pointer with a wave space value, and 3216 // an offset in lane-space. We are materializing a lane space 3217 // value. We can either do a right shift of the frame pointer 3218 // to get to lane space, or a left shift of the offset to get 3219 // to wavespace. We can right shift after the computation to 3220 // get back to the desired per-lane value. We are using the 3221 // mad_u32_u24 primarily as an add with no carry out clobber. 
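            // Illustrative wave64 sequence for this fallback (register names
            // made up; the v_mov is skipped when the offset is an inline
            // constant):
            //   v_mov_b32     vTmp, <offset>
            //   v_mad_u32_u24 vTmp, vTmp, 64, sFrameReg   ; offset*64 + FP
            //   v_lshrrev_b32 vTmp, 6, vTmp               ; back to per-lane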
3222             bool IsInlinableLiteral =
3223                 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3224             if (!IsInlinableLiteral) {
3225               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3226                       TmpResultReg)
3227                   .addImm(Offset);
3228             }
3229 
3230             Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3231                           TmpResultReg);
3232 
3233             if (!IsInlinableLiteral) {
3234               Add.addReg(TmpResultReg, RegState::Kill);
3235             } else {
3236               // We fold the offset into the mad itself if it's inlinable.
3237               Add.addImm(Offset);
3238             }
3239             Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3240             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3241                     TmpResultReg)
3242                 .addImm(ST.getWavefrontSizeLog2())
3243                 .addReg(TmpResultReg);
3244           }
3245 
3246           Register NewDest;
3247           if (IsCopy) {
3248             MF->getRegInfo().constrainRegClass(ResultReg,
3249                                                &AMDGPU::SReg_32_XM0RegClass);
3250             NewDest = ResultReg;
3251           } else {
3252             NewDest = RS->scavengeRegisterBackwards(
3253                 AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3254                 /*AllowSpill=*/true);
3255           }
3256 
3257           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3258                   NewDest)
3259               .addReg(TmpResultReg);
3260           ResultReg = NewDest;
3261         }
3262         if (!IsSALU)
3263           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3264               .addReg(TmpResultReg, RegState::Kill);
3265         else
3266           ResultReg = TmpResultReg;
3267         // If there were truly no free SGPRs, we need to undo everything.
3268         if (!TmpScaledReg.isValid()) {
3269           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3270               .addReg(ScaledReg, RegState::Kill)
3271               .addImm(-Offset);
3272           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3273               .addReg(FrameReg)
3274               .addImm(ST.getWavefrontSizeLog2());
3275         }
3276       }
3277     }
3278 
3279     // Don't introduce an extra copy if we're just materializing in a mov.
3280     if (IsCopy) {
3281       MI->eraseFromParent();
3282       return true;
3283     }
3284     FIOp->ChangeToRegister(ResultReg, false, false, true);
3285     return false;
3286   }
3287 
3288   if (IsMUBUF) {
3289     // Disable offen so we don't need a 0 vgpr base.
3290     assert(
3291         static_cast<int>(FIOperandNum) ==
3292         AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3293 
3294     auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3295     assert((SOffset.isImm() && SOffset.getImm() == 0));
3296 
3297     if (FrameReg != AMDGPU::NoRegister)
3298       SOffset.ChangeToRegister(FrameReg, false);
3299 
3300     int64_t Offset = FrameInfo.getObjectOffset(Index);
3301     int64_t OldImm =
3302         TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3303     int64_t NewOffset = OldImm + Offset;
3304 
3305     if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3306         buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3307       MI->eraseFromParent();
3308       return true;
3309     }
3310   }
3311 
3312   // If the offset is simply too big, don't convert to a scratch wave offset
3313   // relative index.
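  // Instead fall back to the raw object offset; if that is not a legal
  // immediate for this instruction, it is materialized into a scavenged VGPR
  // below.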
3314 3315 FIOp->ChangeToImmediate(Offset); 3316 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) { 3317 Register TmpReg = 3318 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0); 3319 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 3320 .addImm(Offset); 3321 FIOp->ChangeToRegister(TmpReg, false, false, true); 3322 } 3323 3324 return false; 3325 } 3326 3327 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { 3328 return AMDGPUInstPrinter::getRegisterName(Reg); 3329 } 3330 3331 unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) { 3332 return getRegBitWidth(RC.getID()); 3333 } 3334 3335 static const TargetRegisterClass * 3336 getAnyVGPRClassForBitWidth(unsigned BitWidth) { 3337 if (BitWidth == 64) 3338 return &AMDGPU::VReg_64RegClass; 3339 if (BitWidth == 96) 3340 return &AMDGPU::VReg_96RegClass; 3341 if (BitWidth == 128) 3342 return &AMDGPU::VReg_128RegClass; 3343 if (BitWidth == 160) 3344 return &AMDGPU::VReg_160RegClass; 3345 if (BitWidth == 192) 3346 return &AMDGPU::VReg_192RegClass; 3347 if (BitWidth == 224) 3348 return &AMDGPU::VReg_224RegClass; 3349 if (BitWidth == 256) 3350 return &AMDGPU::VReg_256RegClass; 3351 if (BitWidth == 288) 3352 return &AMDGPU::VReg_288RegClass; 3353 if (BitWidth == 320) 3354 return &AMDGPU::VReg_320RegClass; 3355 if (BitWidth == 352) 3356 return &AMDGPU::VReg_352RegClass; 3357 if (BitWidth == 384) 3358 return &AMDGPU::VReg_384RegClass; 3359 if (BitWidth == 512) 3360 return &AMDGPU::VReg_512RegClass; 3361 if (BitWidth == 1024) 3362 return &AMDGPU::VReg_1024RegClass; 3363 3364 return nullptr; 3365 } 3366 3367 static const TargetRegisterClass * 3368 getAlignedVGPRClassForBitWidth(unsigned BitWidth) { 3369 if (BitWidth == 64) 3370 return &AMDGPU::VReg_64_Align2RegClass; 3371 if (BitWidth == 96) 3372 return &AMDGPU::VReg_96_Align2RegClass; 3373 if (BitWidth == 128) 3374 return &AMDGPU::VReg_128_Align2RegClass; 3375 if (BitWidth == 160) 3376 return &AMDGPU::VReg_160_Align2RegClass; 3377 if (BitWidth == 192) 3378 return &AMDGPU::VReg_192_Align2RegClass; 3379 if (BitWidth == 224) 3380 return &AMDGPU::VReg_224_Align2RegClass; 3381 if (BitWidth == 256) 3382 return &AMDGPU::VReg_256_Align2RegClass; 3383 if (BitWidth == 288) 3384 return &AMDGPU::VReg_288_Align2RegClass; 3385 if (BitWidth == 320) 3386 return &AMDGPU::VReg_320_Align2RegClass; 3387 if (BitWidth == 352) 3388 return &AMDGPU::VReg_352_Align2RegClass; 3389 if (BitWidth == 384) 3390 return &AMDGPU::VReg_384_Align2RegClass; 3391 if (BitWidth == 512) 3392 return &AMDGPU::VReg_512_Align2RegClass; 3393 if (BitWidth == 1024) 3394 return &AMDGPU::VReg_1024_Align2RegClass; 3395 3396 return nullptr; 3397 } 3398 3399 const TargetRegisterClass * 3400 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { 3401 if (BitWidth == 1) 3402 return &AMDGPU::VReg_1RegClass; 3403 if (BitWidth == 16) 3404 return &AMDGPU::VGPR_16RegClass; 3405 if (BitWidth == 32) 3406 return &AMDGPU::VGPR_32RegClass; 3407 return ST.needsAlignedVGPRs() ? 
getAlignedVGPRClassForBitWidth(BitWidth) 3408 : getAnyVGPRClassForBitWidth(BitWidth); 3409 } 3410 3411 static const TargetRegisterClass * 3412 getAnyAGPRClassForBitWidth(unsigned BitWidth) { 3413 if (BitWidth == 64) 3414 return &AMDGPU::AReg_64RegClass; 3415 if (BitWidth == 96) 3416 return &AMDGPU::AReg_96RegClass; 3417 if (BitWidth == 128) 3418 return &AMDGPU::AReg_128RegClass; 3419 if (BitWidth == 160) 3420 return &AMDGPU::AReg_160RegClass; 3421 if (BitWidth == 192) 3422 return &AMDGPU::AReg_192RegClass; 3423 if (BitWidth == 224) 3424 return &AMDGPU::AReg_224RegClass; 3425 if (BitWidth == 256) 3426 return &AMDGPU::AReg_256RegClass; 3427 if (BitWidth == 288) 3428 return &AMDGPU::AReg_288RegClass; 3429 if (BitWidth == 320) 3430 return &AMDGPU::AReg_320RegClass; 3431 if (BitWidth == 352) 3432 return &AMDGPU::AReg_352RegClass; 3433 if (BitWidth == 384) 3434 return &AMDGPU::AReg_384RegClass; 3435 if (BitWidth == 512) 3436 return &AMDGPU::AReg_512RegClass; 3437 if (BitWidth == 1024) 3438 return &AMDGPU::AReg_1024RegClass; 3439 3440 return nullptr; 3441 } 3442 3443 static const TargetRegisterClass * 3444 getAlignedAGPRClassForBitWidth(unsigned BitWidth) { 3445 if (BitWidth == 64) 3446 return &AMDGPU::AReg_64_Align2RegClass; 3447 if (BitWidth == 96) 3448 return &AMDGPU::AReg_96_Align2RegClass; 3449 if (BitWidth == 128) 3450 return &AMDGPU::AReg_128_Align2RegClass; 3451 if (BitWidth == 160) 3452 return &AMDGPU::AReg_160_Align2RegClass; 3453 if (BitWidth == 192) 3454 return &AMDGPU::AReg_192_Align2RegClass; 3455 if (BitWidth == 224) 3456 return &AMDGPU::AReg_224_Align2RegClass; 3457 if (BitWidth == 256) 3458 return &AMDGPU::AReg_256_Align2RegClass; 3459 if (BitWidth == 288) 3460 return &AMDGPU::AReg_288_Align2RegClass; 3461 if (BitWidth == 320) 3462 return &AMDGPU::AReg_320_Align2RegClass; 3463 if (BitWidth == 352) 3464 return &AMDGPU::AReg_352_Align2RegClass; 3465 if (BitWidth == 384) 3466 return &AMDGPU::AReg_384_Align2RegClass; 3467 if (BitWidth == 512) 3468 return &AMDGPU::AReg_512_Align2RegClass; 3469 if (BitWidth == 1024) 3470 return &AMDGPU::AReg_1024_Align2RegClass; 3471 3472 return nullptr; 3473 } 3474 3475 const TargetRegisterClass * 3476 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { 3477 if (BitWidth == 16) 3478 return &AMDGPU::AGPR_LO16RegClass; 3479 if (BitWidth == 32) 3480 return &AMDGPU::AGPR_32RegClass; 3481 return ST.needsAlignedVGPRs() ? 
getAlignedAGPRClassForBitWidth(BitWidth) 3482 : getAnyAGPRClassForBitWidth(BitWidth); 3483 } 3484 3485 static const TargetRegisterClass * 3486 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { 3487 if (BitWidth == 64) 3488 return &AMDGPU::AV_64RegClass; 3489 if (BitWidth == 96) 3490 return &AMDGPU::AV_96RegClass; 3491 if (BitWidth == 128) 3492 return &AMDGPU::AV_128RegClass; 3493 if (BitWidth == 160) 3494 return &AMDGPU::AV_160RegClass; 3495 if (BitWidth == 192) 3496 return &AMDGPU::AV_192RegClass; 3497 if (BitWidth == 224) 3498 return &AMDGPU::AV_224RegClass; 3499 if (BitWidth == 256) 3500 return &AMDGPU::AV_256RegClass; 3501 if (BitWidth == 288) 3502 return &AMDGPU::AV_288RegClass; 3503 if (BitWidth == 320) 3504 return &AMDGPU::AV_320RegClass; 3505 if (BitWidth == 352) 3506 return &AMDGPU::AV_352RegClass; 3507 if (BitWidth == 384) 3508 return &AMDGPU::AV_384RegClass; 3509 if (BitWidth == 512) 3510 return &AMDGPU::AV_512RegClass; 3511 if (BitWidth == 1024) 3512 return &AMDGPU::AV_1024RegClass; 3513 3514 return nullptr; 3515 } 3516 3517 static const TargetRegisterClass * 3518 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { 3519 if (BitWidth == 64) 3520 return &AMDGPU::AV_64_Align2RegClass; 3521 if (BitWidth == 96) 3522 return &AMDGPU::AV_96_Align2RegClass; 3523 if (BitWidth == 128) 3524 return &AMDGPU::AV_128_Align2RegClass; 3525 if (BitWidth == 160) 3526 return &AMDGPU::AV_160_Align2RegClass; 3527 if (BitWidth == 192) 3528 return &AMDGPU::AV_192_Align2RegClass; 3529 if (BitWidth == 224) 3530 return &AMDGPU::AV_224_Align2RegClass; 3531 if (BitWidth == 256) 3532 return &AMDGPU::AV_256_Align2RegClass; 3533 if (BitWidth == 288) 3534 return &AMDGPU::AV_288_Align2RegClass; 3535 if (BitWidth == 320) 3536 return &AMDGPU::AV_320_Align2RegClass; 3537 if (BitWidth == 352) 3538 return &AMDGPU::AV_352_Align2RegClass; 3539 if (BitWidth == 384) 3540 return &AMDGPU::AV_384_Align2RegClass; 3541 if (BitWidth == 512) 3542 return &AMDGPU::AV_512_Align2RegClass; 3543 if (BitWidth == 1024) 3544 return &AMDGPU::AV_1024_Align2RegClass; 3545 3546 return nullptr; 3547 } 3548 3549 const TargetRegisterClass * 3550 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { 3551 if (BitWidth == 32) 3552 return &AMDGPU::AV_32RegClass; 3553 return ST.needsAlignedVGPRs() 3554 ? 
getAlignedVectorSuperClassForBitWidth(BitWidth) 3555 : getAnyVectorSuperClassForBitWidth(BitWidth); 3556 } 3557 3558 const TargetRegisterClass * 3559 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { 3560 if (BitWidth == 16 || BitWidth == 32) 3561 return &AMDGPU::SReg_32RegClass; 3562 if (BitWidth == 64) 3563 return &AMDGPU::SReg_64RegClass; 3564 if (BitWidth == 96) 3565 return &AMDGPU::SGPR_96RegClass; 3566 if (BitWidth == 128) 3567 return &AMDGPU::SGPR_128RegClass; 3568 if (BitWidth == 160) 3569 return &AMDGPU::SGPR_160RegClass; 3570 if (BitWidth == 192) 3571 return &AMDGPU::SGPR_192RegClass; 3572 if (BitWidth == 224) 3573 return &AMDGPU::SGPR_224RegClass; 3574 if (BitWidth == 256) 3575 return &AMDGPU::SGPR_256RegClass; 3576 if (BitWidth == 288) 3577 return &AMDGPU::SGPR_288RegClass; 3578 if (BitWidth == 320) 3579 return &AMDGPU::SGPR_320RegClass; 3580 if (BitWidth == 352) 3581 return &AMDGPU::SGPR_352RegClass; 3582 if (BitWidth == 384) 3583 return &AMDGPU::SGPR_384RegClass; 3584 if (BitWidth == 512) 3585 return &AMDGPU::SGPR_512RegClass; 3586 if (BitWidth == 1024) 3587 return &AMDGPU::SGPR_1024RegClass; 3588 3589 return nullptr; 3590 } 3591 3592 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI, 3593 Register Reg) const { 3594 const TargetRegisterClass *RC; 3595 if (Reg.isVirtual()) 3596 RC = MRI.getRegClass(Reg); 3597 else 3598 RC = getPhysRegBaseClass(Reg); 3599 return RC && isSGPRClass(RC); 3600 } 3601 3602 const TargetRegisterClass * 3603 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { 3604 unsigned Size = getRegSizeInBits(*SRC); 3605 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); 3606 assert(VRC && "Invalid register class size"); 3607 return VRC; 3608 } 3609 3610 const TargetRegisterClass * 3611 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { 3612 unsigned Size = getRegSizeInBits(*SRC); 3613 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); 3614 assert(ARC && "Invalid register class size"); 3615 return ARC; 3616 } 3617 3618 const TargetRegisterClass * 3619 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { 3620 unsigned Size = getRegSizeInBits(*VRC); 3621 if (Size == 32) 3622 return &AMDGPU::SGPR_32RegClass; 3623 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size); 3624 assert(SRC && "Invalid register class size"); 3625 return SRC; 3626 } 3627 3628 const TargetRegisterClass * 3629 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, 3630 const TargetRegisterClass *SubRC, 3631 unsigned SubIdx) const { 3632 // Ensure this subregister index is aligned in the super register. 3633 const TargetRegisterClass *MatchRC = 3634 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx); 3635 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr; 3636 } 3637 3638 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { 3639 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && 3640 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) 3641 return !ST.hasMFMAInlineLiteralBug(); 3642 3643 return OpType >= AMDGPU::OPERAND_SRC_FIRST && 3644 OpType <= AMDGPU::OPERAND_SRC_LAST; 3645 } 3646 3647 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { 3648 // TODO: 64-bit operands have extending behavior from 32-bit literal. 
3649   return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3650          OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
3651 }
3652 
3653 /// Returns the lowest register that is not used at any point in the function.
3654 /// If all registers are used, then this function will return
3655 /// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return the
3656 /// highest unused register.
3657 MCRegister SIRegisterInfo::findUnusedRegister(
3658     const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3659     const MachineFunction &MF, bool ReserveHighestRegister) const {
3660   if (ReserveHighestRegister) {
3661     for (MCRegister Reg : reverse(*RC))
3662       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3663         return Reg;
3664   } else {
3665     for (MCRegister Reg : *RC)
3666       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3667         return Reg;
3668   }
3669   return MCRegister();
3670 }
3671 
3672 bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
3673                                   const RegisterBankInfo &RBI,
3674                                   Register Reg) const {
3675   auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3676   if (!RB)
3677     return false;
3678 
3679   return !RBI.isDivergentRegBank(RB);
3680 }
3681 
3682 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
3683                                                    unsigned EltSize) const {
3684   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3685   assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
3686 
3687   const unsigned RegHalves = RegBitWidth / 16;
3688   const unsigned EltHalves = EltSize / 2;
3689   assert(RegSplitParts.size() + 1 >= EltHalves);
3690 
3691   const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
3692   const unsigned NumParts = RegHalves / EltHalves;
3693 
3694   return ArrayRef(Parts.data(), NumParts);
3695 }
3696 
3697 const TargetRegisterClass*
3698 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
3699                                   Register Reg) const {
3700   return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3701 }
3702 
3703 const TargetRegisterClass *
3704 SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
3705                                          const MachineOperand &MO) const {
3706   const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3707   return getSubRegisterClass(SrcRC, MO.getSubReg());
3708 }
3709 
3710 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
3711                             Register Reg) const {
3712   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3713   // Registers without classes are unaddressable, SGPR-like registers.
3714   return RC && isVGPRClass(RC);
3715 }
3716 
3717 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
3718                             Register Reg) const {
3719   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3720 
3721   // Registers without classes are unaddressable, SGPR-like registers.
3722   return RC && isAGPRClass(RC);
3723 }
3724 
3725 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
3726                                     const TargetRegisterClass *SrcRC,
3727                                     unsigned SubReg,
3728                                     const TargetRegisterClass *DstRC,
3729                                     unsigned DstSubReg,
3730                                     const TargetRegisterClass *NewRC,
3731                                     LiveIntervals &LIS) const {
3732   unsigned SrcSize = getRegSizeInBits(*SrcRC);
3733   unsigned DstSize = getRegSizeInBits(*DstRC);
3734   unsigned NewSize = getRegSizeInBits(*NewRC);
3735 
3736   // Do not increase the size of registers beyond a dword; we would need to
3737   // allocate adjacent registers and constrain regalloc more than needed.
3738 
3739   // Always allow dword coalescing.
3740 if (SrcSize <= 32 || DstSize <= 32) 3741 return true; 3742 3743 return NewSize <= DstSize || NewSize <= SrcSize; 3744 } 3745 3746 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, 3747 MachineFunction &MF) const { 3748 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first; 3749 switch (RC->getID()) { 3750 default: 3751 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); 3752 case AMDGPU::VGPR_32RegClassID: 3753 return std::min( 3754 ST.getMaxNumVGPRs( 3755 MinOcc, 3756 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize()), 3757 ST.getMaxNumVGPRs(MF)); 3758 case AMDGPU::SGPR_32RegClassID: 3759 case AMDGPU::SGPR_LO16RegClassID: 3760 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF)); 3761 } 3762 } 3763 3764 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, 3765 unsigned Idx) const { 3766 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || 3767 Idx == AMDGPU::RegisterPressureSets::AGPR_32) 3768 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 3769 const_cast<MachineFunction &>(MF)); 3770 3771 if (Idx == AMDGPU::RegisterPressureSets::SReg_32) 3772 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, 3773 const_cast<MachineFunction &>(MF)); 3774 3775 llvm_unreachable("Unexpected register pressure set!"); 3776 } 3777 3778 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { 3779 static const int Empty[] = { -1 }; 3780 3781 if (RegPressureIgnoredUnits[RegUnit]) 3782 return Empty; 3783 3784 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); 3785 } 3786 3787 bool SIRegisterInfo::getRegAllocationHints(Register VirtReg, 3788 ArrayRef<MCPhysReg> Order, 3789 SmallVectorImpl<MCPhysReg> &Hints, 3790 const MachineFunction &MF, 3791 const VirtRegMap *VRM, 3792 const LiveRegMatrix *Matrix) const { 3793 3794 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3795 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 3796 3797 std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg); 3798 3799 switch (Hint.first) { 3800 case AMDGPURI::Size32: { 3801 Register Paired = Hint.second; 3802 assert(Paired); 3803 Register PairedPhys; 3804 if (Paired.isPhysical()) { 3805 PairedPhys = 3806 getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass); 3807 } else if (VRM && VRM->hasPhys(Paired)) { 3808 PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16, 3809 &AMDGPU::VGPR_32RegClass); 3810 } 3811 3812 // Prefer the paired physreg. 3813 if (PairedPhys) 3814 // isLo(Paired) is implicitly true here from the API of 3815 // getMatchingSuperReg. 3816 Hints.push_back(PairedPhys); 3817 return false; 3818 } 3819 case AMDGPURI::Size16: { 3820 Register Paired = Hint.second; 3821 assert(Paired); 3822 Register PairedPhys; 3823 if (Paired.isPhysical()) { 3824 PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16); 3825 } else if (VRM && VRM->hasPhys(Paired)) { 3826 PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16); 3827 } 3828 3829 // First prefer the paired physreg. 3830 if (PairedPhys) 3831 Hints.push_back(PairedPhys); 3832 else { 3833 // Add all the lo16 physregs. 3834 // When the Paired operand has not yet been assigned a physreg it is 3835 // better to try putting VirtReg in a lo16 register, because possibly 3836 // later Paired can be assigned to the overlapping register and the COPY 3837 // can be eliminated. 
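      // E.g. if VirtReg is placed in the lo16 half of v1 here, Paired may
      // later be assigned v1 itself, and the connecting COPY folds away.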
3838 for (MCPhysReg PhysReg : Order) { 3839 if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this)) 3840 continue; 3841 if (AMDGPU::VGPR_16RegClass.contains(PhysReg) && 3842 !MRI.isReserved(PhysReg)) 3843 Hints.push_back(PhysReg); 3844 } 3845 } 3846 return false; 3847 } 3848 default: 3849 return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, 3850 VRM); 3851 } 3852 } 3853 3854 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { 3855 // Not a callee saved register. 3856 return AMDGPU::SGPR30_SGPR31; 3857 } 3858 3859 const TargetRegisterClass * 3860 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 3861 const RegisterBank &RB) const { 3862 switch (RB.getID()) { 3863 case AMDGPU::VGPRRegBankID: 3864 return getVGPRClassForBitWidth( 3865 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size)); 3866 case AMDGPU::VCCRegBankID: 3867 assert(Size == 1); 3868 return getWaveMaskRegClass(); 3869 case AMDGPU::SGPRRegBankID: 3870 return getSGPRClassForBitWidth(std::max(32u, Size)); 3871 case AMDGPU::AGPRRegBankID: 3872 return getAGPRClassForBitWidth(std::max(32u, Size)); 3873 default: 3874 llvm_unreachable("unknown register bank"); 3875 } 3876 } 3877 3878 const TargetRegisterClass * 3879 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 3880 const MachineRegisterInfo &MRI) const { 3881 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); 3882 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB)) 3883 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB); 3884 3885 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB)) 3886 return getAllocatableClass(RC); 3887 3888 return nullptr; 3889 } 3890 3891 MCRegister SIRegisterInfo::getVCC() const { 3892 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 3893 } 3894 3895 MCRegister SIRegisterInfo::getExec() const { 3896 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 3897 } 3898 3899 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { 3900 // VGPR tuples have an alignment requirement on gfx90a variants. 3901 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass 3902 : &AMDGPU::VReg_64RegClass; 3903 } 3904 3905 const TargetRegisterClass * 3906 SIRegisterInfo::getRegClass(unsigned RCID) const { 3907 switch ((int)RCID) { 3908 case AMDGPU::SReg_1RegClassID: 3909 return getBoolRC(); 3910 case AMDGPU::SReg_1_XEXECRegClassID: 3911 return getWaveMaskRegClass(); 3912 case -1: 3913 return nullptr; 3914 default: 3915 return AMDGPUGenRegisterInfo::getRegClass(RCID); 3916 } 3917 } 3918 3919 // Find reaching register definition 3920 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, 3921 MachineInstr &Use, 3922 MachineRegisterInfo &MRI, 3923 LiveIntervals *LIS) const { 3924 auto &MDT = LIS->getDomTree(); 3925 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 3926 SlotIndex DefIdx; 3927 3928 if (Reg.isVirtual()) { 3929 if (!LIS->hasInterval(Reg)) 3930 return nullptr; 3931 LiveInterval &LI = LIS->getInterval(Reg); 3932 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 3933 : MRI.getMaxLaneMaskForVReg(Reg); 3934 VNInfo *V = nullptr; 3935 if (LI.hasSubRanges()) { 3936 for (auto &S : LI.subranges()) { 3937 if ((S.LaneMask & SubLanes) == SubLanes) { 3938 V = S.getVNInfoAt(UseIdx); 3939 break; 3940 } 3941 } 3942 } else { 3943 V = LI.getVNInfoAt(UseIdx); 3944 } 3945 if (!V) 3946 return nullptr; 3947 DefIdx = V->def; 3948 } else { 3949 // Find last def. 
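    // Every register unit of the physreg must have a reaching value at
    // UseIdx; among those, keep the most recent def (the one dominated by the
    // previously recorded one). The dominance check below then makes sure it
    // reaches Use.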
3950 for (MCRegUnit Unit : regunits(Reg.asMCReg())) { 3951 LiveRange &LR = LIS->getRegUnit(Unit); 3952 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 3953 if (!DefIdx.isValid() || 3954 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 3955 LIS->getInstructionFromIndex(V->def))) 3956 DefIdx = V->def; 3957 } else { 3958 return nullptr; 3959 } 3960 } 3961 } 3962 3963 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 3964 3965 if (!Def || !MDT.dominates(Def, &Use)) 3966 return nullptr; 3967 3968 assert(Def->modifiesRegister(Reg, this)); 3969 3970 return Def; 3971 } 3972 3973 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { 3974 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32); 3975 3976 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, 3977 AMDGPU::SReg_32RegClass, 3978 AMDGPU::AGPR_32RegClass } ) { 3979 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC)) 3980 return Super; 3981 } 3982 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, 3983 &AMDGPU::VGPR_32RegClass)) { 3984 return Super; 3985 } 3986 3987 return AMDGPU::NoRegister; 3988 } 3989 3990 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { 3991 if (!ST.needsAlignedVGPRs()) 3992 return true; 3993 3994 if (isVGPRClass(&RC)) 3995 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); 3996 if (isAGPRClass(&RC)) 3997 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); 3998 if (isVectorSuperClass(&RC)) 3999 return RC.hasSuperClassEq( 4000 getVectorSuperClassForBitWidth(getRegSizeInBits(RC))); 4001 4002 return true; 4003 } 4004 4005 const TargetRegisterClass * 4006 SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { 4007 if (!RC || !ST.needsAlignedVGPRs()) 4008 return RC; 4009 4010 unsigned Size = getRegSizeInBits(*RC); 4011 if (Size <= 32) 4012 return RC; 4013 4014 if (isVGPRClass(RC)) 4015 return getAlignedVGPRClassForBitWidth(Size); 4016 if (isAGPRClass(RC)) 4017 return getAlignedAGPRClassForBitWidth(Size); 4018 if (isVectorSuperClass(RC)) 4019 return getAlignedVectorSuperClassForBitWidth(Size); 4020 4021 return RC; 4022 } 4023 4024 ArrayRef<MCPhysReg> 4025 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { 4026 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); 4027 } 4028 4029 ArrayRef<MCPhysReg> 4030 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { 4031 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2); 4032 } 4033 4034 ArrayRef<MCPhysReg> 4035 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { 4036 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); 4037 } 4038 4039 unsigned 4040 SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, 4041 unsigned SubReg) const { 4042 switch (RC->TSFlags & SIRCFlags::RegKindMask) { 4043 case SIRCFlags::HasSGPR: 4044 return std::min(128u, getSubRegIdxSize(SubReg)); 4045 case SIRCFlags::HasAGPR: 4046 case SIRCFlags::HasVGPR: 4047 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR: 4048 return std::min(32u, getSubRegIdxSize(SubReg)); 4049 default: 4050 break; 4051 } 4052 return 0; 4053 } 4054 4055 unsigned 4056 SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, 4057 const TargetRegisterClass &RC) const { 4058 for (MCPhysReg Reg : reverse(RC.getRegisters())) 4059 if (MRI.isPhysRegUsed(Reg)) 4060 return getHWRegIndex(Reg) + 1; 4061 return 0; 4062 } 4063 4064 SmallVector<StringLiteral> 4065 
SIRegisterInfo::getVRegFlagsOfReg(Register Reg, 4066 const MachineFunction &MF) const { 4067 SmallVector<StringLiteral> RegFlags; 4068 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 4069 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) 4070 RegFlags.push_back("WWM_REG"); 4071 return RegFlags; 4072 } 4073