//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "SIRegisterInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

static bool hasPressureSet(const int *PSets, unsigned PSetID) {
  for (unsigned i = 0; PSets[i] != -1; ++i) {
    if (PSets[i] == (int)PSetID)
      return true;
  }
  return false;
}

void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
                                         BitVector &PressureSets) const {
  for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
    const int *PSets = getRegUnitPressureSets(*U);
    if (hasPressureSet(PSets, PSetID)) {
      PressureSets.set(PSetID);
      break;
    }
  }
}

static cl::opt<bool> EnableSpillSGPRToVGPR(
  "amdgpu-spill-sgpr-to-vgpr",
  cl::desc("Enable spilling SGPRs to VGPRs"),
  cl::ReallyHidden,
  cl::init(true));

SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
  AMDGPURegisterInfo(),
  ST(ST),
  SGPRPressureSets(getNumRegPressureSets()),
  VGPRPressureSets(getNumRegPressureSets()),
  AGPRPressureSets(getNumRegPressureSets()),
  SpillSGPRToVGPR(EnableSpillSGPRToVGPR),
  isWave32(ST.isWave32()) {
  unsigned NumRegPressureSets = getNumRegPressureSets();

  SGPRSetID = NumRegPressureSets;
  VGPRSetID = NumRegPressureSets;
  AGPRSetID = NumRegPressureSets;

  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
    classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
    classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
    classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets);
  }

  // Determine the number of reg units for each pressure set.
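  // The pressure set with the most register units of each kind is then picked
  // below as the canonical SGPR/VGPR/AGPR pressure set (SGPRSetID, VGPRSetID,
  // AGPRSetID).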
  std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0);
  for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
    const int *PSets = getRegUnitPressureSets(i);
    for (unsigned j = 0; PSets[j] != -1; ++j) {
      ++PressureSetRegUnits[PSets[j]];
    }
  }

  unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0;
  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
    if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
      VGPRSetID = i;
      VGPRMax = PressureSetRegUnits[i];
      continue;
    }
    if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
      SGPRSetID = i;
      SGPRMax = PressureSetRegUnits[i];
    }
    if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) {
      AGPRSetID = i;
      AGPRMax = PressureSetRegUnits[i];
      continue;
    }
  }

  assert(SGPRSetID < NumRegPressureSets &&
         VGPRSetID < NumRegPressureSets &&
         AGPRSetID < NumRegPressureSets);
}

unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
  const MachineFunction &MF) const {
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
}

static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
  unsigned Reg;

  // Try to place it in a hole after PrivateSegmentBufferReg.
  if (RegCount & 3) {
    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
    // alignment constraints, so we have a hole where we can put the wave
    // offset.
    Reg = RegCount - 1;
  } else {
    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
    // wave offset before it.
    Reg = RegCount - 5;
  }

  return Reg;
}

unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
  const MachineFunction &MF) const {
  unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}

BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());

  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated.
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);

  // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
  // will result in bugs.
  if (isWave32) {
    Reserved.set(AMDGPU::VCC);
    Reserved.set(AMDGPU::VCC_HI);
  }

  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
    Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  // Reserve all remaining AGPRs if there are no instructions to use them.
  if (!ST.hasMAIInsts()) {
    for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
      unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
      reserveRegisterTuples(Reserved, Reg);
    }
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
    // Reserve 1 SGPR for scratch wave offset in case we need to spill.
    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
  }

  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
  }

  // We have to assume the SP is needed in case there are calls in the function,
  // which is detected after the function is lowered. If we aren't really going
  // to need SP, don't bother reserving it.
  unsigned StackPtrReg = MFI->getStackPtrOffsetReg();

  if (StackPtrReg != AMDGPU::NoRegister) {
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
  }

  unsigned FrameReg = MFI->getFrameOffsetReg();
  if (FrameReg != AMDGPU::NoRegister) {
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

  for (unsigned Reg : MFI->WWMReservedRegs) {
    reserveRegisterTuples(Reserved, Reg);
  }

  // FIXME: Stop using reserved registers for this.
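  // VGPRs used as temporaries for AGPR spills (and AGPRs used for VGPR spills)
  // are kept out of the allocator's hands by reserving them here.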
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  return Reserved;
}

bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry, the base address is 0, so it can't possibly need any more
  // alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isEntryFunction())
    return false;

  return TargetRegisterInfo::canRealignStack(MF);
}

bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}

bool SIRegisterInfo::requiresFrameIndexScavenging(
  const MachineFunction &MF) const {
  // Do not use frame virtual registers. They used to be used for SGPRs, but
  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
  // spill.
  return false;
}

bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
  const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasStackObjects();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
  const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}

bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
  // This helps catch bugs as verifier errors.
  return true;
}

int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI))
    return 0;

  assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::vaddr) &&
         "Should never see frame index on non-address operand");

  return getMUBUFInstrOffset(MI);
}

bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  if (!MI->mayLoadOrStore())
    return false;

  int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);

  return !isUInt<12>(FullOffset);
}

void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                  unsigned BaseReg,
                                                  int FrameIdx,
                                                  int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
      .addFrameIndex(FrameIdx);
    return;
  }

  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
    .addFrameIndex(FrameIdx);

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(FIReg)
    .addImm(0); // clamp bit
}

void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
                                       int64_t Offset) const {
  const SIInstrInfo *TII = ST.getInstrInfo();

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
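  // The loop below only checks that at most one operand of MI is a frame
  // index.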
  bool SeenFI = false;
  for (const MachineOperand &MO: MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
#ifndef NDEBUG
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
#endif
  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI));
  assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
         MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg() &&
         "should only be seeing stack pointer offset relative FrameIndex");

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;
  assert(isUInt<12>(NewOffset) && "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}

bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        unsigned BaseReg,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI))
    return false;

  int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);

  return isUInt<12>(NewOffset);
}

const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
  const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}

static unsigned getNumSubRegsForSpillOp(unsigned Op) {

  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock::iterator MI,
                                           int Index,
                                           unsigned Lane,
                                           unsigned ValueReg,
                                           bool IsKill) {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
                                                   : AMDGPU::V_ACCVGPR_READ_B32;

  return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
           .addReg(Src, getKillRegState(IsKill));
}

// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // glc
          .addImm(0) // slc
          .addImm(0) // tfe
          .addImm(0) // dlc
          .addImm(0) // swz
          .cloneMemRefs(*MI);

  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}

void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
                                         unsigned LoadStoreOp,
                                         int Index,
                                         unsigned ValueReg,
                                         bool IsKill,
                                         unsigned ScratchRsrcReg,
                                         unsigned ScratchOffsetReg,
                                         int64_t InstOffset,
                                         MachineMemOperand *MMO,
                                         RegScavenger *RS) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();

  const MCInstrDesc &Desc = TII->get(LoadStoreOp);
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = Desc.mayStore();

  bool Scavenged = false;
  unsigned SOffset = ScratchOffsetReg;

  const unsigned EltSize = 4;
  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
  unsigned Size = NumSubRegs * EltSize;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  int64_t ScratchOffsetRegDelta = 0;

  unsigned Align = MFI.getObjectAlignment(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  Register TmpReg =
    hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg()
                 : Register();

  assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");

  if (!isUInt<12>(Offset + Size - EltSize)) {
    SOffset = AMDGPU::NoRegister;

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
    Offset *= ST.getWavefrontSize();

    // We don't have access to the register scavenger if this function is called
    // during PEI::scavengeFrameVirtualRegs().
    if (RS)
      SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);

    if (SOffset == AMDGPU::NoRegister) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI, and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR.
      // Our solution here is to add the offset directly to the ScratchOffset
      // register, and then subtract the offset after the spill to return
      // ScratchOffset to its original value.
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
      .addReg(ScratchOffsetReg)
      .addImm(Offset);

    Offset = 0;
  }

  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
    Register SubReg = NumSubRegs == 1
                          ? Register(ValueReg)
                          : getSubReg(ValueReg, getSubRegFromChannel(i));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    if (i + 1 == e) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill);

    if (!MIB.getInstr()) {
      unsigned FinalReg = SubReg;
      if (TmpReg != AMDGPU::NoRegister) {
        if (IsStore)
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
            .addReg(SubReg, getKillRegState(IsKill));
        SubReg = TmpReg;
      }

      MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
      MachineMemOperand *NewMMO
        = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
                                   EltSize, MinAlign(Align, EltSize * i));

      MIB = BuildMI(*MBB, MI, DL, Desc)
        .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
        .addReg(ScratchRsrcReg)
        .addReg(SOffset, SOffsetRegState)
        .addImm(Offset)
        .addImm(0) // glc
        .addImm(0) // slc
        .addImm(0) // tfe
        .addImm(0) // dlc
        .addImm(0) // swz
        .addMemOperand(NewMMO);

      if (!IsStore && TmpReg != AMDGPU::NoRegister)
        MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
                      FinalReg)
          .addReg(TmpReg, RegState::Kill);
    }

    if (NumSubRegs > 1)
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
  }

  if (ScratchOffsetRegDelta != 0) {
    // Subtract the offset we added to the ScratchOffset register.
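    // This only happens when no SGPR could be scavenged above and the wave
    // offset register itself was used to materialize the large offset.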
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
      .addReg(ScratchOffsetReg)
      .addImm(ScratchOffsetRegDelta);
  }
}

bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS,
                               bool OnlyToVGPR) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  DenseSet<unsigned> SGPRSpillVGPRDefinedSet;

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
    = MFI->getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();

  Register SuperReg = MI->getOperand(0).getReg();
  bool IsKill = MI->getOperand(0).isKill();
  const DebugLoc &DL = MI->getDebugLoc();

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();

  assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
                         SuperReg != MFI->getFrameOffsetReg() &&
                         SuperReg != MFI->getScratchWaveOffsetReg()));

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  unsigned M0CopyReg = AMDGPU::NoRegister;

  unsigned EltSize = 4;
  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);

  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  // Scavenged temporary VGPR to use. It must be scavenged once for any number
  // of spilled subregs.
  Register TmpVGPR;

  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    Register SubReg =
        NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToVGPR) {
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];

      // During SGPR spilling to VGPR, determine if the VGPR is defined. The
      // only circumstance in which we say it is undefined is when it is the
      // first spill to this VGPR in the first basic block.
      bool VGPRDefined = true;
      if (MBB == &MF->front())
        VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;

      // Mark the "old value of vgpr" input undef only if this is the first sgpr
      // spill to this specific vgpr in the first basic block.
      BuildMI(*MBB, MI, DL,
              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
              Spill.VGPR)
        .addReg(SubReg, getKillRegState(IsKill))
        .addImm(Spill.Lane)
        .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);

      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
    } else {
      // XXX - Can the spill to VGPR fail for some subregisters but not others?
      if (OnlyToVGPR)
        return false;

      // Spill SGPR to a frame index.
      if (!TmpVGPR.isValid())
        TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

      MachineInstrBuilder Mov
        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SubReg, SubKillState);

      // There could be undef components of a spilled super register.
      // TODO: Can we detect this and skip the spill?
      if (NumSubRegs > 1) {
        // The last implicit use of the SuperReg carries the "Kill" flag.
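        // Earlier sub-spills add a non-kill implicit use so the super register
        // stays live across them.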
        unsigned SuperKillState = 0;
        if (i + 1 == e)
          SuperKillState |= getKillRegState(IsKill);
        Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
      }

      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
        .addReg(TmpVGPR, RegState::Kill)      // src
        .addFrameIndex(Index)                 // vaddr
        .addReg(MFI->getScratchRSrcReg())     // srsrc
        .addReg(MFI->getStackPtrOffsetReg())  // soffset
        .addImm(i * 4)                        // offset
        .addMemOperand(MMO);
    }
  }

  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
  MFI->addToSpilledSGPRs(NumSubRegs);
  return true;
}

bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS,
                                 bool OnlyToVGPR) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
    = MFI->getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = MI->getDebugLoc();

  Register SuperReg = MI->getOperand(0).getReg();

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  unsigned M0CopyReg = AMDGPU::NoRegister;

  unsigned EltSize = 4;

  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);

  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  Register TmpVGPR;

  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    Register SubReg =
        NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToVGPR) {
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
      auto MIB =
          BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
                  SubReg)
              .addReg(Spill.VGPR)
              .addImm(Spill.Lane);

      if (NumSubRegs > 1 && i == 0)
        MIB.addReg(SuperReg, RegState::ImplicitDefine);
    } else {
      if (OnlyToVGPR)
        return false;

      // Restore SGPR from a stack slot.
      // FIXME: We should use S_LOAD_DWORD here for VI.
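      // The value is reloaded into a temporary VGPR with SI_SPILL_V32_RESTORE
      // and then copied back to the SGPR with V_READFIRSTLANE_B32.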
      if (!TmpVGPR.isValid())
        TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
      unsigned Align = FrameInfo.getObjectAlignment(Index);

      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);

      MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
        MachineMemOperand::MOLoad, EltSize,
        MinAlign(Align, EltSize * i));

      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR)
        .addFrameIndex(Index)                 // vaddr
        .addReg(MFI->getScratchRSrcReg())     // srsrc
        .addReg(MFI->getStackPtrOffsetReg())  // soffset
        .addImm(i * 4)                        // offset
        .addMemOperand(MMO);

      auto MIB =
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
              .addReg(TmpVGPR, RegState::Kill);

      if (NumSubRegs > 1)
        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
    }
  }

  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
  return true;
}

/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
/// a VGPR and the stack slot can be safely eliminated when all other users are
/// handled.
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
  MachineBasicBlock::iterator MI,
  int FI,
  RegScavenger *RS) const {
  switch (MI->getOpcode()) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE:
    return spillSGPR(MI, FI, RS, true);
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
    return restoreSGPR(MI, FI, RS, true);
  default:
    llvm_unreachable("not an SGPR spill instruction");
  }
}

void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                        int SPAdj, unsigned FIOperandNum,
                                        RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  Register FrameReg = getFrameRegister(*MF);

  switch (MI->getOpcode()) {
    // SGPR register spill
    case AMDGPU::SI_SPILL_S1024_SAVE:
    case AMDGPU::SI_SPILL_S512_SAVE:
    case AMDGPU::SI_SPILL_S256_SAVE:
    case AMDGPU::SI_SPILL_S160_SAVE:
    case AMDGPU::SI_SPILL_S128_SAVE:
    case AMDGPU::SI_SPILL_S96_SAVE:
    case AMDGPU::SI_SPILL_S64_SAVE:
    case AMDGPU::SI_SPILL_S32_SAVE: {
      spillSGPR(MI, Index, RS);
      break;
    }

    // SGPR register restore
    case AMDGPU::SI_SPILL_S1024_RESTORE:
    case AMDGPU::SI_SPILL_S512_RESTORE:
    case AMDGPU::SI_SPILL_S256_RESTORE:
    case AMDGPU::SI_SPILL_S160_RESTORE:
    case AMDGPU::SI_SPILL_S128_RESTORE:
    case AMDGPU::SI_SPILL_S96_RESTORE:
    case AMDGPU::SI_SPILL_S64_RESTORE:
    case AMDGPU::SI_SPILL_S32_RESTORE: {
      restoreSGPR(MI, Index, RS);
      break;
    }

    // VGPR register spill
    case AMDGPU::SI_SPILL_V1024_SAVE:
    case AMDGPU::SI_SPILL_V512_SAVE:
    case AMDGPU::SI_SPILL_V256_SAVE:
    case AMDGPU::SI_SPILL_V160_SAVE:
    case AMDGPU::SI_SPILL_V128_SAVE:
    case AMDGPU::SI_SPILL_V96_SAVE:
    case AMDGPU::SI_SPILL_V64_SAVE:
    case AMDGPU::SI_SPILL_V32_SAVE:
    case AMDGPU::SI_SPILL_A1024_SAVE:
    case AMDGPU::SI_SPILL_A512_SAVE:
    case AMDGPU::SI_SPILL_A128_SAVE:
    case AMDGPU::SI_SPILL_A64_SAVE:
    case AMDGPU::SI_SPILL_A32_SAVE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);
      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
             MFI->getStackPtrOffsetReg());

      buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
            Index,
            VData->getReg(), VData->isKill(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
            FrameReg,
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
            *MI->memoperands_begin(),
            RS);
      MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
      MI->eraseFromParent();
      break;
    }
    case AMDGPU::SI_SPILL_V32_RESTORE:
    case AMDGPU::SI_SPILL_V64_RESTORE:
    case AMDGPU::SI_SPILL_V96_RESTORE:
    case AMDGPU::SI_SPILL_V128_RESTORE:
    case AMDGPU::SI_SPILL_V160_RESTORE:
    case AMDGPU::SI_SPILL_V256_RESTORE:
    case AMDGPU::SI_SPILL_V512_RESTORE:
    case AMDGPU::SI_SPILL_V1024_RESTORE:
    case AMDGPU::SI_SPILL_A32_RESTORE:
    case AMDGPU::SI_SPILL_A64_RESTORE:
    case AMDGPU::SI_SPILL_A128_RESTORE:
    case AMDGPU::SI_SPILL_A512_RESTORE:
    case AMDGPU::SI_SPILL_A1024_RESTORE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);
      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
             MFI->getStackPtrOffsetReg());

      buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
            Index,
            VData->getReg(), VData->isKill(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
            FrameReg,
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
            *MI->memoperands_begin(),
            RS);
      MI->eraseFromParent();
      break;
    }

    default: {
      const DebugLoc &DL = MI->getDebugLoc();
      bool IsMUBUF = TII->isMUBUF(*MI);

      if (!IsMUBUF && !MFI->isEntryFunction()) {
        // Convert to an absolute stack address by finding the offset from the
        // scratch wave base and scaling by the wave size.
        //
        // In an entry function/kernel the offset is already the absolute
        // address relative to the frame register.

        Register TmpDiffReg =
          RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);

        // If there's no free SGPR, in-place modify the FP
        Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg;

        bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
        Register ResultReg = IsCopy ?
          MI->getOperand(0).getReg() :
          RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
          .addReg(FrameReg)
          .addReg(MFI->getScratchWaveOffsetReg());

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        if (Offset == 0) {
          // XXX - This never happens because of emergency scavenging slot at 0?
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
            .addImm(ST.getWavefrontSizeLog2())
            .addReg(DiffReg);
        } else {
          if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
            Register ScaledReg =
              RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0);

            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
                    ScaledReg)
              .addImm(ST.getWavefrontSizeLog2())
              .addReg(DiffReg, RegState::Kill);

            const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;

            // TODO: Fold if use instruction is another add of a constant.
            if (IsVOP2 ||
                AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
              // FIXME: This can fail
              MIB.addImm(Offset);
              MIB.addReg(ScaledReg, RegState::Kill);
              if (!IsVOP2)
                MIB.addImm(0); // clamp bit
            } else {
              Register ConstOffsetReg =
                RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false);

              // This should always be able to use the unused carry out.
              assert(ConstOffsetReg && "this scavenge should not be able to fail");

              BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
                .addImm(Offset);
              MIB.addReg(ConstOffsetReg, RegState::Kill);
              MIB.addReg(ScaledReg, RegState::Kill);
              MIB.addImm(0); // clamp bit
            }
          } else {
            // We have to produce a carry out, and there isn't a free SGPR pair
            // for it. We can keep the whole computation on the SALU to avoid
            // clobbering an additional register at the cost of an extra mov.

            // We may have 1 free scratch SGPR even though a carry out is
            // unavailable. Only one additional mov is needed.
            Register TmpScaledReg =
              RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
            Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg;

            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
              .addReg(DiffReg, RegState::Kill)
              .addImm(ST.getWavefrontSizeLog2());
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
              .addReg(ScaledReg, RegState::Kill)
              .addImm(Offset);
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
              .addReg(ScaledReg, RegState::Kill);

            // If there were truly no free SGPRs, we need to undo everything.
            if (!TmpScaledReg.isValid()) {
              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
                .addReg(ScaledReg, RegState::Kill)
                .addImm(Offset);
              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
                .addReg(DiffReg, RegState::Kill)
                .addImm(ST.getWavefrontSizeLog2());
            }
          }
        }

        if (!TmpDiffReg.isValid()) {
          // Restore the FP.
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg)
            .addReg(FrameReg)
            .addReg(MFI->getScratchWaveOffsetReg());
        }

        // Don't introduce an extra copy if we're just materializing in a mov.
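        // If the original instruction was a V_MOV of the frame index, its def
        // register already received the computed address above; otherwise the
        // frame index operand is rewritten to use ResultReg.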
        if (IsCopy)
          MI->eraseFromParent();
        else
          FIOp.ChangeToRegister(ResultReg, false, false, true);
        return;
      }

      if (IsMUBUF) {
        // Disable offen so we don't need a 0 vgpr base.
        assert(static_cast<int>(FIOperandNum) ==
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::vaddr));

        assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
               MFI->getStackPtrOffsetReg());

        TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg);

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        int64_t OldImm
          = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
        int64_t NewOffset = OldImm + Offset;

        if (isUInt<12>(NewOffset) &&
            buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
          MI->eraseFromParent();
          return;
        }
      }

      // If the offset is simply too big, don't convert to a scratch wave offset
      // relative index.

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
        Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }
}

StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
  return AMDGPUInstPrinter::getRegisterName(Reg);
}

// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
  assert(!Register::isVirtualRegister(Reg));

  static const TargetRegisterClass *const BaseClasses[] = {
    &AMDGPU::VGPR_32RegClass,
    &AMDGPU::SReg_32RegClass,
    &AMDGPU::AGPR_32RegClass,
    &AMDGPU::VReg_64RegClass,
    &AMDGPU::SReg_64RegClass,
    &AMDGPU::AReg_64RegClass,
    &AMDGPU::VReg_96RegClass,
    &AMDGPU::SReg_96RegClass,
    &AMDGPU::VReg_128RegClass,
    &AMDGPU::SReg_128RegClass,
    &AMDGPU::AReg_128RegClass,
    &AMDGPU::VReg_160RegClass,
    &AMDGPU::SReg_160RegClass,
    &AMDGPU::VReg_256RegClass,
    &AMDGPU::SReg_256RegClass,
    &AMDGPU::VReg_512RegClass,
    &AMDGPU::SReg_512RegClass,
    &AMDGPU::AReg_512RegClass,
    &AMDGPU::SReg_1024RegClass,
    &AMDGPU::VReg_1024RegClass,
    &AMDGPU::AReg_1024RegClass,
    &AMDGPU::SCC_CLASSRegClass,
    &AMDGPU::Pseudo_SReg_32RegClass,
    &AMDGPU::Pseudo_SReg_128RegClass,
  };

  for (const TargetRegisterClass *BaseClass : BaseClasses) {
    if (BaseClass->contains(Reg)) {
      return BaseClass;
    }
  }
  return nullptr;
}

// TODO: It might be helpful to have some target specific flags in
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
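// hasVGPRs/hasAGPRs report whether a register class overlaps the VGPR/AGPR
// file by checking for a common subclass with the canonical vector class of
// the same bit width.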
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
  switch (Size) {
  case 32:
    return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
  case 64:
    return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
  case 96:
    return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
  case 128:
    return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
  case 160:
    return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr;
  case 256:
    return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
  case 512:
    return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
  case 1024:
    return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
  case 1:
    return getCommonSubClass(&AMDGPU::VReg_1RegClass, RC) != nullptr;
  default:
    assert(Size < 32 && "Invalid register class size");
    return false;
  }
}

bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
  if (Size < 32)
    return false;
  switch (Size) {
  case 32:
    return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr;
  case 64:
    return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr;
  case 96:
    return false;
  case 128:
    return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr;
  case 160:
  case 256:
    return false;
  case 512:
    return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr;
  case 1024:
    return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
                                         const TargetRegisterClass *SRC) const {
  switch (getRegSizeInBits(*SRC)) {
  case 32:
    return &AMDGPU::VGPR_32RegClass;
  case 64:
    return &AMDGPU::VReg_64RegClass;
  case 96:
    return &AMDGPU::VReg_96RegClass;
  case 128:
    return &AMDGPU::VReg_128RegClass;
  case 160:
    return &AMDGPU::VReg_160RegClass;
  case 256:
    return &AMDGPU::VReg_256RegClass;
  case 512:
    return &AMDGPU::VReg_512RegClass;
  case 1024:
    return &AMDGPU::VReg_1024RegClass;
  case 1:
    return &AMDGPU::VReg_1RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass(
                                         const TargetRegisterClass *SRC) const {
  switch (getRegSizeInBits(*SRC)) {
  case 32:
    return &AMDGPU::AGPR_32RegClass;
  case 64:
    return &AMDGPU::AReg_64RegClass;
  case 128:
    return &AMDGPU::AReg_128RegClass;
  case 512:
    return &AMDGPU::AReg_512RegClass;
  case 1024:
    return &AMDGPU::AReg_1024RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
                                         const TargetRegisterClass *VRC) const {
  switch (getRegSizeInBits(*VRC)) {
  case 32:
    return &AMDGPU::SGPR_32RegClass;
  case 64:
    return &AMDGPU::SReg_64RegClass;
  case 96:
    return &AMDGPU::SReg_96RegClass;
  case 128:
    return &AMDGPU::SGPR_128RegClass;
  case 160:
    return &AMDGPU::SReg_160RegClass;
  case 256:
    return &AMDGPU::SReg_256RegClass;
  case 512:
    return &AMDGPU::SReg_512RegClass;
  case 1024:
    return &AMDGPU::SReg_1024RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
  const TargetRegisterClass *RC, unsigned SubIdx) const {
  if (SubIdx == AMDGPU::NoSubRegister)
    return RC;

  // We can assume that each lane corresponds to one 32-bit register.
  unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes();
  if (isSGPRClass(RC)) {
    switch (Count) {
    case 1:
      return &AMDGPU::SGPR_32RegClass;
    case 2:
      return &AMDGPU::SReg_64RegClass;
    case 3:
      return &AMDGPU::SReg_96RegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 5:
      return &AMDGPU::SReg_160RegClass;
    case 8:
      return &AMDGPU::SReg_256RegClass;
    case 16:
      return &AMDGPU::SReg_512RegClass;
    case 32: /* fall-through */
    default:
      llvm_unreachable("Invalid sub-register class size");
    }
  } else if (hasAGPRs(RC)) {
    switch (Count) {
    case 1:
      return &AMDGPU::AGPR_32RegClass;
    case 2:
      return &AMDGPU::AReg_64RegClass;
    case 4:
      return &AMDGPU::AReg_128RegClass;
    case 16:
      return &AMDGPU::AReg_512RegClass;
    case 32: /* fall-through */
    default:
      llvm_unreachable("Invalid sub-register class size");
    }
  } else {
    switch (Count) {
    case 1:
      return &AMDGPU::VGPR_32RegClass;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    case 5:
      return &AMDGPU::VReg_160RegClass;
    case 8:
      return &AMDGPU::VReg_256RegClass;
    case 16:
      return &AMDGPU::VReg_512RegClass;
    case 32: /* fall-through */
    default:
      llvm_unreachable("Invalid sub-register class size");
    }
  }
}

bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
  if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
      OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
    return !ST.hasMFMAInlineLiteralBug();

  return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
         OpType <= AMDGPU::OPERAND_SRC_LAST;
}

bool SIRegisterInfo::shouldRewriteCopySrc(
  const TargetRegisterClass *DefRC,
  unsigned DefSubReg,
  const TargetRegisterClass *SrcRC,
  unsigned SrcSubReg) const {
  // We want to prefer the smallest register class possible, so we don't want to
  // stop and rewrite on anything that looks like a subregister
  // extract. Operations mostly don't care about the super register class, so we
  // only want to stop on the most basic of copies between the same register
  // class.
  //
  // e.g. if we have something like
  // %0 = ...
  // %1 = ...
  // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
  // %3 = COPY %2, sub0
  //
  // We want to look through the COPY to find:
  // => %3 = COPY %0

  // Plain copy.
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
}

/// Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
/// AMDGPU::NoRegister.
unsigned
SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
                                   const TargetRegisterClass *RC,
                                   const MachineFunction &MF) const {

  for (unsigned Reg : *RC)
    if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
      return Reg;
  return AMDGPU::NoRegister;
}

ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                   unsigned EltSize) const {
  if (EltSize == 4) {
    static const int16_t Sub0_31[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
      AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
      AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
      AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
      AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31,
    };

    static const int16_t Sub0_15[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
    };

    static const int16_t Sub0_7[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    };

    static const int16_t Sub0_4[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
    };

    static const int16_t Sub0_3[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    };

    static const int16_t Sub0_2[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
    };

    static const int16_t Sub0_1[] = {
      AMDGPU::sub0, AMDGPU::sub1,
    };

    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
    case 32:
      return {};
    case 64:
      return makeArrayRef(Sub0_1);
    case 96:
      return makeArrayRef(Sub0_2);
    case 128:
      return makeArrayRef(Sub0_3);
    case 160:
      return makeArrayRef(Sub0_4);
    case 256:
      return makeArrayRef(Sub0_7);
    case 512:
      return makeArrayRef(Sub0_15);
    case 1024:
      return makeArrayRef(Sub0_31);
    default:
      llvm_unreachable("unhandled register size");
    }
  }

  if (EltSize == 8) {
    static const int16_t Sub0_31_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
      AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
      AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
      AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
      AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
    };

    static const int16_t Sub0_15_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15
    };

    static const int16_t Sub0_7_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
    };

    static const int16_t Sub0_3_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
    };

    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
    case 64:
      return {};
    case 128:
      return makeArrayRef(Sub0_3_64);
    case 256:
      return makeArrayRef(Sub0_7_64);
    case 512:
      return makeArrayRef(Sub0_15_64);
    case 1024:
      return makeArrayRef(Sub0_31_64);
    default:
      llvm_unreachable("unhandled register size");
    }
  }

  if (EltSize == 16) {

    static const int16_t Sub0_31_128[] = {
      AMDGPU::sub0_sub1_sub2_sub3,
      AMDGPU::sub4_sub5_sub6_sub7,
      AMDGPU::sub8_sub9_sub10_sub11,
      AMDGPU::sub12_sub13_sub14_sub15,
      AMDGPU::sub16_sub17_sub18_sub19,
      AMDGPU::sub20_sub21_sub22_sub23,
      AMDGPU::sub24_sub25_sub26_sub27,
      AMDGPU::sub28_sub29_sub30_sub31
    };

    static const int16_t Sub0_15_128[] = {
      AMDGPU::sub0_sub1_sub2_sub3,
      AMDGPU::sub4_sub5_sub6_sub7,
      AMDGPU::sub8_sub9_sub10_sub11,
      AMDGPU::sub12_sub13_sub14_sub15
    };

    static const int16_t Sub0_7_128[] = {
      AMDGPU::sub0_sub1_sub2_sub3,
      AMDGPU::sub4_sub5_sub6_sub7
    };

    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
    case 128:
      return {};
    case 256:
      return makeArrayRef(Sub0_7_128);
    case 512:
      return makeArrayRef(Sub0_15_128);
    case 1024:
      return makeArrayRef(Sub0_31_128);
    default:
      llvm_unreachable("unhandled register size");
    }
  }

  assert(EltSize == 32 && "unhandled elt size");

  static const int16_t Sub0_31_256[] = {
    AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
    AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
    AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23,
    AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
  };

  static const int16_t Sub0_15_256[] = {
    AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
    AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15
  };

  switch (AMDGPU::getRegBitWidth(*RC->MC)) {
  case 256:
    return {};
  case 512:
    return makeArrayRef(Sub0_15_256);
  case 1024:
    return makeArrayRef(Sub0_31_256);
  default:
    llvm_unreachable("unhandled register size");
  }
}

const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
                                  unsigned Reg) const {
  if (Register::isVirtualRegister(Reg))
    return MRI.getRegClass(Reg);

  return getPhysRegClass(Reg);
}

bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                            unsigned Reg) const {
  const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
  assert(RC && "Register class for the reg not found");
  return hasVGPRs(RC);
}

bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
                            unsigned Reg) const {
  const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
  assert(RC && "Register class for the reg not found");
  return hasAGPRs(RC);
}

bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
                                    const TargetRegisterClass *SrcRC,
                                    unsigned SubReg,
                                    const TargetRegisterClass *DstRC,
                                    unsigned DstSubReg,
                                    const TargetRegisterClass *NewRC,
                                    LiveIntervals &LIS) const {
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
  unsigned DstSize = getRegSizeInBits(*DstRC);
  unsigned NewSize = getRegSizeInBits(*NewRC);

  // Do not increase size of registers beyond dword, we would need to allocate
  // adjacent registers and constrain regalloc more than needed.

  // Always allow dword coalescing.
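  // (Either the source or the destination is at most a single dword.)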
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
                                    const TargetRegisterClass *SrcRC,
                                    unsigned SubReg,
                                    const TargetRegisterClass *DstRC,
                                    unsigned DstSubReg,
                                    const TargetRegisterClass *NewRC,
                                    LiveIntervals &LIS) const {
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
  unsigned DstSize = getRegSizeInBits(*DstRC);
  unsigned NewSize = getRegSizeInBits(*NewRC);

  // Do not increase the size of registers beyond a dword: we would need to
  // allocate adjacent registers and constrain the register allocator more than
  // needed.

  // Always allow dword coalescing.
  if (SrcSize <= 32 || DstSize <= 32)
    return true;

  return NewSize <= DstSize || NewSize <= SrcSize;
}

unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                             MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
                                                       MF.getFunction());
  switch (RC->getID()) {
  default:
    return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
  case AMDGPU::VGPR_32RegClassID:
    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
  case AMDGPU::SGPR_32RegClassID:
    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
  }
}

unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                unsigned Idx) const {
  if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet())
    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  if (Idx == getSGPRPressureSet())
    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
}

const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
  static const int Empty[] = { -1 };

  if (hasRegUnit(AMDGPU::M0, RegUnit))
    return Empty;
  return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
}

unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
  // Not a callee-saved register.
  return AMDGPU::SGPR30_SGPR31;
}

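// Map a (bit width, register bank) pair to an allocatable register class.
// Sizes below 32 bits fall through to the 32-bit classes, and a size of 1
// selects a class for condition-like values, where the VCC bank's width
// depends on whether the subtarget runs in wave32 mode.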
const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
                                         const RegisterBank &RB,
                                         const MachineRegisterInfo &MRI) const {
  switch (Size) {
  case 1: {
    switch (RB.getID()) {
    case AMDGPU::VGPRRegBankID:
      return &AMDGPU::VGPR_32RegClass;
    case AMDGPU::VCCRegBankID:
      return isWave32 ?
        &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
    case AMDGPU::SGPRRegBankID:
      return &AMDGPU::SReg_32RegClass;
    case AMDGPU::SCCRegBankID:
      // This needs to return an allocatable class, so don't bother returning
      // the dummy SCC class.
      //
      // FIXME: This is a grotesque hack. We use SGPR_32 as an indication this
      // was not a VCC bank value since we use the larger class SReg_32 for
      // other values. These should all use SReg_32.
      return &AMDGPU::SGPR_32RegClass;
    default:
      llvm_unreachable("unknown register bank");
    }
  }
  case 32:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
                                                 &AMDGPU::SReg_32RegClass;
  case 64:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
                                                 &AMDGPU::SReg_64_XEXECRegClass;
  case 96:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
                                                 &AMDGPU::SReg_96RegClass;
  case 128:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
                                                 &AMDGPU::SGPR_128RegClass;
  case 160:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
                                                 &AMDGPU::SReg_160RegClass;
  case 256:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass :
                                                 &AMDGPU::SReg_256RegClass;
  case 512:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
                                                 &AMDGPU::SReg_512RegClass;
  case 1024:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_1024RegClass :
                                                 &AMDGPU::SReg_1024RegClass;
  default:
    if (Size < 32)
      return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
                                                   &AMDGPU::SReg_32RegClass;
    return nullptr;
  }
}

const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                         const MachineRegisterInfo &MRI) const {
  const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
  if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);

  const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
  return getAllocatableClass(RC);
}

unsigned SIRegisterInfo::getVCC() const {
  return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClass(unsigned RCID) const {
  switch ((int)RCID) {
  case AMDGPU::SReg_1RegClassID:
    return getBoolRC();
  case AMDGPU::SReg_1_XEXECRegClassID:
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case -1:
    return nullptr;
  default:
    return AMDGPURegisterInfo::getRegClass(RCID);
  }
}

// Find the register definition that reaches the given use, or return nullptr
// if it cannot be determined.
MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
                                              MachineInstr &Use,
                                              MachineRegisterInfo &MRI,
                                              LiveIntervals *LIS) const {
  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
  SlotIndex DefIdx;

  if (Register::isVirtualRegister(Reg)) {
    if (!LIS->hasInterval(Reg))
      return nullptr;
    LiveInterval &LI = LIS->getInterval(Reg);
    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
                                  : MRI.getMaxLaneMaskForVReg(Reg);
    VNInfo *V = nullptr;
    if (LI.hasSubRanges()) {
      for (auto &S : LI.subranges()) {
        if ((S.LaneMask & SubLanes) == SubLanes) {
          V = S.getVNInfoAt(UseIdx);
          break;
        }
      }
    } else {
      V = LI.getVNInfoAt(UseIdx);
    }
    if (!V)
      return nullptr;
    DefIdx = V->def;
  } else {
    // Find the last def.
    for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) {
      LiveRange &LR = LIS->getRegUnit(*Units);
      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
        if (!DefIdx.isValid() ||
            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
                          LIS->getInstructionFromIndex(V->def)))
          DefIdx = V->def;
      } else {
        return nullptr;
      }
    }
  }

  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);

  if (!Def || !MDT.dominates(Def, &Use))
    return nullptr;

  assert(Def->modifiesRegister(Reg, this));

  return Def;
}
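
// Illustrative note (not part of the upstream implementation): a hypothetical
// caller is expected to act on the result only when it is non-null, e.g.
//
//   if (MachineInstr *DefMI =
//           TRI->findReachingDef(Reg, SubReg, UseMI, MRI, LIS))
//     // ... fold or sink based on DefMI ...
//
// because findReachingDef returns nullptr whenever the defining instruction
// cannot be proven to dominate the use.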