//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "SIRegisterInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

static bool hasPressureSet(const int *PSets, unsigned PSetID) {
  for (unsigned i = 0; PSets[i] != -1; ++i) {
    if (PSets[i] == (int)PSetID)
      return true;
  }
  return false;
}

void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
                                         BitVector &PressureSets) const {
  for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
    const int *PSets = getRegUnitPressureSets(*U);
    if (hasPressureSet(PSets, PSetID)) {
      PressureSets.set(PSetID);
      break;
    }
  }
}

static cl::opt<bool> EnableSpillSGPRToSMEM(
  "amdgpu-spill-sgpr-to-smem",
  cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
  cl::init(false));

static cl::opt<bool> EnableSpillSGPRToVGPR(
  "amdgpu-spill-sgpr-to-vgpr",
  cl::desc("Enable spilling SGPRs to VGPRs"),
  cl::ReallyHidden,
  cl::init(true));

SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
  AMDGPURegisterInfo(),
  SGPRPressureSets(getNumRegPressureSets()),
  VGPRPressureSets(getNumRegPressureSets()),
  AGPRPressureSets(getNumRegPressureSets()),
  SpillSGPRToVGPR(false),
  SpillSGPRToSMEM(false),
  isWave32(ST.isWave32()) {
  if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
    SpillSGPRToSMEM = true;
  else if (EnableSpillSGPRToVGPR)
    SpillSGPRToVGPR = true;

  unsigned NumRegPressureSets = getNumRegPressureSets();

  SGPRSetID = NumRegPressureSets;
  VGPRSetID = NumRegPressureSets;
  AGPRSetID = NumRegPressureSets;

  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
    classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
    classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
    classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets);
  }

  // Determine the number of reg units for each pressure set.
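  // The set with the most register units is then chosen below as the
  // canonical SGPR/VGPR/AGPR pressure set for this target.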
  std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0);
  for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
    const int *PSets = getRegUnitPressureSets(i);
    for (unsigned j = 0; PSets[j] != -1; ++j) {
      ++PressureSetRegUnits[PSets[j]];
    }
  }

  unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0;
  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
    if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
      VGPRSetID = i;
      VGPRMax = PressureSetRegUnits[i];
      continue;
    }
    if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
      SGPRSetID = i;
      SGPRMax = PressureSetRegUnits[i];
    }
    if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) {
      AGPRSetID = i;
      AGPRMax = PressureSetRegUnits[i];
      continue;
    }
  }

  assert(SGPRSetID < NumRegPressureSets &&
         VGPRSetID < NumRegPressureSets &&
         AGPRSetID < NumRegPressureSets);
}

unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
  const MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}

static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
  unsigned Reg;

  // Try to place it in a hole after PrivateSegmentBufferReg.
  if (RegCount & 3) {
    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
    // alignment constraints, so we have a hole where we can put the wave
    // offset.
    Reg = RegCount - 1;
  } else {
    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
    // wave offset before it.
    Reg = RegCount - 5;
  }

  return Reg;
}

unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
  const MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}

BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());

  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated.
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);

  // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
  // will result in bugs.
  if (isWave32) {
    Reserved.set(AMDGPU::VCC);
    Reserved.set(AMDGPU::VCC_HI);
  }

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
    Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
    // Reserve 1 SGPR for scratch wave offset in case we need to spill.
    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
  }

  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
  }

  // We have to assume the SP is needed in case there are calls in the function,
  // which is detected after the function is lowered. If we aren't really going
  // to need SP, don't bother reserving it.
  unsigned StackPtrReg = MFI->getStackPtrOffsetReg();

  if (StackPtrReg != AMDGPU::NoRegister) {
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
  }

  unsigned FrameReg = MFI->getFrameOffsetReg();
  if (FrameReg != AMDGPU::NoRegister) {
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

  for (unsigned Reg : MFI->WWMReservedRegs) {
    reserveRegisterTuples(Reserved, Reg);
  }

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  return Reserved;
}

bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry, the base address is 0, so it can't possibly need any more
  // alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isEntryFunction())
    return false;

  return TargetRegisterInfo::canRealignStack(MF);
}

bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}

bool SIRegisterInfo::requiresFrameIndexScavenging(
  const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  if (MFI.hasStackObjects())
    return true;

  // May need to deal with callee saved registers.
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return !Info->isEntryFunction();
}

bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
  const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.hasStackObjects())
    return false;

  // The scavenger is used for large frames which may require finding a free
  // register for large offsets.
  if (!isUInt<12>(MFI.getStackSize()))
    return true;

  // If using scalar stores, for spills, m0 is needed for the scalar store
  // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual
  // register for it during frame index elimination, so the scavenger is
  // directly needed.
  return MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
         MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
  const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}

bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
  // This helps catch bugs as verifier errors.
  return true;
}

int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI))
    return 0;

  assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::vaddr) &&
         "Should never see frame index on non-address operand");

  return getMUBUFInstrOffset(MI);
}

bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  if (!MI->mayLoadOrStore())
    return false;

  int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);

  return !isUInt<12>(FullOffset);
}

void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                  unsigned BaseReg,
                                                  int FrameIdx,
                                                  int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = Subtarget.getInstrInfo();

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
      .addFrameIndex(FrameIdx);
    return;
  }

  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
    .addFrameIndex(FrameIdx);

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(FIReg)
    .addImm(0); // clamp bit
}

void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
                                       int64_t Offset) const {

  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = Subtarget.getInstrInfo();

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
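  // In debug builds, verify that at most one operand of MI is a frame index.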
  bool SeenFI = false;
  for (const MachineOperand &MO: MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI));
  assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
         MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
         "should only be seeing frame offset relative FrameIndex");

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;
  assert(isUInt<12>(NewOffset) && "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}

bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        unsigned BaseReg,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI))
    return false;

  int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);

  return isUInt<12>(NewOffset);
}

const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
  const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}

static unsigned getNumSubRegsForSpillOp(unsigned Op) {

  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
                                           int Index,
                                           unsigned Lane,
                                           unsigned ValueReg,
                                           bool IsKill) {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
                                                   : AMDGPU::V_ACCVGPR_READ_B32;

  return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
           .addReg(Src, getKillRegState(IsKill));
}

// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
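// Returns true if the spill/restore was handled here, either by folding it
// into an offset-form MUBUF instruction or by copying to/from an AGPR;
// returns false if the opcode has no offset-form equivalent.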
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
        .add(*Reg)
        .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
        .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
        .addImm(Offset)
        .addImm(0) // glc
        .addImm(0) // slc
        .addImm(0) // tfe
        .addImm(0) // dlc
        .cloneMemRefs(*MI);

  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}

void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
                                         unsigned LoadStoreOp,
                                         int Index,
                                         unsigned ValueReg,
                                         bool IsKill,
                                         unsigned ScratchRsrcReg,
                                         unsigned ScratchOffsetReg,
                                         int64_t InstOffset,
                                         MachineMemOperand *MMO,
                                         RegScavenger *RS) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();

  const MCInstrDesc &Desc = TII->get(LoadStoreOp);
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = Desc.mayStore();

  bool Scavenged = false;
  unsigned SOffset = ScratchOffsetReg;

  const unsigned EltSize = 4;
  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
  unsigned Size = NumSubRegs * EltSize;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  int64_t ScratchOffsetRegDelta = 0;

  unsigned Align = MFI.getObjectAlignment(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  Register TmpReg =
    hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg()
                 : Register();

  assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");

  if (!isUInt<12>(Offset + Size - EltSize)) {
    SOffset = AMDGPU::NoRegister;

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
    Offset *= ST.getWavefrontSize();

    // We don't have access to the register scavenger if this function is called
    // during PEI::scavengeFrameVirtualRegs().
    if (RS)
      SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);

    if (SOffset == AMDGPU::NoRegister) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR.
      // Our solution here is to add the offset directly to the ScratchOffset
      // register, and then subtract the offset after the spill to return
      // ScratchOffset to its original value.
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
      .addReg(ScratchOffsetReg)
      .addImm(Offset);

    Offset = 0;
  }

  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
    unsigned SubReg = NumSubRegs == 1 ?
      ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    if (i + 1 == e) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);

    if (!MIB.getInstr()) {
      unsigned FinalReg = SubReg;
      if (TmpReg != AMDGPU::NoRegister) {
        if (IsStore)
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
            .addReg(SubReg, getKillRegState(IsKill));
        SubReg = TmpReg;
      }

      MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
      MachineMemOperand *NewMMO
        = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
                                   EltSize, MinAlign(Align, EltSize * i));

      MIB = BuildMI(*MBB, MI, DL, Desc)
        .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
        .addReg(ScratchRsrcReg)
        .addReg(SOffset, SOffsetRegState)
        .addImm(Offset)
        .addImm(0) // glc
        .addImm(0) // slc
        .addImm(0) // tfe
        .addImm(0) // dlc
        .addMemOperand(NewMMO);

      if (!IsStore && TmpReg != AMDGPU::NoRegister)
        MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
                      FinalReg)
          .addReg(TmpReg, RegState::Kill);
    }

    if (NumSubRegs > 1)
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
  }

  if (ScratchOffsetRegDelta != 0) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
      .addReg(ScratchOffsetReg)
      .addImm(ScratchOffsetRegDelta);
  }
}

static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
                                                     bool Store) {
  if (SuperRegSize % 16 == 0) {
    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
                         AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
  }

  if (SuperRegSize % 8 == 0) {
    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
                        AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
  }

  return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
                      AMDGPU::S_BUFFER_LOAD_DWORD_SGPR };
}

bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS,
                               bool OnlyToVGPR) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  DenseSet<unsigned> SGPRSpillVGPRDefinedSet;

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
    = MFI->getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  MachineRegisterInfo &MRI = MF->getRegInfo();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned SuperReg = MI->getOperand(0).getReg();
  bool IsKill = MI->getOperand(0).isKill();
  const DebugLoc &DL = MI->getDebugLoc();

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();

  bool SpillToSMEM = spillSGPRToSMEM();
  if (SpillToSMEM && OnlyToVGPR)
    return false;

  Register FrameReg = getFrameRegister(*MF);

  assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
                         SuperReg != MFI->getFrameOffsetReg() &&
                         SuperReg != MFI->getScratchWaveOffsetReg()));

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  unsigned OffsetReg = AMDGPU::M0;
  unsigned M0CopyReg = AMDGPU::NoRegister;

  if (SpillToSMEM) {
    if (RS->isRegUsed(AMDGPU::M0)) {
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
        .addReg(AMDGPU::M0);
    }
  }

  unsigned ScalarStoreOp;
  unsigned EltSize = 4;
  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
  if (SpillToSMEM && isSGPRClass(RC)) {
    // XXX - if private_element_size is larger than 4 it might be useful to be
    // able to spill wider vmem spills.
    std::tie(EltSize, ScalarStoreOp) =
      getSpillEltSize(getRegSizeInBits(*RC) / 8, true);
  }

  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned SubReg = NumSubRegs == 1 ?
      SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToSMEM) {
      int64_t FrOffset = FrameInfo.getObjectOffset(Index);

      // The allocated memory size is really the wavefront size * the frame
      // index size. The widest register class is 64 bytes, so a 4-byte scratch
      // allocation is enough to spill this in a single stack object.
      //
      // FIXME: Frame size/offsets are computed earlier than this, so the extra
      // space is still unnecessarily allocated.

      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));

      // SMEM instructions only support a single offset, so increment the wave
      // offset.
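      // The byte offset below is the per-wave frame offset scaled by the
      // wavefront size, plus the offset of this element within the register.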

      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(FrameReg)
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(FrameReg);
      }

      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
        .addReg(SubReg, getKillRegState(IsKill)) // sdata
        .addReg(MFI->getScratchRSrcReg()) // sbase
        .addReg(OffsetReg, RegState::Kill) // soff
        .addImm(0) // glc
        .addImm(0) // dlc
        .addMemOperand(MMO);

      continue;
    }

    if (SpillToVGPR) {
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];

      // During SGPR spilling to VGPR, determine if the VGPR is defined. The
      // only circumstance in which we say it is undefined is when it is the
      // first spill to this VGPR in the first basic block.
      bool VGPRDefined = true;
      if (MBB == &MF->front())
        VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;

      // Mark the "old value of vgpr" input undef only if this is the first sgpr
      // spill to this specific vgpr in the first basic block.
      BuildMI(*MBB, MI, DL,
              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
              Spill.VGPR)
        .addReg(SubReg, getKillRegState(IsKill))
        .addImm(Spill.Lane)
        .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);

      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
    } else {
      // XXX - Can the to-VGPR spill fail for some subregisters but not others?
      if (OnlyToVGPR)
        return false;

      // Spill SGPR to a frame index.
      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      MachineInstrBuilder Mov
        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addReg(SubReg, SubKillState);

      // There could be undef components of a spilled super register.
      // TODO: Can we detect this and skip the spill?
      if (NumSubRegs > 1) {
        // The last implicit use of the SuperReg carries the "Kill" flag.
        unsigned SuperKillState = 0;
        if (i + 1 == e)
          SuperKillState |= getKillRegState(IsKill);
        Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
      }

      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
        .addReg(TmpReg, RegState::Kill) // src
        .addFrameIndex(Index) // vaddr
        .addReg(MFI->getScratchRSrcReg()) // srsrc
        .addReg(MFI->getStackPtrOffsetReg()) // soffset
        .addImm(i * 4) // offset
        .addMemOperand(MMO);
    }
  }

  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
  MFI->addToSpilledSGPRs(NumSubRegs);
  return true;
}

bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS,
                                 bool OnlyToVGPR) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
    = MFI->getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = MI->getDebugLoc();

  unsigned SuperReg = MI->getOperand(0).getReg();
  bool SpillToSMEM = spillSGPRToSMEM();
  if (SpillToSMEM && OnlyToVGPR)
    return false;

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  unsigned OffsetReg = AMDGPU::M0;
  unsigned M0CopyReg = AMDGPU::NoRegister;

  if (SpillToSMEM) {
    if (RS->isRegUsed(AMDGPU::M0)) {
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
        .addReg(AMDGPU::M0);
    }
  }

  unsigned EltSize = 4;
  unsigned ScalarLoadOp;

  Register FrameReg = getFrameRegister(*MF);

  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
  if (SpillToSMEM && isSGPRClass(RC)) {
    // XXX - if private_element_size is larger than 4 it might be useful to be
    // able to spill wider vmem spills.
    std::tie(EltSize, ScalarLoadOp) =
      getSpillEltSize(getRegSizeInBits(*RC) / 8, false);
  }

  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  int64_t FrOffset = FrameInfo.getObjectOffset(Index);

  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned SubReg = NumSubRegs == 1 ?
      SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToSMEM) {
      // FIXME: Size may be > 4 but extra bytes wasted.
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                   EltSize, MinAlign(Align, EltSize * i));

      // Add i * 4 offset
      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(FrameReg)
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(FrameReg);
      }

      auto MIB =
        BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
          .addReg(MFI->getScratchRSrcReg()) // sbase
          .addReg(OffsetReg, RegState::Kill) // soff
          .addImm(0) // glc
          .addImm(0) // dlc
          .addMemOperand(MMO);

      if (NumSubRegs > 1 && i == 0)
        MIB.addReg(SuperReg, RegState::ImplicitDefine);

      continue;
    }

    if (SpillToVGPR) {
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
      auto MIB =
        BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
                SubReg)
          .addReg(Spill.VGPR)
          .addImm(Spill.Lane);

      if (NumSubRegs > 1 && i == 0)
        MIB.addReg(SuperReg, RegState::ImplicitDefine);
    } else {
      if (OnlyToVGPR)
        return false;

      // Restore SGPR from a stack slot.
      // FIXME: We should use S_LOAD_DWORD here for VI.
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned Align = FrameInfo.getObjectAlignment(Index);

      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);

      MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
        MachineMemOperand::MOLoad, EltSize,
        MinAlign(Align, EltSize * i));

      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
        .addFrameIndex(Index) // vaddr
        .addReg(MFI->getScratchRSrcReg()) // srsrc
        .addReg(MFI->getStackPtrOffsetReg()) // soffset
        .addImm(i * 4) // offset
        .addMemOperand(MMO);

      auto MIB =
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
          .addReg(TmpReg, RegState::Kill);

      if (NumSubRegs > 1)
        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
    }
  }

  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
  return true;
}

/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
/// a VGPR and the stack slot can be safely eliminated when all other users are
/// handled.
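/// Only SGPR spill pseudo instructions are expected here; anything else is a
/// compiler error.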
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
  MachineBasicBlock::iterator MI,
  int FI,
  RegScavenger *RS) const {
  switch (MI->getOpcode()) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE:
    return spillSGPR(MI, FI, RS, true);
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
    return restoreSGPR(MI, FI, RS, true);
  default:
    llvm_unreachable("not an SGPR spill instruction");
  }
}

void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                        int SPAdj, unsigned FIOperandNum,
                                        RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  Register FrameReg = getFrameRegister(*MF);

  switch (MI->getOpcode()) {
  // SGPR register spill
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE: {
    spillSGPR(MI, Index, RS);
    break;
  }

  // SGPR register restore
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE: {
    restoreSGPR(MI, Index, RS);
    break;
  }

  // VGPR register spill
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A32_SAVE: {
    const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata);
    assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
           MFI->getStackPtrOffsetReg());

    buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
                        Index,
                        VData->getReg(), VData->isKill(),
                        TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
                        FrameReg,
                        TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
                        *MI->memoperands_begin(),
                        RS);
    MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_A1024_RESTORE: {
    const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata);
    assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
           MFI->getStackPtrOffsetReg());

    buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
                        Index,
                        VData->getReg(), VData->isKill(),
                        TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
                        FrameReg,
                        TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
                        *MI->memoperands_begin(),
                        RS);
    MI->eraseFromParent();
    break;
  }

  default: {
    const DebugLoc &DL = MI->getDebugLoc();
    bool IsMUBUF = TII->isMUBUF(*MI);

    if (!IsMUBUF && !MFI->isEntryFunction()) {
      // Convert to an absolute stack address by finding the offset from the
      // scratch wave base and scaling by the wave size.
      //
      // In an entry function/kernel the offset is already the absolute
      // address relative to the frame register.

      unsigned DiffReg
        = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

      bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
      Register ResultReg = IsCopy ?
        MI->getOperand(0).getReg() :
        MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
        .addReg(FrameReg)
        .addReg(MFI->getScratchWaveOffsetReg());

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      if (Offset == 0) {
        // XXX - This never happens because of emergency scavenging slot at 0?
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
          .addImm(Log2_32(ST.getWavefrontSize()))
          .addReg(DiffReg);
      } else {
        unsigned ScaledReg
          = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
          .addImm(Log2_32(ST.getWavefrontSize()))
          .addReg(DiffReg, RegState::Kill);

        // TODO: Fold if use instruction is another add of a constant.
        if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
          TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
            .addImm(Offset)
            .addReg(ScaledReg, RegState::Kill)
            .addImm(0); // clamp bit
        } else {
          unsigned ConstOffsetReg
            = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
            .addImm(Offset);
          TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
            .addReg(ConstOffsetReg, RegState::Kill)
            .addReg(ScaledReg, RegState::Kill)
            .addImm(0); // clamp bit
        }
      }

      // Don't introduce an extra copy if we're just materializing in a mov.
      if (IsCopy)
        MI->eraseFromParent();
      else
        FIOp.ChangeToRegister(ResultReg, false, false, true);
      return;
    }

    if (IsMUBUF) {
      // Disable offen so we don't need a 0 vgpr base.
      assert(static_cast<int>(FIOperandNum) ==
             AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                        AMDGPU::OpName::vaddr));

      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
             MFI->getStackPtrOffsetReg());

      TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg);

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      int64_t OldImm
        = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
      int64_t NewOffset = OldImm + Offset;

      if (isUInt<12>(NewOffset) &&
          buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
        MI->eraseFromParent();
        return;
      }
    }

    // If the offset is simply too big, don't convert to a scratch wave offset
    // relative index.

    int64_t Offset = FrameInfo.getObjectOffset(Index);
    FIOp.ChangeToImmediate(Offset);
    if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
        .addImm(Offset);
      FIOp.ChangeToRegister(TmpReg, false, false, true);
    }
  }
  }
}

StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
  const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg);
  unsigned Size = getRegSizeInBits(*RC);
  unsigned AltName = AMDGPU::NoRegAltName;

  switch (Size) {
  case 32: AltName = AMDGPU::Reg32; break;
  case 64: AltName = AMDGPU::Reg64; break;
  case 96: AltName = AMDGPU::Reg96; break;
  case 128: AltName = AMDGPU::Reg128; break;
  case 160: AltName = AMDGPU::Reg160; break;
  case 256: AltName = AMDGPU::Reg256; break;
  case 512: AltName = AMDGPU::Reg512; break;
  case 1024: AltName = AMDGPU::Reg1024; break;
  }
  return AMDGPUInstPrinter::getRegisterName(Reg, AltName);
}

// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
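// Linearly scans the base register classes below and returns the first one
// that contains Reg, or nullptr if none does.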
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
  assert(!TargetRegisterInfo::isVirtualRegister(Reg));

  static const TargetRegisterClass *const BaseClasses[] = {
    &AMDGPU::VGPR_32RegClass,
    &AMDGPU::SReg_32RegClass,
    &AMDGPU::AGPR_32RegClass,
    &AMDGPU::VReg_64RegClass,
    &AMDGPU::SReg_64RegClass,
    &AMDGPU::AReg_64RegClass,
    &AMDGPU::VReg_96RegClass,
    &AMDGPU::SReg_96RegClass,
    &AMDGPU::VReg_128RegClass,
    &AMDGPU::SReg_128RegClass,
    &AMDGPU::AReg_128RegClass,
    &AMDGPU::VReg_160RegClass,
    &AMDGPU::SReg_160RegClass,
    &AMDGPU::VReg_256RegClass,
    &AMDGPU::SReg_256RegClass,
    &AMDGPU::VReg_512RegClass,
    &AMDGPU::SReg_512RegClass,
    &AMDGPU::AReg_512RegClass,
    &AMDGPU::SReg_1024RegClass,
    &AMDGPU::VReg_1024RegClass,
    &AMDGPU::AReg_1024RegClass,
    &AMDGPU::SCC_CLASSRegClass,
    &AMDGPU::Pseudo_SReg_32RegClass,
    &AMDGPU::Pseudo_SReg_128RegClass,
  };

  for (const TargetRegisterClass *BaseClass : BaseClasses) {
    if (BaseClass->contains(Reg)) {
      return BaseClass;
    }
  }
  return nullptr;
}

// TODO: It might be helpful to have some target specific flags in
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
  if (Size < 32)
    return false;
  switch (Size) {
  case 32:
    return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
  case 64:
    return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
  case 96:
    return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
  case 128:
    return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
  case 160:
    return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr;
  case 256:
    return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
  case 512:
    return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
  case 1024:
    return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
  if (Size < 32)
    return false;
  switch (Size) {
  case 32:
    return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr;
  case 64:
    return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr;
  case 96:
    return false;
  case 128:
    return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr;
  case 160:
  case 256:
    return false;
  case 512:
    return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr;
  case 1024:
    return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
                                         const TargetRegisterClass *SRC) const {
  switch (getRegSizeInBits(*SRC)) {
  case 32:
    return &AMDGPU::VGPR_32RegClass;
  case 64:
    return &AMDGPU::VReg_64RegClass;
  case 96:
    return &AMDGPU::VReg_96RegClass;
  case 128:
    return &AMDGPU::VReg_128RegClass;
  case 160:
    return &AMDGPU::VReg_160RegClass;
  case 256:
    return &AMDGPU::VReg_256RegClass;
  case 512:
    return &AMDGPU::VReg_512RegClass;
  case 1024:
    return &AMDGPU::VReg_1024RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass(
                                         const TargetRegisterClass *SRC) const {
  switch (getRegSizeInBits(*SRC)) {
  case 32:
    return &AMDGPU::AGPR_32RegClass;
  case 64:
    return &AMDGPU::AReg_64RegClass;
  case 128:
    return &AMDGPU::AReg_128RegClass;
  case 512:
    return &AMDGPU::AReg_512RegClass;
  case 1024:
    return &AMDGPU::AReg_1024RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
                                         const TargetRegisterClass *VRC) const {
  switch (getRegSizeInBits(*VRC)) {
  case 32:
    return &AMDGPU::SGPR_32RegClass;
  case 64:
    return &AMDGPU::SReg_64RegClass;
  case 96:
    return &AMDGPU::SReg_96RegClass;
  case 128:
    return &AMDGPU::SReg_128RegClass;
  case 160:
    return &AMDGPU::SReg_160RegClass;
  case 256:
    return &AMDGPU::SReg_256RegClass;
  case 512:
    return &AMDGPU::SReg_512RegClass;
  case 1024:
    return &AMDGPU::SReg_1024RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
                         const TargetRegisterClass *RC, unsigned SubIdx) const {
  if (SubIdx == AMDGPU::NoSubRegister)
    return RC;

  // We can assume that each lane corresponds to one 32-bit register.
  unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes();
  if (isSGPRClass(RC)) {
    switch (Count) {
    case 1:
      return &AMDGPU::SGPR_32RegClass;
    case 2:
      return &AMDGPU::SReg_64RegClass;
    case 3:
      return &AMDGPU::SReg_96RegClass;
    case 4:
      return &AMDGPU::SReg_128RegClass;
    case 5:
      return &AMDGPU::SReg_160RegClass;
    case 8:
      return &AMDGPU::SReg_256RegClass;
    case 16:
      return &AMDGPU::SReg_512RegClass;
    case 32: /* fall-through */
    default:
      llvm_unreachable("Invalid sub-register class size");
    }
  } else if (hasAGPRs(RC)) {
    switch (Count) {
    case 1:
      return &AMDGPU::AGPR_32RegClass;
    case 2:
      return &AMDGPU::AReg_64RegClass;
    case 4:
      return &AMDGPU::AReg_128RegClass;
    case 16:
      return &AMDGPU::AReg_512RegClass;
    case 32: /* fall-through */
    default:
      llvm_unreachable("Invalid sub-register class size");
    }
  } else {
    switch (Count) {
    case 1:
      return &AMDGPU::VGPR_32RegClass;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    case 5:
      return &AMDGPU::VReg_160RegClass;
    case 8:
      return &AMDGPU::VReg_256RegClass;
    case 16:
      return &AMDGPU::VReg_512RegClass;
    case 32: /* fall-through */
    default:
      llvm_unreachable("Invalid sub-register class size");
    }
  }
}

bool SIRegisterInfo::shouldRewriteCopySrc(
  const TargetRegisterClass *DefRC,
  unsigned DefSubReg,
  const TargetRegisterClass *SrcRC,
  unsigned SrcSubReg) const {
  // We want to prefer the smallest register class possible, so we don't want to
  // stop and rewrite on anything that looks like a subregister
  // extract.
  // Operations mostly don't care about the super register class, so we
  // only want to stop on the most basic of copies between the same register
  // class.
  //
  // e.g. if we have something like
  // %0 = ...
  // %1 = ...
  // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
  // %3 = COPY %2, sub0
  //
  // We want to look through the COPY to find:
  // => %3 = COPY %0

  // Plain copy.
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
}

/// Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
/// AMDGPU::NoRegister.
unsigned
SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
                                   const TargetRegisterClass *RC,
                                   const MachineFunction &MF) const {

  for (unsigned Reg : *RC)
    if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
      return Reg;
  return AMDGPU::NoRegister;
}

ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                   unsigned EltSize) const {
  if (EltSize == 4) {
    static const int16_t Sub0_31[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
      AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
      AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
      AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
      AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31,
    };

    static const int16_t Sub0_15[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
    };

    static const int16_t Sub0_7[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    };

    static const int16_t Sub0_4[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
    };

    static const int16_t Sub0_3[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    };

    static const int16_t Sub0_2[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
    };

    static const int16_t Sub0_1[] = {
      AMDGPU::sub0, AMDGPU::sub1,
    };

    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
    case 32:
      return {};
    case 64:
      return makeArrayRef(Sub0_1);
    case 96:
      return makeArrayRef(Sub0_2);
    case 128:
      return makeArrayRef(Sub0_3);
    case 160:
      return makeArrayRef(Sub0_4);
    case 256:
      return makeArrayRef(Sub0_7);
    case 512:
      return makeArrayRef(Sub0_15);
    case 1024:
      return makeArrayRef(Sub0_31);
    default:
      llvm_unreachable("unhandled register size");
    }
  }

  if (EltSize == 8) {
    static const int16_t Sub0_31_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
      AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
      AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
      AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
      AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
    };

ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                   unsigned EltSize) const {
  if (EltSize == 4) {
    static const int16_t Sub0_31[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
      AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
      AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
      AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
      AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31,
    };

    static const int16_t Sub0_15[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
    };

    static const int16_t Sub0_7[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    };

    static const int16_t Sub0_4[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
    };

    static const int16_t Sub0_3[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    };

    static const int16_t Sub0_2[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
    };

    static const int16_t Sub0_1[] = {
      AMDGPU::sub0, AMDGPU::sub1,
    };

    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
    case 32:
      return {};
    case 64:
      return makeArrayRef(Sub0_1);
    case 96:
      return makeArrayRef(Sub0_2);
    case 128:
      return makeArrayRef(Sub0_3);
    case 160:
      return makeArrayRef(Sub0_4);
    case 256:
      return makeArrayRef(Sub0_7);
    case 512:
      return makeArrayRef(Sub0_15);
    case 1024:
      return makeArrayRef(Sub0_31);
    default:
      llvm_unreachable("unhandled register size");
    }
  }

  if (EltSize == 8) {
    static const int16_t Sub0_31_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
      AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
      AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
      AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
      AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
    };

    static const int16_t Sub0_15_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15
    };

    static const int16_t Sub0_7_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
    };

    static const int16_t Sub0_3_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
    };

    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
    case 64:
      return {};
    case 128:
      return makeArrayRef(Sub0_3_64);
    case 256:
      return makeArrayRef(Sub0_7_64);
    case 512:
      return makeArrayRef(Sub0_15_64);
    case 1024:
      return makeArrayRef(Sub0_31_64);
    default:
      llvm_unreachable("unhandled register size");
    }
  }

  if (EltSize == 16) {
    static const int16_t Sub0_31_128[] = {
      AMDGPU::sub0_sub1_sub2_sub3,
      AMDGPU::sub4_sub5_sub6_sub7,
      AMDGPU::sub8_sub9_sub10_sub11,
      AMDGPU::sub12_sub13_sub14_sub15,
      AMDGPU::sub16_sub17_sub18_sub19,
      AMDGPU::sub20_sub21_sub22_sub23,
      AMDGPU::sub24_sub25_sub26_sub27,
      AMDGPU::sub28_sub29_sub30_sub31
    };

    static const int16_t Sub0_15_128[] = {
      AMDGPU::sub0_sub1_sub2_sub3,
      AMDGPU::sub4_sub5_sub6_sub7,
      AMDGPU::sub8_sub9_sub10_sub11,
      AMDGPU::sub12_sub13_sub14_sub15
    };

    static const int16_t Sub0_7_128[] = {
      AMDGPU::sub0_sub1_sub2_sub3,
      AMDGPU::sub4_sub5_sub6_sub7
    };

    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
    case 128:
      return {};
    case 256:
      return makeArrayRef(Sub0_7_128);
    case 512:
      return makeArrayRef(Sub0_15_128);
    case 1024:
      return makeArrayRef(Sub0_31_128);
    default:
      llvm_unreachable("unhandled register size");
    }
  }

  assert(EltSize == 32 && "unhandled elt size");

  static const int16_t Sub0_31_256[] = {
    AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
    AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
    AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23,
    AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
  };

  static const int16_t Sub0_15_256[] = {
    AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
    AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15
  };

  switch (AMDGPU::getRegBitWidth(*RC->MC)) {
  case 256:
    return {};
  case 512:
    return makeArrayRef(Sub0_15_256);
  case 1024:
    return makeArrayRef(Sub0_31_256);
  default:
    llvm_unreachable("unhandled register size");
  }
}

const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
                                  unsigned Reg) const {
  if (TargetRegisterInfo::isVirtualRegister(Reg))
    return MRI.getRegClass(Reg);

  return getPhysRegClass(Reg);
}

bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                            unsigned Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
  assert(RC && "Register class for the reg not found");
  return hasVGPRs(RC);
}

bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
                            unsigned Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
  assert(RC && "Register class for the reg not found");
  return hasAGPRs(RC);
}

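/// Allow coalescing when either side is at most a dword, or when the combined
/// register class would not be wider than both of the original classes.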
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
                                    const TargetRegisterClass *SrcRC,
                                    unsigned SubReg,
                                    const TargetRegisterClass *DstRC,
                                    unsigned DstSubReg,
                                    const TargetRegisterClass *NewRC,
                                    LiveIntervals &LIS) const {
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
  unsigned DstSize = getRegSizeInBits(*DstRC);
  unsigned NewSize = getRegSizeInBits(*NewRC);

  // Do not increase the size of registers beyond a dword; we would need to
  // allocate adjacent registers and constrain regalloc more than needed.

  // Always allow dword coalescing.
  if (SrcSize <= 32 || DstSize <= 32)
    return true;

  return NewSize <= DstSize || NewSize <= SrcSize;
}

unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                             MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
                                                       MF.getFunction());
  switch (RC->getID()) {
  default:
    return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
  case AMDGPU::VGPR_32RegClassID:
    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
  case AMDGPU::SGPR_32RegClassID:
    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
  }
}

unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                unsigned Idx) const {
  if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet())
    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  if (Idx == getSGPRPressureSet())
    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
}

const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
  static const int Empty[] = { -1 };

  if (hasRegUnit(AMDGPU::M0, RegUnit))
    return Empty;
  return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
}

unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
  // Not a callee-saved register.
  return AMDGPU::SGPR30_SGPR31;
}

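/// Select an allocatable register class for a value of \p Size bits on
/// register bank \p RB. Booleans map to wave-mask or scalar classes; returns
/// null when no class of the requested size is available.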
const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
                                         const RegisterBank &RB,
                                         const MachineRegisterInfo &MRI) const {
  switch (Size) {
  case 1: {
    switch (RB.getID()) {
    case AMDGPU::VGPRRegBankID:
      return &AMDGPU::VGPR_32RegClass;
    case AMDGPU::VCCRegBankID:
      return isWave32 ?
        &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
    case AMDGPU::SGPRRegBankID:
      return &AMDGPU::SReg_32_XM0RegClass;
    case AMDGPU::SCCRegBankID:
      // This needs to return an allocatable class, so don't bother returning
      // the dummy SCC class.
      return &AMDGPU::SReg_32_XM0RegClass;
    default:
      llvm_unreachable("unknown register bank");
    }
  }
  case 32:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
                                                 &AMDGPU::SReg_32_XM0RegClass;
  case 64:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
                                                 &AMDGPU::SReg_64_XEXECRegClass;
  case 96:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
                                                 &AMDGPU::SReg_96RegClass;
  case 128:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
                                                 &AMDGPU::SReg_128RegClass;
  case 160:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
                                                 &AMDGPU::SReg_160RegClass;
  case 256:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass :
                                                 &AMDGPU::SReg_256RegClass;
  case 512:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
                                                 &AMDGPU::SReg_512RegClass;
  default:
    if (Size < 32)
      return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
                                                   &AMDGPU::SReg_32_XM0RegClass;
    return nullptr;
  }
}

const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                         const MachineRegisterInfo &MRI) const {
  if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()))
    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
  return nullptr;
}

unsigned SIRegisterInfo::getVCC() const {
  return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClass(unsigned RCID) const {
  switch ((int)RCID) {
  case AMDGPU::SReg_1RegClassID:
    return getBoolRC();
  case AMDGPU::SReg_1_XEXECRegClassID:
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case -1:
    return nullptr;
  default:
    return AMDGPURegisterInfo::getRegClass(RCID);
  }
}

// Find the reaching definition of \p Reg (or of the lanes selected by
// \p SubReg) that dominates \p Use, or return null if none is found.
MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
                                              MachineInstr &Use,
                                              MachineRegisterInfo &MRI,
                                              LiveIntervals *LIS) const {
  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
  SlotIndex DefIdx;

  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
    if (!LIS->hasInterval(Reg))
      return nullptr;
    LiveInterval &LI = LIS->getInterval(Reg);
    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
                                  : MRI.getMaxLaneMaskForVReg(Reg);
    VNInfo *V = nullptr;
    if (LI.hasSubRanges()) {
      for (auto &S : LI.subranges()) {
        if ((S.LaneMask & SubLanes) == SubLanes) {
          V = S.getVNInfoAt(UseIdx);
          break;
        }
      }
    } else {
      V = LI.getVNInfoAt(UseIdx);
    }
    if (!V)
      return nullptr;
    DefIdx = V->def;
  } else {
    // Find last def.
    for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) {
      LiveRange &LR = LIS->getRegUnit(*Units);
      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
        if (!DefIdx.isValid() ||
            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
                          LIS->getInstructionFromIndex(V->def)))
          DefIdx = V->def;
      } else {
        return nullptr;
      }
    }
  }

  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);

  if (!Def || !MDT.dominates(Def, &Use))
    return nullptr;

  assert(Def->modifiesRegister(Reg, this));

  return Def;
}