//===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Copies from VGPR to SGPR registers are illegal and the register coalescer
/// will sometimes generate these illegal copies in situations like this:
///
///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   %1 <vsrc> = COPY %0 <sgpr>
///    ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <vsrc>
///
/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
/// code will look like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///    ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is forced to constrain the register class of %3 to
/// <sgpr> so we end up with final code like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///    ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <sgpr> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now this code contains an illegal copy from a VGPR to an SGPR.
///
/// In order to avoid this problem, this pass searches for PHI instructions
/// which define a <vsrc> register and constrains their definition class to
/// <vgpr> if a user of the PHI's definition register is a vector instruction.
/// If the PHI's definition class is constrained to <vgpr>, then the coalescer
/// will be unable to perform the COPY removal from the above example, which
/// ultimately led to the creation of an illegal COPY.
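///
/// In addition, when optimizing, this pass hoists and merges identical M0
/// initializations into a common dominating block (see hoistAndMergeSGPRInits
/// below); this behavior can be disabled with the -amdgpu-enable-merge-m0
/// option.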
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <list>
#include <map>
#include <tuple>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-fix-sgpr-copies"

static cl::opt<bool> EnableM0Merge(
  "amdgpu-enable-merge-m0",
  cl::desc("Merge and hoist M0 initializations"),
  cl::init(true));

namespace {

class SIFixSGPRCopies : public MachineFunctionPass {
  MachineDominatorTree *MDT;

public:
  static char ID;

  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  SIFixSGPRCopies() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  void processPHINode(MachineInstr &MI);

  StringRef getPassName() const override { return "SI Fix SGPR copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
                      "SI Fix SGPR copies", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
                    "SI Fix SGPR copies", false, false)

char SIFixSGPRCopies::ID = 0;

char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;

FunctionPass *llvm::createSIFixSGPRCopiesPass() {
  return new SIFixSGPRCopies();
}

static bool hasVectorOperands(const MachineInstr &MI,
                              const SIRegisterInfo *TRI) {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    if (!MI.getOperand(i).isReg() ||
        !Register::isVirtualRegister(MI.getOperand(i).getReg()))
      continue;

    if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
      return true;
  }
  return false;
}

static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
                  const SIRegisterInfo &TRI,
                  const MachineRegisterInfo &MRI) {
  Register DstReg = Copy.getOperand(0).getReg();
  Register SrcReg = Copy.getOperand(1).getReg();

  const TargetRegisterClass *SrcRC = Register::isVirtualRegister(SrcReg)
                                         ? MRI.getRegClass(SrcReg)
                                         : TRI.getPhysRegClass(SrcReg);

  // We don't really care about the subregister here.
  // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());

  const TargetRegisterClass *DstRC = Register::isVirtualRegister(DstReg)
                                         ? MRI.getRegClass(DstReg)
                                         : TRI.getPhysRegClass(DstReg);

  return std::make_pair(SrcRC, DstRC);
}

static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
         TRI.hasVectorRegisters(SrcRC);
}

static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
         TRI.hasVectorRegisters(DstRC);
}

static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
                                      const SIRegisterInfo *TRI,
                                      const SIInstrInfo *TII) {
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  auto &Src = MI.getOperand(1);
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = Src.getReg();
  if (!Register::isVirtualRegister(SrcReg) ||
      !Register::isVirtualRegister(DstReg))
    return false;

  for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
    const auto *UseMI = MO.getParent();
    if (UseMI == &MI)
      continue;
    if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
        UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END ||
        !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src))
      return false;
  }
  // Change VGPR to SGPR destination.
  MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
  return true;
}

// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
//  SGPRx = ...
//  SGPRy = REG_SEQUENCE SGPRx, sub0 ...
//  VGPRz = COPY SGPRy
//
// ==>
//
//  VGPRx = COPY SGPRx
//  VGPRz = REG_SEQUENCE VGPRx, sub0
//
// This exposes immediate folding opportunities when materializing 64-bit
// immediates.
static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
                                        const SIRegisterInfo *TRI,
                                        const SIInstrInfo *TII,
                                        MachineRegisterInfo &MRI) {
  assert(MI.isRegSequence());

  Register DstReg = MI.getOperand(0).getReg();
  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
    return false;

  if (!MRI.hasOneUse(DstReg))
    return false;

  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
  if (!CopyUse.isCopy())
    return false;

  // It is illegal to have vreg inputs to a physreg defining reg_sequence.
  if (Register::isPhysicalRegister(CopyUse.getOperand(0).getReg()))
    return false;

  const TargetRegisterClass *SrcRC, *DstRC;
  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);

  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
    return false;

  if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
    return true;

  // TODO: Could have multiple extracts?
  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
  if (SubReg != AMDGPU::NoSubRegister)
    return false;

  MRI.setRegClass(DstReg, DstRC);

  // SGPRx = ...
  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
  // VGPRz = COPY SGPRy
  //
  // =>
  //
  // VGPRx = COPY SGPRx
  // VGPRz = REG_SEQUENCE VGPRx, sub0

  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
  bool IsAGPR = TRI->hasAGPRs(DstRC);

  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
    Register SrcReg = MI.getOperand(I).getReg();
    unsigned SrcSubReg = MI.getOperand(I).getSubReg();

    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
    assert(TRI->isSGPRClass(SrcRC) &&
           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");

    SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);

    Register TmpReg = MRI.createVirtualRegister(NewSrcRC);

    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            TmpReg)
        .add(MI.getOperand(I));

    if (IsAGPR) {
      const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
      Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
        AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
              TmpAReg)
          .addReg(TmpReg, RegState::Kill);
      TmpReg = TmpAReg;
    }

    MI.getOperand(I).setReg(TmpReg);
  }

  CopyUse.eraseFromParent();
  return true;
}

static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                    const MachineInstr *MoveImm,
                                    const SIInstrInfo *TII,
                                    unsigned &SMovOp,
                                    int64_t &Imm) {
  if (Copy->getOpcode() != AMDGPU::COPY)
    return false;

  if (!MoveImm->isMoveImmediate())
    return false;

  const MachineOperand *ImmOp =
      TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
  if (!ImmOp->isImm())
    return false;

  // FIXME: Handle copies with sub-regs.
  if (Copy->getOperand(0).getSubReg())
    return false;

  switch (MoveImm->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_MOV_B32_e32:
    SMovOp = AMDGPU::S_MOV_B32;
    break;
  case AMDGPU::V_MOV_B64_PSEUDO:
    SMovOp = AMDGPU::S_MOV_B64;
    break;
  }
  Imm = ImmOp->getImm();
  return true;
}

template <class UnaryPredicate>
bool searchPredecessors(const MachineBasicBlock *MBB,
                        const MachineBasicBlock *CutOff,
                        UnaryPredicate Predicate) {
  if (MBB == CutOff)
    return false;

  DenseSet<const MachineBasicBlock *> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(MBB->pred_begin(),
                                               MBB->pred_end());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (!Visited.insert(MBB).second)
      continue;
    if (MBB == CutOff)
      continue;
    if (Predicate(MBB))
      return true;

    Worklist.append(MBB->pred_begin(), MBB->pred_end());
  }

  return false;
}

// Checks if there is a potential path from the From instruction to the To
// instruction. If CutOff is specified and sits on that path, we ignore the
// portion of the path above it and report To as not reachable.
static bool isReachable(const MachineInstr *From,
                        const MachineInstr *To,
                        const MachineBasicBlock *CutOff,
                        MachineDominatorTree &MDT) {
  // If either From block dominates To block or instructions are in the same
  // block and From is higher.
  if (MDT.dominates(From, To))
    return true;

  const MachineBasicBlock *MBBFrom = From->getParent();
  const MachineBasicBlock *MBBTo = To->getParent();
  if (MBBFrom == MBBTo)
    return false;

  // Instructions are in different blocks, do predecessor search.
  // We should almost never get here since we do not usually produce M0 stores
  // other than -1.
  return searchPredecessors(MBBTo, CutOff, [MBBFrom]
           (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
}

// Return the first non-prologue instruction in the block.
static MachineBasicBlock::iterator
getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
  MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
  while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
    ++I;

  return I;
}

// Hoist and merge identical SGPR initializations into a common predecessor.
// This is intended to combine M0 initializations, but can work with any
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
// execution.
static bool hoistAndMergeSGPRInits(unsigned Reg,
                                   const MachineRegisterInfo &MRI,
                                   const TargetRegisterInfo *TRI,
                                   MachineDominatorTree &MDT,
                                   const TargetInstrInfo *TII) {
  // List of inits by immediate value.
  using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
  InitListMap Inits;
  // List of clobbering instructions.
  SmallVector<MachineInstr*, 8> Clobbers;
  // List of instructions marked for deletion.
  SmallSet<MachineInstr*, 8> MergedInstrs;

  bool Changed = false;

  for (auto &MI : MRI.def_instructions(Reg)) {
    MachineOperand *Imm = nullptr;
    for (auto &MO : MI.operands()) {
      if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
          (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
        Imm = nullptr;
        break;
      } else if (MO.isImm())
        Imm = &MO;
    }
    if (Imm)
      Inits[Imm->getImm()].push_front(&MI);
    else
      Clobbers.push_back(&MI);
  }

  for (auto &Init : Inits) {
    auto &Defs = Init.second;

    for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
      MachineInstr *MI1 = *I1;

      for (auto I2 = std::next(I1); I2 != E; ) {
        MachineInstr *MI2 = *I2;

        // Check any possible interference
        auto interferes = [&](MachineBasicBlock::iterator From,
                              MachineBasicBlock::iterator To) -> bool {

          assert(MDT.dominates(&*To, &*From));

          auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
            const MachineBasicBlock *MBBFrom = From->getParent();
            const MachineBasicBlock *MBBTo = To->getParent();
            bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
            bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
            if (!MayClobberFrom && !MayClobberTo)
              return false;
            if ((MayClobberFrom && !MayClobberTo) ||
                (!MayClobberFrom && MayClobberTo))
              return true;
            // Both can clobber. This is not an interference only if both are
            // dominated by Clobber and belong to the same block, or if Clobber
            // properly dominates To; given that To >> From, Clobber then
            // dominates both and is located in a common dominator.
            return !((MBBFrom == MBBTo &&
                      MDT.dominates(Clobber, &*From) &&
                      MDT.dominates(Clobber, &*To)) ||
                     MDT.properlyDominates(Clobber->getParent(), MBBTo));
          };

          return (llvm::any_of(Clobbers, interferes)) ||
                 (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
                    return C.first != Init.first &&
                           llvm::any_of(C.second, interferes);
                  }));
        };

        if (MDT.dominates(MI1, MI2)) {
          if (!interferes(MI2, MI1)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI2->getParent()) << " " << *MI2);
            MergedInstrs.insert(MI2);
            Changed = true;
            ++I2;
            continue;
          }
        } else if (MDT.dominates(MI2, MI1)) {
          if (!interferes(MI1, MI2)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        } else {
          auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
                                                     MI2->getParent());
          if (!MBB) {
            ++I2;
            continue;
          }

          MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
          if (!interferes(MI1, I) && !interferes(MI2, I)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1
                       << "and moving from "
                       << printMBBReference(*MI2->getParent()) << " to "
                       << printMBBReference(*I->getParent()) << " " << *MI2);
            I->getParent()->splice(I, MI2->getParent(), MI2);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        }
        ++I2;
      }
      ++I1;
    }
  }

  // Remove initializations that were merged into another.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    auto I = Defs.begin();
    while (I != Defs.end()) {
      if (MergedInstrs.count(*I)) {
        (*I)->eraseFromParent();
        I = Defs.erase(I);
      } else
        ++I;
    }
  }

  // Try to schedule SGPR initializations as early as possible in the MBB.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    for (auto MI : Defs) {
      auto MBB = MI->getParent();
      MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
      MachineBasicBlock::reverse_iterator B(BoundaryMI);
      // Check if B should actually be a boundary. If not, set the previous
      // instruction as the boundary instead.
      if (!TII->isBasicBlockPrologue(*B))
        B++;

      auto R = std::next(MI->getReverseIterator());
      const unsigned Threshold = 50;
      // Search until B or Threshold for a place to insert the initialization.
      for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
        if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
            TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
          break;

      // Move to directly after R.
      if (&*--R != MI)
        MBB->splice(*R, MBB, MI);
    }
  }

  if (Changed)
    MRI.clearKillFlags(Reg);

  return Changed;
}

bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();
  MDT = &getAnalysis<MachineDominatorTree>();

  SmallVector<MachineInstr *, 16> Worklist;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      default:
        continue;
      case AMDGPU::COPY:
      case AMDGPU::WQM:
      case AMDGPU::SOFT_WQM:
      case AMDGPU::WWM: {
        Register DstReg = MI.getOperand(0).getReg();

        const TargetRegisterClass *SrcRC, *DstRC;
        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

        if (!Register::isVirtualRegister(DstReg)) {
          // If the destination register is a physical register there isn't
          // really much we can do to fix this.
          // Some special instructions use M0 as an input. Some even only use
          // the first lane. Insert a readfirstlane and hope for the best.
          if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
            Register TmpReg
              = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

            BuildMI(MBB, MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
                .add(MI.getOperand(1));
            MI.getOperand(1).setReg(TmpReg);
          }

          continue;
        }

        if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
          Register SrcReg = MI.getOperand(1).getReg();
          if (!Register::isVirtualRegister(SrcReg)) {
            TII->moveToVALU(MI, MDT);
            break;
          }

          MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
          unsigned SMovOp;
          int64_t Imm;
          // If we are just copying an immediate, we can replace the copy with
          // s_mov_b32.
          if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
            MI.getOperand(1).ChangeToImmediate(Imm);
            MI.addImplicitDefUseOperands(MF);
            MI.setDesc(TII->get(SMovOp));
            break;
          }
          TII->moveToVALU(MI, MDT);
        } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
          tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
        }

        break;
      }
      case AMDGPU::PHI: {
        processPHINode(MI);
        break;
      }
      case AMDGPU::REG_SEQUENCE:
        if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
            !hasVectorOperands(MI, TRI)) {
          foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
          continue;
        }

        LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);

        TII->moveToVALU(MI, MDT);
        break;
      case AMDGPU::INSERT_SUBREG: {
        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
        DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
        Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
        Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
        if (TRI->isSGPRClass(DstRC) &&
            (TRI->hasVectorRegisters(Src0RC) ||
             TRI->hasVectorRegisters(Src1RC))) {
          LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
          TII->moveToVALU(MI, MDT);
        }
        break;
      }
      case AMDGPU::V_WRITELANE_B32: {
        // Some architectures allow more than one constant bus access without
        // SGPR restriction.
        if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
          break;

        // Writelane is special in that it can use SGPR and M0 (which would
        // normally count as using the constant bus twice - but in this case it
        // is allowed since the lane selector doesn't count as a use of the
        // constant bus). However, it is still required to abide by the 1 SGPR
        // rule. Apply a fix here as we might have multiple SGPRs after
        // legalizing VGPRs to SGPRs.
        int Src0Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
        int Src1Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
        MachineOperand &Src0 = MI.getOperand(Src0Idx);
        MachineOperand &Src1 = MI.getOperand(Src1Idx);

        // Check to see if the instruction violates the 1 SGPR rule.
        if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
             Src0.getReg() != AMDGPU::M0) &&
            (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
             Src1.getReg() != AMDGPU::M0)) {

          // Check for trivially easy constant prop into one of the operands.
          // If this is the case, then perform the operation now to resolve
          // the SGPR issue.
          // If we don't do that here, we will always insert a mov to m0
          // that can't be resolved by the later operand folding pass.
          bool Resolved = false;
          for (MachineOperand *MO : {&Src0, &Src1}) {
            if (Register::isVirtualRegister(MO->getReg())) {
              MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
              if (DefMI && TII->isFoldableCopy(*DefMI)) {
                const MachineOperand &Def = DefMI->getOperand(0);
                if (Def.isReg() &&
                    MO->getReg() == Def.getReg() &&
                    MO->getSubReg() == Def.getSubReg()) {
                  const MachineOperand &Copied = DefMI->getOperand(1);
                  if (Copied.isImm() &&
                      TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
                    MO->ChangeToImmediate(Copied.getImm());
                    Resolved = true;
                    break;
                  }
                }
              }
            }
          }

          if (!Resolved) {
            // Haven't managed to resolve by replacing an SGPR with an
            // immediate. Move src1 to be in M0.
            BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::COPY), AMDGPU::M0)
                .add(Src1);
            Src1.ChangeToRegister(AMDGPU::M0, false);
          }
        }
        break;
      }
      }
    }
  }

  if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
    hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);

  return true;
}

void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
  unsigned numVGPRUses = 0;
  bool AllAGPRUses = true;
  SetVector<const MachineInstr *> worklist;
  SmallSet<const MachineInstr *, 4> Visited;
  worklist.insert(&MI);
  Visited.insert(&MI);
  while (!worklist.empty()) {
    const MachineInstr *Instr = worklist.pop_back_val();
    unsigned Reg = Instr->getOperand(0).getReg();
    for (const auto &Use : MRI->use_operands(Reg)) {
      const MachineInstr *UseMI = Use.getParent();
      AllAGPRUses &= (UseMI->isCopy() &&
                      TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
                     TRI->isAGPR(*MRI, Use.getReg());
      if (UseMI->isCopy() || UseMI->isRegSequence()) {
        if (UseMI->isCopy() &&
            UseMI->getOperand(0).getReg().isPhysical() &&
            !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
          numVGPRUses++;
        }
        if (Visited.insert(UseMI).second)
          worklist.insert(UseMI);

        continue;
      }

      if (UseMI->isPHI()) {
        const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
        if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
            UseRC != &AMDGPU::VReg_1RegClass)
          numVGPRUses++;
        continue;
      }

      const TargetRegisterClass *OpRC =
          TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
      if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
          OpRC != &AMDGPU::VS_64RegClass) {
        numVGPRUses++;
      }
    }
  }

  Register PHIRes = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
  if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
    LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
    MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
  }

  bool hasVGPRInput = false;
  for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
    unsigned InputReg = MI.getOperand(i).getReg();
    MachineInstr *Def = MRI->getVRegDef(InputReg);
    if (TRI->isVectorRegister(*MRI, InputReg)) {
      if (Def->isCopy()) {
        unsigned SrcReg = Def->getOperand(1).getReg();
        const TargetRegisterClass *RC =
            TRI->getRegClassForReg(*MRI, SrcReg);
        if (TRI->isSGPRClass(RC))
          continue;
      }
      hasVGPRInput = true;
      break;
    } else if (Def->isCopy() &&
               TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
      hasVGPRInput = true;
      break;
    }
  }

  if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
       RC0 != &AMDGPU::VReg_1RegClass) &&
      (hasVGPRInput || numVGPRUses > 1)) {
    LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
    TII->moveToVALU(MI);
  } else {
    LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
    TII->legalizeOperands(MI, MDT);
  }
}