//===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Copies from VGPR to SGPR registers are illegal and the register coalescer
/// will sometimes generate these illegal copies in situations like this:
///
///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   %1 <vsrc> = COPY %0 <sgpr>
///    ...
///    BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <vsrc>
///
/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
/// code will look like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///    ...
///    BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is forced to constrain the register class of %3 to
/// <sgpr>, so we end up with final code like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///    ...
///    BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <sgpr> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now this code contains an illegal copy from a VGPR to an SGPR.
///
/// In order to avoid this problem, this pass searches for PHI instructions
/// that define a <vsrc> register and constrains the definition class to
/// <vgpr> if a user of the PHI's definition register is a vector instruction.
/// If the PHI's definition class is constrained to <vgpr>, then the coalescer
/// will be unable to perform the COPY removal from the above example, which
/// ultimately led to the creation of an illegal COPY.
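///
/// As a rough illustration of the intended outcome (not taken verbatim from
/// the pass output), keeping the PHI result in <vgpr> means the example above
/// instead ends up roughly as:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   %1 <vgpr> = COPY %0 <sgpr>
///    ...
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vgpr> = COPY %2 <vgpr>
/// BB2:
///   %4 <vgpr> = PHI %1 <vgpr>, <%bb.0>, %3 <vgpr>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <vgpr>
///
/// where every remaining copy is a legal SGPR-to-VGPR copy.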
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <list>
#include <map>
#include <tuple>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-fix-sgpr-copies"

static cl::opt<bool> EnableM0Merge(
  "amdgpu-enable-merge-m0",
  cl::desc("Merge and hoist M0 initializations"),
  cl::init(true));

namespace {

class SIFixSGPRCopies : public MachineFunctionPass {
  MachineDominatorTree *MDT;

public:
  static char ID;

  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  SIFixSGPRCopies() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  void processPHINode(MachineInstr &MI);

  StringRef getPassName() const override { return "SI Fix SGPR copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
                      "SI Fix SGPR copies", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
                    "SI Fix SGPR copies", false, false)

char SIFixSGPRCopies::ID = 0;

char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;

FunctionPass *llvm::createSIFixSGPRCopiesPass() {
  return new SIFixSGPRCopies();
}

static bool hasVectorOperands(const MachineInstr &MI,
                              const SIRegisterInfo *TRI) {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    if (!MI.getOperand(i).isReg() ||
        !Register::isVirtualRegister(MI.getOperand(i).getReg()))
      continue;

    if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
      return true;
  }
  return false;
}

static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
                  const SIRegisterInfo &TRI,
                  const MachineRegisterInfo &MRI) {
  Register DstReg = Copy.getOperand(0).getReg();
  Register SrcReg = Copy.getOperand(1).getReg();

  const TargetRegisterClass *SrcRC = Register::isVirtualRegister(SrcReg)
                                         ? MRI.getRegClass(SrcReg)
                                         : TRI.getPhysRegClass(SrcReg);

  // We don't really care about the subregister here.
  // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());

  const TargetRegisterClass *DstRC = Register::isVirtualRegister(DstReg)
                                         ? MRI.getRegClass(DstReg)
                                         : TRI.getPhysRegClass(DstReg);

  return std::make_pair(SrcRC, DstRC);
}

static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
         TRI.hasVectorRegisters(SrcRC);
}

static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
         TRI.hasVectorRegisters(DstRC);
}

static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
                                      const SIRegisterInfo *TRI,
                                      const SIInstrInfo *TII) {
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  auto &Src = MI.getOperand(1);
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = Src.getReg();
  if (!Register::isVirtualRegister(SrcReg) ||
      !Register::isVirtualRegister(DstReg))
    return false;

  for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
    const auto *UseMI = MO.getParent();
    if (UseMI == &MI)
      continue;
    if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
        UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END ||
        !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src))
      return false;
  }
  // Change VGPR to SGPR destination.
  MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
  return true;
}

// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
//  SGPRx = ...
//  SGPRy = REG_SEQUENCE SGPRx, sub0 ...
//  VGPRz = COPY SGPRy
//
// ==>
//
//  VGPRx = COPY SGPRx
//  VGPRz = REG_SEQUENCE VGPRx, sub0
//
// This exposes immediate folding opportunities when materializing 64-bit
// immediates.
static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
                                        const SIRegisterInfo *TRI,
                                        const SIInstrInfo *TII,
                                        MachineRegisterInfo &MRI) {
  assert(MI.isRegSequence());

  Register DstReg = MI.getOperand(0).getReg();
  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
    return false;

  if (!MRI.hasOneUse(DstReg))
    return false;

  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
  if (!CopyUse.isCopy())
    return false;

  // It is illegal to have vreg inputs to a physreg defining reg_sequence.
  if (Register::isPhysicalRegister(CopyUse.getOperand(0).getReg()))
    return false;

  const TargetRegisterClass *SrcRC, *DstRC;
  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);

  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
    return false;

  if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
    return true;

  // TODO: Could have multiple extracts?
  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
  if (SubReg != AMDGPU::NoSubRegister)
    return false;

  MRI.setRegClass(DstReg, DstRC);

  // SGPRx = ...
  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
  // VGPRz = COPY SGPRy
  //
  // =>
  //
  // VGPRx = COPY SGPRx
  // VGPRz = REG_SEQUENCE VGPRx, sub0

  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
  bool IsAGPR = TRI->hasAGPRs(DstRC);

  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
    Register SrcReg = MI.getOperand(I).getReg();
    unsigned SrcSubReg = MI.getOperand(I).getSubReg();

    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
    assert(TRI->isSGPRClass(SrcRC) &&
           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");

    SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);

    Register TmpReg = MRI.createVirtualRegister(NewSrcRC);

    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            TmpReg)
        .add(MI.getOperand(I));

    if (IsAGPR) {
      const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
      Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
        AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
              TmpAReg)
          .addReg(TmpReg, RegState::Kill);
      TmpReg = TmpAReg;
    }

    MI.getOperand(I).setReg(TmpReg);
  }

  CopyUse.eraseFromParent();
  return true;
}

static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                    const MachineInstr *MoveImm,
                                    const SIInstrInfo *TII,
                                    unsigned &SMovOp,
                                    int64_t &Imm) {
  if (Copy->getOpcode() != AMDGPU::COPY)
    return false;

  if (!MoveImm->isMoveImmediate())
    return false;

  const MachineOperand *ImmOp =
      TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
  if (!ImmOp->isImm())
    return false;

  // FIXME: Handle copies with sub-regs.
  if (Copy->getOperand(0).getSubReg())
    return false;

  switch (MoveImm->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_MOV_B32_e32:
    SMovOp = AMDGPU::S_MOV_B32;
    break;
  case AMDGPU::V_MOV_B64_PSEUDO:
    SMovOp = AMDGPU::S_MOV_B64;
    break;
  }
  Imm = ImmOp->getImm();
  return true;
}

template <class UnaryPredicate>
bool searchPredecessors(const MachineBasicBlock *MBB,
                        const MachineBasicBlock *CutOff,
                        UnaryPredicate Predicate) {
  if (MBB == CutOff)
    return false;

  DenseSet<const MachineBasicBlock *> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(MBB->pred_begin(),
                                               MBB->pred_end());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (!Visited.insert(MBB).second)
      continue;
    if (MBB == CutOff)
      continue;
    if (Predicate(MBB))
      return true;

    Worklist.append(MBB->pred_begin(), MBB->pred_end());
  }

  return false;
}

// Checks if there is a potential path from the From instruction to the To
// instruction. If CutOff is specified and it sits on that path, we ignore the
// portion of the path above CutOff and report that To is not reachable.
static bool isReachable(const MachineInstr *From,
                        const MachineInstr *To,
                        const MachineBasicBlock *CutOff,
                        MachineDominatorTree &MDT) {
  // There is a path if either the From block dominates the To block, or both
  // instructions are in the same block and From comes first.
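  //
  // Illustrative note (assumed CFG, not taken from the original source): with
  // blocks A -> B -> C, From in A and To in C, the predecessor walk below,
  // starting at C, reaches A and returns true; if CutOff is B, the walk does
  // not look past B, so the A portion of the path is ignored and we return
  // false.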
  if (MDT.dominates(From, To))
    return true;

  const MachineBasicBlock *MBBFrom = From->getParent();
  const MachineBasicBlock *MBBTo = To->getParent();
  if (MBBFrom == MBBTo)
    return false;

  // Instructions are in different blocks, do predecessor search.
  // We should almost never get here since we do not usually produce M0 stores
  // other than -1.
  return searchPredecessors(MBBTo, CutOff, [MBBFrom]
           (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
}

// Return the first non-prologue instruction in the block.
static MachineBasicBlock::iterator
getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
  MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
  while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
    ++I;

  return I;
}

// Hoist and merge identical SGPR initializations into a common predecessor.
// This is intended to combine M0 initializations, but can work with any
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
// execution.
static bool hoistAndMergeSGPRInits(unsigned Reg,
                                   const MachineRegisterInfo &MRI,
                                   const TargetRegisterInfo *TRI,
                                   MachineDominatorTree &MDT,
                                   const TargetInstrInfo *TII) {
  // List of inits by immediate value.
  using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
  InitListMap Inits;
  // List of clobbering instructions.
  SmallVector<MachineInstr*, 8> Clobbers;
  // List of instructions marked for deletion.
  SmallSet<MachineInstr*, 8> MergedInstrs;

  bool Changed = false;

  for (auto &MI : MRI.def_instructions(Reg)) {
    MachineOperand *Imm = nullptr;
    for (auto &MO : MI.operands()) {
      if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
          (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
        Imm = nullptr;
        break;
      } else if (MO.isImm())
        Imm = &MO;
    }
    if (Imm)
      Inits[Imm->getImm()].push_front(&MI);
    else
      Clobbers.push_back(&MI);
  }

  for (auto &Init : Inits) {
    auto &Defs = Init.second;

    for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
      MachineInstr *MI1 = *I1;

      for (auto I2 = std::next(I1); I2 != E; ) {
        MachineInstr *MI2 = *I2;

        // Check any possible interference.
        auto interferes = [&](MachineBasicBlock::iterator From,
                              MachineBasicBlock::iterator To) -> bool {

          assert(MDT.dominates(&*To, &*From));

          auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
            const MachineBasicBlock *MBBFrom = From->getParent();
            const MachineBasicBlock *MBBTo = To->getParent();
            bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
            bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
            if (!MayClobberFrom && !MayClobberTo)
              return false;
            if ((MayClobberFrom && !MayClobberTo) ||
                (!MayClobberFrom && MayClobberTo))
              return true;
            // The clobber may reach both From and To. This is not an
            // interference only if both are dominated by Clobber and belong
            // to the same block, or if Clobber's block properly dominates
            // To's block; given that To >> From, Clobber then dominates both
            // from a common dominator.
            return !((MBBFrom == MBBTo &&
                      MDT.dominates(Clobber, &*From) &&
                      MDT.dominates(Clobber, &*To)) ||
                     MDT.properlyDominates(Clobber->getParent(), MBBTo));
          };

          return (llvm::any_of(Clobbers, interferes)) ||
                 (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
                    return C.first != Init.first &&
                           llvm::any_of(C.second, interferes);
                  }));
        };

        if (MDT.dominates(MI1, MI2)) {
          if (!interferes(MI2, MI1)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI2->getParent()) << " " << *MI2);
            MergedInstrs.insert(MI2);
            Changed = true;
            ++I2;
            continue;
          }
        } else if (MDT.dominates(MI2, MI1)) {
          if (!interferes(MI1, MI2)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        } else {
          auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
                                                     MI2->getParent());
          if (!MBB) {
            ++I2;
            continue;
          }

          MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
          if (!interferes(MI1, I) && !interferes(MI2, I)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1
                       << "and moving from "
                       << printMBBReference(*MI2->getParent()) << " to "
                       << printMBBReference(*I->getParent()) << " " << *MI2);
            I->getParent()->splice(I, MI2->getParent(), MI2);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        }
        ++I2;
      }
      ++I1;
    }
  }

  // Remove initializations that were merged into another.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    auto I = Defs.begin();
    while (I != Defs.end()) {
      if (MergedInstrs.count(*I)) {
        (*I)->eraseFromParent();
        I = Defs.erase(I);
      } else
        ++I;
    }
  }

  // Try to schedule SGPR initializations as early as possible in the MBB.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    for (auto MI : Defs) {
      auto MBB = MI->getParent();
      MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
      MachineBasicBlock::reverse_iterator B(BoundaryMI);
      // Check if B should actually be a boundary. If not, set the previous
      // instruction as the boundary instead.
      if (!TII->isBasicBlockPrologue(*B))
        B++;

      auto R = std::next(MI->getReverseIterator());
      const unsigned Threshold = 50;
      // Search until B or Threshold for a place to insert the initialization.
      for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
        if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
            TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
          break;

      // Move to directly after R.
      if (&*--R != MI)
        MBB->splice(*R, MBB, MI);
    }
  }

  if (Changed)
    MRI.clearKillFlags(Reg);

  return Changed;
}

bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
  // Only need to run this on the SelectionDAG path.
  if (MF.getProperties().hasProperty(
        MachineFunctionProperties::Property::Selected))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();
  MDT = &getAnalysis<MachineDominatorTree>();

  SmallVector<MachineInstr *, 16> Worklist;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      default:
        continue;
      case AMDGPU::COPY:
      case AMDGPU::WQM:
      case AMDGPU::SOFT_WQM:
      case AMDGPU::WWM: {
        Register DstReg = MI.getOperand(0).getReg();

        const TargetRegisterClass *SrcRC, *DstRC;
        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

        if (!Register::isVirtualRegister(DstReg)) {
          // If the destination register is a physical register, there isn't
          // really much we can do to fix this.
          // Some special instructions use M0 as an input. Some even only use
          // the first lane. Insert a readfirstlane and hope for the best.
          if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
            Register TmpReg
              = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

            BuildMI(MBB, MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
                .add(MI.getOperand(1));
            MI.getOperand(1).setReg(TmpReg);
          }

          continue;
        }

        if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
          Register SrcReg = MI.getOperand(1).getReg();
          if (!Register::isVirtualRegister(SrcReg)) {
            TII->moveToVALU(MI, MDT);
            break;
          }

          MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
          unsigned SMovOp;
          int64_t Imm;
          // If we are just copying an immediate, we can replace the copy with
          // s_mov_b32.
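          //
          // A rough sketch of the rewrite below (illustrative MIR, not taken
          // from a real test): given
          //   %vgpr = V_MOV_B32_e32 42, implicit $exec
          //   %sgpr = COPY %vgpr
          // the COPY is rewritten in place to
          //   %sgpr = S_MOV_B32 42
          // and the original V_MOV is left in place; if it becomes dead it
          // can be removed by later cleanup.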
          if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
            MI.getOperand(1).ChangeToImmediate(Imm);
            MI.addImplicitDefUseOperands(MF);
            MI.setDesc(TII->get(SMovOp));
            break;
          }
          TII->moveToVALU(MI, MDT);
        } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
          tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
        }

        break;
      }
      case AMDGPU::PHI: {
        processPHINode(MI);
        break;
      }
      case AMDGPU::REG_SEQUENCE:
        if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
            !hasVectorOperands(MI, TRI)) {
          foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
          continue;
        }

        LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);

        TII->moveToVALU(MI, MDT);
        break;
      case AMDGPU::INSERT_SUBREG: {
        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
        DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
        Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
        Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
        if (TRI->isSGPRClass(DstRC) &&
            (TRI->hasVectorRegisters(Src0RC) ||
             TRI->hasVectorRegisters(Src1RC))) {
          LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
          TII->moveToVALU(MI, MDT);
        }
        break;
      }
      case AMDGPU::V_WRITELANE_B32: {
        // Some architectures allow more than one constant bus access without
        // an SGPR restriction.
        if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
          break;

        // Writelane is special in that it can use an SGPR and M0 (which would
        // normally count as using the constant bus twice - but in this case
        // it is allowed since the lane selector doesn't count as a use of the
        // constant bus). However, it is still required to abide by the 1 SGPR
        // rule. Apply a fix here as we might have multiple SGPRs after
        // legalizing VGPRs to SGPRs.
        int Src0Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
        int Src1Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
        MachineOperand &Src0 = MI.getOperand(Src0Idx);
        MachineOperand &Src1 = MI.getOperand(Src1Idx);

        // Check to see if the instruction violates the 1 SGPR rule.
        if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
             Src0.getReg() != AMDGPU::M0) &&
            (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
             Src1.getReg() != AMDGPU::M0)) {

          // Check for a trivially easy constant prop into one of the
          // operands. If this is the case, then perform the propagation now
          // to resolve the SGPR issue.
          // If we don't do that here, we will always insert a mov to m0 that
          // can't be resolved by the later operand folding pass.
          bool Resolved = false;
          for (MachineOperand *MO : {&Src0, &Src1}) {
            if (Register::isVirtualRegister(MO->getReg())) {
              MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
              if (DefMI && TII->isFoldableCopy(*DefMI)) {
                const MachineOperand &Def = DefMI->getOperand(0);
                if (Def.isReg() &&
                    MO->getReg() == Def.getReg() &&
                    MO->getSubReg() == Def.getSubReg()) {
                  const MachineOperand &Copied = DefMI->getOperand(1);
                  if (Copied.isImm() &&
                      TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
                    MO->ChangeToImmediate(Copied.getImm());
                    Resolved = true;
                    break;
                  }
                }
              }
            }
          }

          if (!Resolved) {
            // We haven't managed to resolve this by replacing an SGPR with an
            // immediate, so move src1 into M0.
            BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::COPY), AMDGPU::M0)
                .add(Src1);
            Src1.ChangeToRegister(AMDGPU::M0, false);
          }
        }
        break;
      }
      }
    }
  }

  if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
    hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);

  return true;
}

void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
  unsigned numVGPRUses = 0;
  bool AllAGPRUses = true;
  SetVector<const MachineInstr *> worklist;
  SmallSet<const MachineInstr *, 4> Visited;
  SetVector<MachineInstr *> PHIOperands;
  worklist.insert(&MI);
  Visited.insert(&MI);
  while (!worklist.empty()) {
    const MachineInstr *Instr = worklist.pop_back_val();
    unsigned Reg = Instr->getOperand(0).getReg();
    for (const auto &Use : MRI->use_operands(Reg)) {
      const MachineInstr *UseMI = Use.getParent();
      AllAGPRUses &= (UseMI->isCopy() &&
                      TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
                     TRI->isAGPR(*MRI, Use.getReg());
      if (UseMI->isCopy() || UseMI->isRegSequence()) {
        if (UseMI->isCopy() &&
            UseMI->getOperand(0).getReg().isPhysical() &&
            !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
          numVGPRUses++;
        }
        if (Visited.insert(UseMI).second)
          worklist.insert(UseMI);

        continue;
      }

      if (UseMI->isPHI()) {
        const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
        if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
            UseRC != &AMDGPU::VReg_1RegClass)
          numVGPRUses++;
        continue;
      }

      const TargetRegisterClass *OpRC =
          TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
      if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
          OpRC != &AMDGPU::VS_64RegClass) {
        numVGPRUses++;
      }
    }
  }

  Register PHIRes = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
  if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
    LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
    MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
    for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
      MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
      if (DefMI && DefMI->isPHI())
        PHIOperands.insert(DefMI);
    }
  }

  bool hasVGPRInput = false;
  for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
    unsigned InputReg = MI.getOperand(i).getReg();
    MachineInstr *Def = MRI->getVRegDef(InputReg);
    if (TRI->isVectorRegister(*MRI, InputReg)) {
      if (Def->isCopy()) {
        unsigned SrcReg = Def->getOperand(1).getReg();
        const TargetRegisterClass *RC =
            TRI->getRegClassForReg(*MRI, SrcReg);
        if (TRI->isSGPRClass(RC))
          continue;
      }
      hasVGPRInput = true;
      break;
    }
    else if (Def->isCopy() &&
             TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
      Register SrcReg = Def->getOperand(1).getReg();
      MachineInstr *SrcDef = MRI->getVRegDef(SrcReg);
      unsigned SMovOp;
      int64_t Imm;
      if (!isSafeToFoldImmIntoCopy(Def, SrcDef, TII, SMovOp, Imm)) {
        hasVGPRInput = true;
        break;
      } else {
        // Formally, if we did not do this right away, it would be done on the
        // next iteration of the runOnMachineFunction main loop. But why not
        // do it now if we can?
        MachineFunction *MF = MI.getParent()->getParent();
        Def->getOperand(1).ChangeToImmediate(Imm);
        Def->addImplicitDefUseOperands(*MF);
        Def->setDesc(TII->get(SMovOp));
      }
    }
  }

  if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
       RC0 != &AMDGPU::VReg_1RegClass) &&
      (hasVGPRInput || numVGPRUses > 1)) {
    LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
    TII->moveToVALU(MI);
  }
  else {
    LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
    TII->legalizeOperands(MI, MDT);
  }

  // Propagate the register class back to PHI operands which are PHIs
  // themselves.
  while (!PHIOperands.empty()) {
    processPHINode(*PHIOperands.pop_back_val());
  }
}