//===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Copies from VGPR to SGPR registers are illegal and the register coalescer
/// will sometimes generate these illegal copies in situations like this:
///
///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   %1 <vsrc> = COPY %0 <sgpr>
///    ...
///    BRANCH %cond BB1, BB2
///  BB1:
///    %2 <vgpr> = VECTOR_INST
///    %3 <vsrc> = COPY %2 <vgpr>
///  BB2:
///    %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
///    %5 <vgpr> = VECTOR_INST %4 <vsrc>
///
///
/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
/// code will look like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///    ...
///    BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is now forced to constrain the register class of %3 to
/// <sgpr> so we end up with final code like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///    ...
///    BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <sgpr> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now this code contains an illegal copy from a VGPR to an SGPR.
58 /// 59 /// In order to avoid this problem, this pass searches for PHI instructions 60 /// which define a <vsrc> register and constrains its definition class to 61 /// <vgpr> if the user of the PHI's definition register is a vector instruction. 62 /// If the PHI's definition class is constrained to <vgpr> then the coalescer 63 /// will be unable to perform the COPY removal from the above example which 64 /// ultimately led to the creation of an illegal COPY. 65 //===----------------------------------------------------------------------===// 66 67 #include "AMDGPU.h" 68 #include "GCNSubtarget.h" 69 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 70 #include "llvm/ADT/SetOperations.h" 71 #include "llvm/CodeGen/MachineDominators.h" 72 #include "llvm/InitializePasses.h" 73 #include "llvm/Target/TargetMachine.h" 74 75 using namespace llvm; 76 77 #define DEBUG_TYPE "si-fix-sgpr-copies" 78 79 static cl::opt<bool> EnableM0Merge( 80 "amdgpu-enable-merge-m0", 81 cl::desc("Merge and hoist M0 initializations"), 82 cl::init(true)); 83 84 namespace { 85 86 class V2SCopyInfo { 87 public: 88 // VGPR to SGPR copy being processed 89 MachineInstr *Copy; 90 // All SALU instructions reachable from this copy in SSA graph 91 DenseSet<MachineInstr *> SChain; 92 // Number of SGPR to VGPR copies that are used to put the SALU computation 93 // results back to VALU. 94 unsigned NumSVCopies; 95 96 unsigned Score; 97 // Actual count of v_readfirstlane_b32 98 // which need to be inserted to keep SChain SALU 99 unsigned NumReadfirstlanes; 100 // Current score state. To speedup selection V2SCopyInfos for processing 101 bool NeedToBeConvertedToVALU = false; 102 // Unique ID. Used as a key for mapping to keep permanent order. 
103 unsigned ID; 104 105 // Count of another VGPR to SGPR copies that contribute to the 106 // current copy SChain 107 unsigned SiblingPenalty = 0; 108 SetVector<unsigned> Siblings; 109 V2SCopyInfo() : Copy(nullptr), ID(0){}; 110 V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) 111 : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){}; 112 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 113 void dump() { 114 dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() 115 << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty 116 << "\nScore: " << Score << "\n"; 117 } 118 #endif 119 }; 120 121 class SIFixSGPRCopies : public MachineFunctionPass { 122 MachineDominatorTree *MDT; 123 SmallVector<MachineInstr*, 4> SCCCopies; 124 SmallVector<MachineInstr*, 4> RegSequences; 125 SmallVector<MachineInstr*, 4> PHINodes; 126 SmallVector<MachineInstr*, 4> S2VCopies; 127 unsigned NextVGPRToSGPRCopyID; 128 DenseMap<unsigned, V2SCopyInfo> V2SCopies; 129 DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty; 130 131 public: 132 static char ID; 133 134 MachineRegisterInfo *MRI; 135 const SIRegisterInfo *TRI; 136 const SIInstrInfo *TII; 137 138 SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {} 139 140 bool runOnMachineFunction(MachineFunction &MF) override; 141 void fixSCCCopies(MachineFunction &MF); 142 void prepareRegSequenceAndPHIs(MachineFunction &MF); 143 unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; } 144 bool needToBeConvertedToVALU(V2SCopyInfo *I); 145 void analyzeVGPRToSGPRCopy(MachineInstr *MI); 146 void lowerVGPR2SGPRCopies(MachineFunction &MF); 147 // Handles copies which source register is: 148 // 1. Physical register 149 // 2. AGPR 150 // 3. 
Defined by the instruction the merely moves the immediate 151 bool lowerSpecialCase(MachineInstr &MI, MachineBasicBlock::iterator &I); 152 153 void processPHINode(MachineInstr &MI); 154 155 StringRef getPassName() const override { return "SI Fix SGPR copies"; } 156 157 void getAnalysisUsage(AnalysisUsage &AU) const override { 158 AU.addRequired<MachineDominatorTree>(); 159 AU.addPreserved<MachineDominatorTree>(); 160 AU.setPreservesCFG(); 161 MachineFunctionPass::getAnalysisUsage(AU); 162 } 163 }; 164 165 } // end anonymous namespace 166 167 INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, 168 "SI Fix SGPR copies", false, false) 169 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) 170 INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, 171 "SI Fix SGPR copies", false, false) 172 173 char SIFixSGPRCopies::ID = 0; 174 175 char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID; 176 177 FunctionPass *llvm::createSIFixSGPRCopiesPass() { 178 return new SIFixSGPRCopies(); 179 } 180 181 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> 182 getCopyRegClasses(const MachineInstr &Copy, 183 const SIRegisterInfo &TRI, 184 const MachineRegisterInfo &MRI) { 185 Register DstReg = Copy.getOperand(0).getReg(); 186 Register SrcReg = Copy.getOperand(1).getReg(); 187 188 const TargetRegisterClass *SrcRC = SrcReg.isVirtual() 189 ? MRI.getRegClass(SrcReg) 190 : TRI.getPhysRegBaseClass(SrcReg); 191 192 // We don't really care about the subregister here. 193 // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); 194 195 const TargetRegisterClass *DstRC = DstReg.isVirtual() 196 ? 
MRI.getRegClass(DstReg) 197 : TRI.getPhysRegBaseClass(DstReg); 198 199 return std::pair(SrcRC, DstRC); 200 } 201 202 static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, 203 const TargetRegisterClass *DstRC, 204 const SIRegisterInfo &TRI) { 205 return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) && 206 TRI.hasVectorRegisters(SrcRC); 207 } 208 209 static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, 210 const TargetRegisterClass *DstRC, 211 const SIRegisterInfo &TRI) { 212 return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) && 213 TRI.hasVectorRegisters(DstRC); 214 } 215 216 static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, 217 const SIRegisterInfo *TRI, 218 const SIInstrInfo *TII) { 219 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 220 auto &Src = MI.getOperand(1); 221 Register DstReg = MI.getOperand(0).getReg(); 222 Register SrcReg = Src.getReg(); 223 if (!SrcReg.isVirtual() || !DstReg.isVirtual()) 224 return false; 225 226 for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) { 227 const auto *UseMI = MO.getParent(); 228 if (UseMI == &MI) 229 continue; 230 if (MO.isDef() || UseMI->getParent() != MI.getParent() || 231 UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) 232 return false; 233 234 unsigned OpIdx = UseMI->getOperandNo(&MO); 235 if (OpIdx >= UseMI->getDesc().getNumOperands() || 236 !TII->isOperandLegal(*UseMI, OpIdx, &Src)) 237 return false; 238 } 239 // Change VGPR to SGPR destination. 240 MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg))); 241 return true; 242 } 243 244 // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. 245 // 246 // SGPRx = ... 247 // SGPRy = REG_SEQUENCE SGPRx, sub0 ... 248 // VGPRz = COPY SGPRy 249 // 250 // ==> 251 // 252 // VGPRx = COPY SGPRx 253 // VGPRz = REG_SEQUENCE VGPRx, sub0 254 // 255 // This exposes immediate folding opportunities when materializing 64-bit 256 // immediates. 
257 static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, 258 const SIRegisterInfo *TRI, 259 const SIInstrInfo *TII, 260 MachineRegisterInfo &MRI) { 261 assert(MI.isRegSequence()); 262 263 Register DstReg = MI.getOperand(0).getReg(); 264 if (!TRI->isSGPRClass(MRI.getRegClass(DstReg))) 265 return false; 266 267 if (!MRI.hasOneUse(DstReg)) 268 return false; 269 270 MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg); 271 if (!CopyUse.isCopy()) 272 return false; 273 274 // It is illegal to have vreg inputs to a physreg defining reg_sequence. 275 if (CopyUse.getOperand(0).getReg().isPhysical()) 276 return false; 277 278 const TargetRegisterClass *SrcRC, *DstRC; 279 std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI); 280 281 if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) 282 return false; 283 284 if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII)) 285 return true; 286 287 // TODO: Could have multiple extracts? 288 unsigned SubReg = CopyUse.getOperand(1).getSubReg(); 289 if (SubReg != AMDGPU::NoSubRegister) 290 return false; 291 292 MRI.setRegClass(DstReg, DstRC); 293 294 // SGPRx = ... 295 // SGPRy = REG_SEQUENCE SGPRx, sub0 ... 
296 // VGPRz = COPY SGPRy 297 298 // => 299 // VGPRx = COPY SGPRx 300 // VGPRz = REG_SEQUENCE VGPRx, sub0 301 302 MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg()); 303 bool IsAGPR = TRI->isAGPRClass(DstRC); 304 305 for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { 306 const TargetRegisterClass *SrcRC = 307 TRI->getRegClassForOperandReg(MRI, MI.getOperand(I)); 308 assert(TRI->isSGPRClass(SrcRC) && 309 "Expected SGPR REG_SEQUENCE to only have SGPR inputs"); 310 const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC); 311 312 Register TmpReg = MRI.createVirtualRegister(NewSrcRC); 313 314 BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), 315 TmpReg) 316 .add(MI.getOperand(I)); 317 318 if (IsAGPR) { 319 const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC); 320 Register TmpAReg = MRI.createVirtualRegister(NewSrcRC); 321 unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ? 322 AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY; 323 BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc), 324 TmpAReg) 325 .addReg(TmpReg, RegState::Kill); 326 TmpReg = TmpAReg; 327 } 328 329 MI.getOperand(I).setReg(TmpReg); 330 } 331 332 CopyUse.eraseFromParent(); 333 return true; 334 } 335 336 static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, 337 const MachineInstr *MoveImm, 338 const SIInstrInfo *TII, 339 unsigned &SMovOp, 340 int64_t &Imm) { 341 if (Copy->getOpcode() != AMDGPU::COPY) 342 return false; 343 344 if (!MoveImm->isMoveImmediate()) 345 return false; 346 347 const MachineOperand *ImmOp = 348 TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0); 349 if (!ImmOp->isImm()) 350 return false; 351 352 // FIXME: Handle copies with sub-regs. 
353 if (Copy->getOperand(0).getSubReg()) 354 return false; 355 356 switch (MoveImm->getOpcode()) { 357 default: 358 return false; 359 case AMDGPU::V_MOV_B32_e32: 360 SMovOp = AMDGPU::S_MOV_B32; 361 break; 362 case AMDGPU::V_MOV_B64_PSEUDO: 363 SMovOp = AMDGPU::S_MOV_B64; 364 break; 365 } 366 Imm = ImmOp->getImm(); 367 return true; 368 } 369 370 template <class UnaryPredicate> 371 bool searchPredecessors(const MachineBasicBlock *MBB, 372 const MachineBasicBlock *CutOff, 373 UnaryPredicate Predicate) { 374 if (MBB == CutOff) 375 return false; 376 377 DenseSet<const MachineBasicBlock *> Visited; 378 SmallVector<MachineBasicBlock *, 4> Worklist(MBB->predecessors()); 379 380 while (!Worklist.empty()) { 381 MachineBasicBlock *MBB = Worklist.pop_back_val(); 382 383 if (!Visited.insert(MBB).second) 384 continue; 385 if (MBB == CutOff) 386 continue; 387 if (Predicate(MBB)) 388 return true; 389 390 Worklist.append(MBB->pred_begin(), MBB->pred_end()); 391 } 392 393 return false; 394 } 395 396 // Checks if there is potential path From instruction To instruction. 397 // If CutOff is specified and it sits in between of that path we ignore 398 // a higher portion of the path and report it is not reachable. 399 static bool isReachable(const MachineInstr *From, 400 const MachineInstr *To, 401 const MachineBasicBlock *CutOff, 402 MachineDominatorTree &MDT) { 403 if (MDT.dominates(From, To)) 404 return true; 405 406 const MachineBasicBlock *MBBFrom = From->getParent(); 407 const MachineBasicBlock *MBBTo = To->getParent(); 408 409 // Do predecessor search. 410 // We should almost never get here since we do not usually produce M0 stores 411 // other than -1. 412 return searchPredecessors(MBBTo, CutOff, [MBBFrom] 413 (const MachineBasicBlock *MBB) { return MBB == MBBFrom; }); 414 } 415 416 // Return the first non-prologue instruction in the block. 
417 static MachineBasicBlock::iterator 418 getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { 419 MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); 420 while (I != MBB->end() && TII->isBasicBlockPrologue(*I)) 421 ++I; 422 423 return I; 424 } 425 426 // Hoist and merge identical SGPR initializations into a common predecessor. 427 // This is intended to combine M0 initializations, but can work with any 428 // SGPR. A VGPR cannot be processed since we cannot guarantee vector 429 // executioon. 430 static bool hoistAndMergeSGPRInits(unsigned Reg, 431 const MachineRegisterInfo &MRI, 432 const TargetRegisterInfo *TRI, 433 MachineDominatorTree &MDT, 434 const TargetInstrInfo *TII) { 435 // List of inits by immediate value. 436 using InitListMap = std::map<unsigned, std::list<MachineInstr *>>; 437 InitListMap Inits; 438 // List of clobbering instructions. 439 SmallVector<MachineInstr*, 8> Clobbers; 440 // List of instructions marked for deletion. 441 SmallSet<MachineInstr*, 8> MergedInstrs; 442 443 bool Changed = false; 444 445 for (auto &MI : MRI.def_instructions(Reg)) { 446 MachineOperand *Imm = nullptr; 447 for (auto &MO : MI.operands()) { 448 if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || 449 (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { 450 Imm = nullptr; 451 break; 452 } else if (MO.isImm()) 453 Imm = &MO; 454 } 455 if (Imm) 456 Inits[Imm->getImm()].push_front(&MI); 457 else 458 Clobbers.push_back(&MI); 459 } 460 461 for (auto &Init : Inits) { 462 auto &Defs = Init.second; 463 464 for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) { 465 MachineInstr *MI1 = *I1; 466 467 for (auto I2 = std::next(I1); I2 != E; ) { 468 MachineInstr *MI2 = *I2; 469 470 // Check any possible interference 471 auto interferes = [&](MachineBasicBlock::iterator From, 472 MachineBasicBlock::iterator To) -> bool { 473 474 assert(MDT.dominates(&*To, &*From)); 475 476 auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> 
bool { 477 const MachineBasicBlock *MBBFrom = From->getParent(); 478 const MachineBasicBlock *MBBTo = To->getParent(); 479 bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT); 480 bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT); 481 if (!MayClobberFrom && !MayClobberTo) 482 return false; 483 if ((MayClobberFrom && !MayClobberTo) || 484 (!MayClobberFrom && MayClobberTo)) 485 return true; 486 // Both can clobber, this is not an interference only if both are 487 // dominated by Clobber and belong to the same block or if Clobber 488 // properly dominates To, given that To >> From, so it dominates 489 // both and located in a common dominator. 490 return !((MBBFrom == MBBTo && 491 MDT.dominates(Clobber, &*From) && 492 MDT.dominates(Clobber, &*To)) || 493 MDT.properlyDominates(Clobber->getParent(), MBBTo)); 494 }; 495 496 return (llvm::any_of(Clobbers, interferes)) || 497 (llvm::any_of(Inits, [&](InitListMap::value_type &C) { 498 return C.first != Init.first && 499 llvm::any_of(C.second, interferes); 500 })); 501 }; 502 503 if (MDT.dominates(MI1, MI2)) { 504 if (!interferes(MI2, MI1)) { 505 LLVM_DEBUG(dbgs() 506 << "Erasing from " 507 << printMBBReference(*MI2->getParent()) << " " << *MI2); 508 MergedInstrs.insert(MI2); 509 Changed = true; 510 ++I2; 511 continue; 512 } 513 } else if (MDT.dominates(MI2, MI1)) { 514 if (!interferes(MI1, MI2)) { 515 LLVM_DEBUG(dbgs() 516 << "Erasing from " 517 << printMBBReference(*MI1->getParent()) << " " << *MI1); 518 MergedInstrs.insert(MI1); 519 Changed = true; 520 ++I1; 521 break; 522 } 523 } else { 524 auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(), 525 MI2->getParent()); 526 if (!MBB) { 527 ++I2; 528 continue; 529 } 530 531 MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII); 532 if (!interferes(MI1, I) && !interferes(MI2, I)) { 533 LLVM_DEBUG(dbgs() 534 << "Erasing from " 535 << printMBBReference(*MI1->getParent()) << " " << *MI1 536 << "and moving from " 537 << 
printMBBReference(*MI2->getParent()) << " to " 538 << printMBBReference(*I->getParent()) << " " << *MI2); 539 I->getParent()->splice(I, MI2->getParent(), MI2); 540 MergedInstrs.insert(MI1); 541 Changed = true; 542 ++I1; 543 break; 544 } 545 } 546 ++I2; 547 } 548 ++I1; 549 } 550 } 551 552 // Remove initializations that were merged into another. 553 for (auto &Init : Inits) { 554 auto &Defs = Init.second; 555 auto I = Defs.begin(); 556 while (I != Defs.end()) { 557 if (MergedInstrs.count(*I)) { 558 (*I)->eraseFromParent(); 559 I = Defs.erase(I); 560 } else 561 ++I; 562 } 563 } 564 565 // Try to schedule SGPR initializations as early as possible in the MBB. 566 for (auto &Init : Inits) { 567 auto &Defs = Init.second; 568 for (auto *MI : Defs) { 569 auto MBB = MI->getParent(); 570 MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII); 571 MachineBasicBlock::reverse_iterator B(BoundaryMI); 572 // Check if B should actually be a boundary. If not set the previous 573 // instruction as the boundary instead. 574 if (!TII->isBasicBlockPrologue(*B)) 575 B++; 576 577 auto R = std::next(MI->getReverseIterator()); 578 const unsigned Threshold = 50; 579 // Search until B or Threshold for a place to insert the initialization. 580 for (unsigned I = 0; R != B && I < Threshold; ++R, ++I) 581 if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) || 582 TII->isSchedulingBoundary(*R, MBB, *MBB->getParent())) 583 break; 584 585 // Move to directly after R. 586 if (&*--R != MI) 587 MBB->splice(*R, MBB, MI); 588 } 589 } 590 591 if (Changed) 592 MRI.clearKillFlags(Reg); 593 594 return Changed; 595 } 596 597 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { 598 // Only need to run this in SelectionDAG path. 
599 if (MF.getProperties().hasProperty( 600 MachineFunctionProperties::Property::Selected)) 601 return false; 602 603 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 604 MRI = &MF.getRegInfo(); 605 TRI = ST.getRegisterInfo(); 606 TII = ST.getInstrInfo(); 607 MDT = &getAnalysis<MachineDominatorTree>(); 608 609 610 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); 611 BI != BE; ++BI) { 612 MachineBasicBlock *MBB = &*BI; 613 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; 614 ++I) { 615 MachineInstr &MI = *I; 616 617 switch (MI.getOpcode()) { 618 default: 619 continue; 620 case AMDGPU::COPY: 621 case AMDGPU::WQM: 622 case AMDGPU::STRICT_WQM: 623 case AMDGPU::SOFT_WQM: 624 case AMDGPU::STRICT_WWM: { 625 const TargetRegisterClass *SrcRC, *DstRC; 626 std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI); 627 628 if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) { 629 // Since VGPR to SGPR copies affect VGPR to SGPR copy 630 // score and, hence the lowering decision, let's try to get rid of 631 // them as early as possible 632 if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII)) 633 continue; 634 635 // Collect those not changed to try them after VGPR to SGPR copies 636 // lowering as there will be more opportunities. 
637 S2VCopies.push_back(&MI); 638 } 639 if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) 640 continue; 641 if (lowerSpecialCase(MI, I)) 642 continue; 643 644 analyzeVGPRToSGPRCopy(&MI); 645 646 break; 647 } 648 case AMDGPU::INSERT_SUBREG: 649 case AMDGPU::PHI: 650 case AMDGPU::REG_SEQUENCE: { 651 if (TRI->isSGPRClass(TII->getOpRegClass(MI, 0))) { 652 for (MachineOperand &MO : MI.operands()) { 653 if (!MO.isReg() || !MO.getReg().isVirtual()) 654 continue; 655 const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg()); 656 if (TRI->hasVectorRegisters(SrcRC)) { 657 const TargetRegisterClass *DestRC = 658 TRI->getEquivalentSGPRClass(SrcRC); 659 Register NewDst = MRI->createVirtualRegister(DestRC); 660 MachineBasicBlock *BlockToInsertCopy = 661 MI.isPHI() ? MI.getOperand(MI.getOperandNo(&MO) + 1).getMBB() 662 : MBB; 663 MachineBasicBlock::iterator PointToInsertCopy = 664 MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I; 665 MachineInstr *NewCopy = 666 BuildMI(*BlockToInsertCopy, PointToInsertCopy, 667 PointToInsertCopy->getDebugLoc(), 668 TII->get(AMDGPU::COPY), NewDst) 669 .addReg(MO.getReg()); 670 MO.setReg(NewDst); 671 analyzeVGPRToSGPRCopy(NewCopy); 672 } 673 } 674 } 675 676 if (MI.isPHI()) 677 PHINodes.push_back(&MI); 678 else if (MI.isRegSequence()) 679 RegSequences.push_back(&MI); 680 681 break; 682 } 683 case AMDGPU::V_WRITELANE_B32: { 684 // Some architectures allow more than one constant bus access without 685 // SGPR restriction 686 if (ST.getConstantBusLimit(MI.getOpcode()) != 1) 687 break; 688 689 // Writelane is special in that it can use SGPR and M0 (which would 690 // normally count as using the constant bus twice - but in this case it 691 // is allowed since the lane selector doesn't count as a use of the 692 // constant bus). However, it is still required to abide by the 1 SGPR 693 // rule. 
Apply a fix here as we might have multiple SGPRs after 694 // legalizing VGPRs to SGPRs 695 int Src0Idx = 696 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); 697 int Src1Idx = 698 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); 699 MachineOperand &Src0 = MI.getOperand(Src0Idx); 700 MachineOperand &Src1 = MI.getOperand(Src1Idx); 701 702 // Check to see if the instruction violates the 1 SGPR rule 703 if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) && 704 Src0.getReg() != AMDGPU::M0) && 705 (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) && 706 Src1.getReg() != AMDGPU::M0)) { 707 708 // Check for trivially easy constant prop into one of the operands 709 // If this is the case then perform the operation now to resolve SGPR 710 // issue. If we don't do that here we will always insert a mov to m0 711 // that can't be resolved in later operand folding pass 712 bool Resolved = false; 713 for (MachineOperand *MO : {&Src0, &Src1}) { 714 if (MO->getReg().isVirtual()) { 715 MachineInstr *DefMI = MRI->getVRegDef(MO->getReg()); 716 if (DefMI && TII->isFoldableCopy(*DefMI)) { 717 const MachineOperand &Def = DefMI->getOperand(0); 718 if (Def.isReg() && 719 MO->getReg() == Def.getReg() && 720 MO->getSubReg() == Def.getSubReg()) { 721 const MachineOperand &Copied = DefMI->getOperand(1); 722 if (Copied.isImm() && 723 TII->isInlineConstant(APInt(64, Copied.getImm(), true))) { 724 MO->ChangeToImmediate(Copied.getImm()); 725 Resolved = true; 726 break; 727 } 728 } 729 } 730 } 731 } 732 733 if (!Resolved) { 734 // Haven't managed to resolve by replacing an SGPR with an immediate 735 // Move src1 to be in M0 736 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), 737 TII->get(AMDGPU::COPY), AMDGPU::M0) 738 .add(Src1); 739 Src1.ChangeToRegister(AMDGPU::M0, false); 740 } 741 } 742 break; 743 } 744 } 745 } 746 } 747 748 lowerVGPR2SGPRCopies(MF); 749 // Postprocessing 750 fixSCCCopies(MF); 751 for (auto MI : S2VCopies) { 752 // Check if it is still 
valid 753 if (MI->isCopy()) { 754 const TargetRegisterClass *SrcRC, *DstRC; 755 std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI); 756 if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) 757 tryChangeVGPRtoSGPRinCopy(*MI, TRI, TII); 758 } 759 } 760 for (auto MI : RegSequences) { 761 // Check if it is still valid 762 if (MI->isRegSequence()) 763 foldVGPRCopyIntoRegSequence(*MI, TRI, TII, *MRI); 764 } 765 for (auto MI : PHINodes) { 766 processPHINode(*MI); 767 } 768 if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) 769 hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII); 770 771 SiblingPenalty.clear(); 772 V2SCopies.clear(); 773 SCCCopies.clear(); 774 RegSequences.clear(); 775 PHINodes.clear(); 776 S2VCopies.clear(); 777 778 return true; 779 } 780 781 void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { 782 bool AllAGPRUses = true; 783 SetVector<const MachineInstr *> worklist; 784 SmallSet<const MachineInstr *, 4> Visited; 785 SetVector<MachineInstr *> PHIOperands; 786 worklist.insert(&MI); 787 Visited.insert(&MI); 788 // HACK to make MIR tests with no uses happy 789 bool HasUses = false; 790 while (!worklist.empty()) { 791 const MachineInstr *Instr = worklist.pop_back_val(); 792 Register Reg = Instr->getOperand(0).getReg(); 793 for (const auto &Use : MRI->use_operands(Reg)) { 794 HasUses = true; 795 const MachineInstr *UseMI = Use.getParent(); 796 AllAGPRUses &= (UseMI->isCopy() && 797 TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) || 798 TRI->isAGPR(*MRI, Use.getReg()); 799 if (UseMI->isCopy() || UseMI->isRegSequence()) { 800 if (Visited.insert(UseMI).second) 801 worklist.insert(UseMI); 802 803 continue; 804 } 805 } 806 } 807 808 Register PHIRes = MI.getOperand(0).getReg(); 809 const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes); 810 if (HasUses && AllAGPRUses && !TRI->isAGPRClass(RC0)) { 811 LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI); 812 MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0)); 813 for (unsigned I = 
1, N = MI.getNumOperands(); I != N; I += 2) { 814 MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg()); 815 if (DefMI && DefMI->isPHI()) 816 PHIOperands.insert(DefMI); 817 } 818 } 819 820 if (TRI->isVectorRegister(*MRI, PHIRes) || 821 RC0 == &AMDGPU::VReg_1RegClass) { 822 LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); 823 TII->legalizeOperands(MI, MDT); 824 } 825 826 // Propagate register class back to PHI operands which are PHI themselves. 827 while (!PHIOperands.empty()) { 828 processPHINode(*PHIOperands.pop_back_val()); 829 } 830 } 831 832 bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, 833 MachineBasicBlock::iterator &I) { 834 Register DstReg = MI.getOperand(0).getReg(); 835 Register SrcReg = MI.getOperand(1).getReg(); 836 if (!DstReg.isVirtual()) { 837 // If the destination register is a physical register there isn't 838 // really much we can do to fix this. 839 // Some special instructions use M0 as an input. Some even only use 840 // the first lane. Insert a readfirstlane and hope for the best. 841 if (DstReg == AMDGPU::M0 && 842 TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) { 843 Register TmpReg = 844 MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 845 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), 846 TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) 847 .add(MI.getOperand(1)); 848 MI.getOperand(1).setReg(TmpReg); 849 } else { 850 MachineInstr *DefMI = MRI->getVRegDef(SrcReg); 851 if (DefMI && DefMI->isMoveImmediate()) { 852 MachineOperand SrcConst = DefMI->getOperand(AMDGPU::getNamedOperandIdx( 853 DefMI->getOpcode(), AMDGPU::OpName::src0)); 854 if (!SrcConst.isReg()) { 855 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg); 856 unsigned MoveSize = TRI->getRegSizeInBits(*SrcRC); 857 unsigned MoveOp = 858 MoveSize == 64 ? 
AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; 859 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(MoveOp), 860 DstReg) 861 .add(SrcConst); 862 I = std::next(I); 863 if (MRI->hasOneUse(SrcReg)) 864 DefMI->eraseFromParent(); 865 MI.eraseFromParent(); 866 } 867 } 868 } 869 return true; 870 } 871 if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) { 872 TII->moveToVALU(MI, MDT); 873 return true; 874 } 875 876 unsigned SMovOp; 877 int64_t Imm; 878 // If we are just copying an immediate, we can replace the copy with 879 // s_mov_b32. 880 if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) { 881 MI.getOperand(1).ChangeToImmediate(Imm); 882 MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); 883 MI.setDesc(TII->get(SMovOp)); 884 return true; 885 } 886 return false; 887 } 888 889 void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { 890 Register DstReg = MI->getOperand(0).getReg(); 891 const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); 892 893 V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI, 894 TRI->getRegSizeInBits(*DstRC)); 895 SmallVector<MachineInstr *, 8> AnalysisWorklist; 896 // Needed because the SSA is not a tree but a graph and may have 897 // forks and joins. We should not then go same way twice. 898 DenseSet<MachineInstr *> Visited; 899 AnalysisWorklist.push_back(Info.Copy); 900 while (!AnalysisWorklist.empty()) { 901 902 MachineInstr *Inst = AnalysisWorklist.pop_back_val(); 903 904 if (!Visited.insert(Inst).second) 905 continue; 906 907 // Copies and REG_SEQUENCE do not contribute to the final assembly 908 // So, skip them but take care of the SGPR to VGPR copies bookkeeping. 
909 if (Inst->isCopy() || Inst->isRegSequence()) { 910 if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) { 911 if (!Inst->isCopy() || 912 !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) { 913 Info.NumSVCopies++; 914 continue; 915 } 916 } 917 } 918 919 SiblingPenalty[Inst].insert(Info.ID); 920 921 SmallVector<MachineInstr *, 4> Users; 922 if ((TII->isSALU(*Inst) && Inst->isCompare()) || 923 (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) { 924 auto I = Inst->getIterator(); 925 auto E = Inst->getParent()->end(); 926 while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) { 927 if (I->readsRegister(AMDGPU::SCC)) 928 Users.push_back(&*I); 929 } 930 } else if (Inst->getNumExplicitDefs() != 0) { 931 Register Reg = Inst->getOperand(0).getReg(); 932 if (TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst)) 933 for (auto &U : MRI->use_instructions(Reg)) 934 Users.push_back(&U); 935 } 936 for (auto U : Users) { 937 if (TII->isSALU(*U)) 938 Info.SChain.insert(U); 939 AnalysisWorklist.push_back(U); 940 } 941 } 942 V2SCopies[Info.ID] = Info; 943 } 944 945 // The main function that computes the VGPR to SGPR copy score 946 // and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU 947 bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) { 948 if (Info->SChain.empty()) { 949 Info->Score = 0; 950 return true; 951 } 952 Info->Siblings = SiblingPenalty[*std::max_element( 953 Info->SChain.begin(), Info->SChain.end(), 954 [&](MachineInstr *A, MachineInstr *B) -> bool { 955 return SiblingPenalty[A].size() < SiblingPenalty[B].size(); 956 })]; 957 Info->Siblings.remove_if([&](unsigned ID) { return ID == Info->ID; }); 958 // The loop below computes the number of another VGPR to SGPR V2SCopies 959 // which contribute to the current copy SALU chain. We assume that all the 960 // V2SCopies with the same source virtual register will be squashed to one 961 // by regalloc. 
Also we take care of the V2SCopies of the differnt subregs 962 // of the same register. 963 SmallSet<std::pair<Register, unsigned>, 4> SrcRegs; 964 for (auto J : Info->Siblings) { 965 auto InfoIt = V2SCopies.find(J); 966 if (InfoIt != V2SCopies.end()) { 967 MachineInstr *SiblingCopy = InfoIt->getSecond().Copy; 968 if (SiblingCopy->isImplicitDef()) 969 // the COPY has already been MoveToVALUed 970 continue; 971 972 SrcRegs.insert(std::pair(SiblingCopy->getOperand(1).getReg(), 973 SiblingCopy->getOperand(1).getSubReg())); 974 } 975 } 976 Info->SiblingPenalty = SrcRegs.size(); 977 978 unsigned Penalty = 979 Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes; 980 unsigned Profit = Info->SChain.size(); 981 Info->Score = Penalty > Profit ? 0 : Profit - Penalty; 982 Info->NeedToBeConvertedToVALU = Info->Score < 3; 983 return Info->NeedToBeConvertedToVALU; 984 } 985 986 void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { 987 988 SmallVector<unsigned, 8> LoweringWorklist; 989 for (auto &C : V2SCopies) { 990 if (needToBeConvertedToVALU(&C.second)) 991 LoweringWorklist.push_back(C.second.ID); 992 } 993 994 while (!LoweringWorklist.empty()) { 995 unsigned CurID = LoweringWorklist.pop_back_val(); 996 auto CurInfoIt = V2SCopies.find(CurID); 997 if (CurInfoIt != V2SCopies.end()) { 998 V2SCopyInfo C = CurInfoIt->getSecond(); 999 LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump()); 1000 for (auto S : C.Siblings) { 1001 auto SibInfoIt = V2SCopies.find(S); 1002 if (SibInfoIt != V2SCopies.end()) { 1003 V2SCopyInfo &SI = SibInfoIt->getSecond(); 1004 LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump()); 1005 if (!SI.NeedToBeConvertedToVALU) { 1006 set_subtract(SI.SChain, C.SChain); 1007 if (needToBeConvertedToVALU(&SI)) 1008 LoweringWorklist.push_back(SI.ID); 1009 } 1010 SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; }); 1011 } 1012 } 1013 LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy 1014 << " is being turned to VALU\n"); 1015 V2SCopies.erase(C.ID); 
1016 TII->moveToVALU(*C.Copy, MDT); 1017 } 1018 } 1019 1020 // Now do actual lowering 1021 for (auto C : V2SCopies) { 1022 MachineInstr *MI = C.second.Copy; 1023 MachineBasicBlock *MBB = MI->getParent(); 1024 // We decide to turn V2S copy to v_readfirstlane_b32 1025 // remove it from the V2SCopies and remove it from all its siblings 1026 LLVM_DEBUG(dbgs() << "V2S copy " << *MI 1027 << " is being turned to v_readfirstlane_b32" 1028 << " Score: " << C.second.Score << "\n"); 1029 Register DstReg = MI->getOperand(0).getReg(); 1030 Register SrcReg = MI->getOperand(1).getReg(); 1031 unsigned SubReg = MI->getOperand(1).getSubReg(); 1032 const TargetRegisterClass *SrcRC = 1033 TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1)); 1034 size_t SrcSize = TRI->getRegSizeInBits(*SrcRC); 1035 if (SrcSize == 16) { 1036 // HACK to handle possible 16bit VGPR source 1037 auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 1038 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); 1039 MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister); 1040 } else if (SrcSize == 32) { 1041 auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 1042 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); 1043 MIB.addReg(SrcReg, 0, SubReg); 1044 } else { 1045 auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(), 1046 TII->get(AMDGPU::REG_SEQUENCE), DstReg); 1047 int N = TRI->getRegSizeInBits(*SrcRC) / 32; 1048 for (int i = 0; i < N; i++) { 1049 Register PartialSrc = TII->buildExtractSubReg( 1050 Result, *MRI, MI->getOperand(1), SrcRC, 1051 TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass); 1052 Register PartialDst = 1053 MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1054 BuildMI(*MBB, *Result, Result->getDebugLoc(), 1055 TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst) 1056 .addReg(PartialSrc); 1057 Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i)); 1058 } 1059 } 1060 MI->eraseFromParent(); 1061 } 1062 } 1063 1064 void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { 1065 bool IsWave32 = 
MF.getSubtarget<GCNSubtarget>().isWave32(); 1066 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; 1067 ++BI) { 1068 MachineBasicBlock *MBB = &*BI; 1069 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; 1070 ++I) { 1071 MachineInstr &MI = *I; 1072 // May already have been lowered. 1073 if (!MI.isCopy()) 1074 continue; 1075 Register SrcReg = MI.getOperand(1).getReg(); 1076 Register DstReg = MI.getOperand(0).getReg(); 1077 if (SrcReg == AMDGPU::SCC) { 1078 Register SCCCopy = MRI->createVirtualRegister( 1079 TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); 1080 I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)), 1081 MI.getDebugLoc(), 1082 TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32 1083 : AMDGPU::S_CSELECT_B64), 1084 SCCCopy) 1085 .addImm(-1) 1086 .addImm(0); 1087 I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(), 1088 TII->get(AMDGPU::COPY), DstReg) 1089 .addReg(SCCCopy); 1090 MI.eraseFromParent(); 1091 continue; 1092 } 1093 if (DstReg == AMDGPU::SCC) { 1094 unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; 1095 Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 1096 Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC()); 1097 I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)), 1098 MI.getDebugLoc(), TII->get(Opcode)) 1099 .addReg(Tmp, getDefRegState(true)) 1100 .addReg(SrcReg) 1101 .addReg(Exec); 1102 MI.eraseFromParent(); 1103 } 1104 } 1105 } 1106 } 1107