//===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Copies from VGPR to SGPR registers are illegal and the register coalescer
/// will sometimes generate these illegal copies in situations like this:
///
/// Register Class <vsrc> is the union of <vgpr> and <sgpr>
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   %1 <vsrc> = COPY %0 <sgpr>
///   ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <vsrc>
///
/// The coalescer will begin at BB0 and eliminate its copy, after which the
/// code will look like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is forced to constrain the register class of %3 to <sgpr>, so we
/// end up with final code like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <sgpr> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now this code contains an illegal copy from a VGPR to an SGPR.
///
/// In order to avoid this problem, this pass searches for PHI instructions
/// which define a <vsrc> register and constrains their definition class to
/// <vgpr> if a user of the PHI's definition register is a vector instruction.
/// If the PHI's definition class is constrained to <vgpr>, then the coalescer
/// will be unable to perform the COPY removal from the above example, which
/// ultimately led to the creation of an illegal COPY.
//===----------------------------------------------------------------------===//

#include "SIFixSGPRCopies.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-fix-sgpr-copies"

static cl::opt<bool> EnableM0Merge(
    "amdgpu-enable-merge-m0",
    cl::desc("Merge and hoist M0 initializations"),
    cl::init(true));

namespace {

class V2SCopyInfo {
public:
  // VGPR to SGPR copy being processed.
  MachineInstr *Copy;
  // All SALU instructions reachable from this copy in the SSA graph.
  SetVector<MachineInstr *> SChain;
  // Number of SGPR to VGPR copies that are used to put the SALU computation
  // results back to the VALU.
  unsigned NumSVCopies = 0;

  unsigned Score = 0;
  // Actual count of v_readfirstlane_b32
  // which need to be inserted to keep SChain SALU.
  unsigned NumReadfirstlanes = 0;
  // Current score state, cached to speed up the selection of V2SCopyInfos
  // for processing.
  bool NeedToBeConvertedToVALU = false;
  // Unique ID. Used as a key for mapping to keep a permanent order.
  unsigned ID;

  // Count of other VGPR to SGPR copies that contribute to the
  // current copy's SChain.
  unsigned SiblingPenalty = 0;
  SetVector<unsigned> Siblings;
  V2SCopyInfo() : Copy(nullptr), ID(0){};
  V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
      : Copy(C), NumReadfirstlanes(Width / 32), ID(Id){};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump() {
    dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
           << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
           << "\nScore: " << Score << "\n";
  }
#endif
};

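// In short, the cost model implemented in needToBeConvertedToVALU() below
// weighs the SALU chain a copy feeds against the extra copies and
// readfirstlanes needed to keep the computation scalar:
//
//   Penalty = NumSVCopies + SiblingPenalty + NumReadfirstlanes
//   Score   = (Penalty > |SChain|) ? 0 : |SChain| - Penalty
//   convert the copy to VALU if Score < 3
//
// For instance (made-up numbers for illustration), a 32-bit copy feeding 10
// SALU instructions with 2 copies back to the VALU and no siblings scores
// 10 - (2 + 0 + 1) = 7 and stays on the SALU.
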
class SIFixSGPRCopies {
  MachineDominatorTree *MDT;
  SmallVector<MachineInstr*, 4> SCCCopies;
  SmallVector<MachineInstr*, 4> RegSequences;
  SmallVector<MachineInstr*, 4> PHINodes;
  SmallVector<MachineInstr*, 4> S2VCopies;
  unsigned NextVGPRToSGPRCopyID = 0;
  MapVector<unsigned, V2SCopyInfo> V2SCopies;
  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
  DenseSet<MachineInstr *> PHISources;

public:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  SIFixSGPRCopies(MachineDominatorTree *MDT) : MDT(MDT) {}

  bool run(MachineFunction &MF);
  void fixSCCCopies(MachineFunction &MF);
  void prepareRegSequenceAndPHIs(MachineFunction &MF);
  unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
  bool needToBeConvertedToVALU(V2SCopyInfo *I);
  void analyzeVGPRToSGPRCopy(MachineInstr *MI);
  void lowerVGPR2SGPRCopies(MachineFunction &MF);
  // Handles copies whose source register is:
  // 1. A physical register
  // 2. An AGPR
  // 3. Defined by an instruction that merely moves an immediate
  bool lowerSpecialCase(MachineInstr &MI, MachineBasicBlock::iterator &I);

  void processPHINode(MachineInstr &MI);

  // Check if MO is an immediate materialized into a VGPR, and if so replace it
  // with an SGPR immediate. The VGPR immediate is also deleted if it does not
  // have any other uses.
  bool tryMoveVGPRConstToSGPR(MachineOperand &MO, Register NewDst,
                              MachineBasicBlock *BlockToInsertTo,
                              MachineBasicBlock::iterator PointToInsertTo,
                              const DebugLoc &DL);
};

class SIFixSGPRCopiesLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIFixSGPRCopiesLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    MachineDominatorTree *MDT =
        &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
    SIFixSGPRCopies Impl(MDT);
    return Impl.run(MF);
  }

  StringRef getPassName() const override { return "SI Fix SGPR copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies",
                    false, false)

char SIFixSGPRCopiesLegacy::ID = 0;

char &llvm::SIFixSGPRCopiesLegacyID = SIFixSGPRCopiesLegacy::ID;

FunctionPass *llvm::createSIFixSGPRCopiesLegacyPass() {
  return new SIFixSGPRCopiesLegacy();
}

static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
                  const SIRegisterInfo &TRI,
                  const MachineRegisterInfo &MRI) {
  Register DstReg = Copy.getOperand(0).getReg();
  Register SrcReg = Copy.getOperand(1).getReg();

  const TargetRegisterClass *SrcRC = SrcReg.isVirtual()
                                         ? MRI.getRegClass(SrcReg)
                                         : TRI.getPhysRegBaseClass(SrcReg);

  // We don't really care about the subregister here.
  // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());

  const TargetRegisterClass *DstRC = DstReg.isVirtual()
                                         ? MRI.getRegClass(DstReg)
                                         : TRI.getPhysRegBaseClass(DstReg);

  return std::pair(SrcRC, DstRC);
}

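// The two predicates below classify a COPY from its source and destination
// register classes. A hypothetical MIR sketch (not taken from any test):
//
//   %1:sreg_32 = COPY %0:vgpr_32   ; isVGPRToSGPRCopy -> true
//   %3:vgpr_32 = COPY %2:sreg_32   ; isSGPRToVGPRCopy -> true
//
// VReg_1 is excluded: it models per-lane boolean values, which are handled
// separately rather than as ordinary data movement.
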
static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
         TRI.hasVectorRegisters(SrcRC);
}

static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
         TRI.hasVectorRegisters(DstRC);
}

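// Attempt to turn an SGPR->VGPR copy into an SGPR->SGPR copy by giving the
// copy's destination an SGPR register class. This only succeeds when every
// non-debug user of the destination is a non-generic instruction in the same
// block that can legally accept an SGPR in the corresponding operand slot;
// the now-redundant SGPR->SGPR copy can then be coalesced away later.
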
static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
                                      const SIRegisterInfo *TRI,
                                      const SIInstrInfo *TII) {
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  auto &Src = MI.getOperand(1);
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = Src.getReg();
  if (!SrcReg.isVirtual() || !DstReg.isVirtual())
    return false;

  for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
    const auto *UseMI = MO.getParent();
    if (UseMI == &MI)
      continue;
    if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
        UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
      return false;

    unsigned OpIdx = MO.getOperandNo();
    if (OpIdx >= UseMI->getDesc().getNumOperands() ||
        !TII->isOperandLegal(*UseMI, OpIdx, &Src))
      return false;
  }
  // Change VGPR to SGPR destination.
  MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
  return true;
}

// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
//  SGPRx = ...
//  SGPRy = REG_SEQUENCE SGPRx, sub0 ...
//  VGPRz = COPY SGPRy
//
// ==>
//
//  VGPRx = COPY SGPRx
//  VGPRz = REG_SEQUENCE VGPRx, sub0
//
// This exposes immediate folding opportunities when materializing 64-bit
// immediates.
static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
                                        const SIRegisterInfo *TRI,
                                        const SIInstrInfo *TII,
                                        MachineRegisterInfo &MRI) {
  assert(MI.isRegSequence());

  Register DstReg = MI.getOperand(0).getReg();
  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
    return false;

  if (!MRI.hasOneUse(DstReg))
    return false;

  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
  if (!CopyUse.isCopy())
    return false;

  // It is illegal to have vreg inputs to a physreg defining reg_sequence.
  if (CopyUse.getOperand(0).getReg().isPhysical())
    return false;

  const TargetRegisterClass *SrcRC, *DstRC;
  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);

  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
    return false;

  if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
    return true;

  // TODO: Could have multiple extracts?
  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
  if (SubReg != AMDGPU::NoSubRegister)
    return false;

  MRI.setRegClass(DstReg, DstRC);

  // SGPRx = ...
  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
  // VGPRz = COPY SGPRy

  // =>

  // VGPRx = COPY SGPRx
  // VGPRz = REG_SEQUENCE VGPRx, sub0

  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
  bool IsAGPR = TRI->isAGPRClass(DstRC);

  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
    const TargetRegisterClass *SrcRC =
        TRI->getRegClassForOperandReg(MRI, MI.getOperand(I));
    assert(TRI->isSGPRClass(SrcRC) &&
           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);

    Register TmpReg = MRI.createVirtualRegister(NewSrcRC);

    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            TmpReg)
        .add(MI.getOperand(I));

    if (IsAGPR) {
      const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
      Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
        AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY;
      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
              TmpAReg)
          .addReg(TmpReg, RegState::Kill);
      TmpReg = TmpAReg;
    }

    MI.getOperand(I).setReg(TmpReg);
  }

  CopyUse.eraseFromParent();
  return true;
}

static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                    const MachineInstr *MoveImm,
                                    const SIInstrInfo *TII,
                                    unsigned &SMovOp,
                                    int64_t &Imm) {
  if (Copy->getOpcode() != AMDGPU::COPY)
    return false;

  if (!MoveImm->isMoveImmediate())
    return false;

  const MachineOperand *ImmOp =
      TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
  if (!ImmOp->isImm())
    return false;

  // FIXME: Handle copies with sub-regs.
  if (Copy->getOperand(1).getSubReg())
    return false;

  switch (MoveImm->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_MOV_B32_e32:
    SMovOp = AMDGPU::S_MOV_B32;
    break;
  case AMDGPU::V_MOV_B64_PSEUDO:
    SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO;
    break;
  }
  Imm = ImmOp->getImm();
  return true;
}

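// For example (illustrative MIR), a foldable immediate copy
//
//   %0:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
//   %1:sreg_32 = COPY %0
//
// is rewritten by lowerSpecialCase into
//
//   %1:sreg_32 = S_MOV_B32 42
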
template <class UnaryPredicate>
bool searchPredecessors(const MachineBasicBlock *MBB,
                        const MachineBasicBlock *CutOff,
                        UnaryPredicate Predicate) {
  if (MBB == CutOff)
    return false;

  DenseSet<const MachineBasicBlock *> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(MBB->predecessors());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (!Visited.insert(MBB).second)
      continue;
    if (MBB == CutOff)
      continue;
    if (Predicate(MBB))
      return true;

    Worklist.append(MBB->pred_begin(), MBB->pred_end());
  }

  return false;
}

// Checks if there is a potential path from the From instruction to the To
// instruction. If CutOff is specified and sits on that path, we ignore the
// portion of the path above it and report the target as not reachable.
static bool isReachable(const MachineInstr *From,
                        const MachineInstr *To,
                        const MachineBasicBlock *CutOff,
                        MachineDominatorTree &MDT) {
  if (MDT.dominates(From, To))
    return true;

  const MachineBasicBlock *MBBFrom = From->getParent();
  const MachineBasicBlock *MBBTo = To->getParent();

  // Do predecessor search.
  // We should almost never get here since we do not usually produce M0 stores
  // other than -1.
  return searchPredecessors(MBBTo, CutOff,
                            [MBBFrom](const MachineBasicBlock *MBB) {
                              return MBB == MBBFrom;
                            });
}

// Return the first non-prologue instruction in the block.
static MachineBasicBlock::iterator
getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
  MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
  while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
    ++I;

  return I;
}

// Hoist and merge identical SGPR initializations into a common predecessor.
// This is intended to combine M0 initializations, but can work with any
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
// execution.

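// For example (illustrative CFG): if two blocks bb.1 and bb.2 both contain
//
//   $m0 = S_MOV_B32 -1
//
// and no interfering def of $m0 is reachable in between, the two inits are
// merged into a single one placed in their nearest common dominator block.
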
static bool hoistAndMergeSGPRInits(unsigned Reg,
                                   const MachineRegisterInfo &MRI,
                                   const TargetRegisterInfo *TRI,
                                   MachineDominatorTree &MDT,
                                   const TargetInstrInfo *TII) {
  // List of inits by immediate value.
  using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
  InitListMap Inits;
  // List of clobbering instructions.
  SmallVector<MachineInstr*, 8> Clobbers;
  // List of instructions marked for deletion.
  SmallSet<MachineInstr*, 8> MergedInstrs;

  bool Changed = false;

  for (auto &MI : MRI.def_instructions(Reg)) {
    MachineOperand *Imm = nullptr;
    for (auto &MO : MI.operands()) {
      if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
          (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
        Imm = nullptr;
        break;
      }
      if (MO.isImm())
        Imm = &MO;
    }
    if (Imm)
      Inits[Imm->getImm()].push_front(&MI);
    else
      Clobbers.push_back(&MI);
  }

  for (auto &Init : Inits) {
    auto &Defs = Init.second;

    for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
      MachineInstr *MI1 = *I1;

      for (auto I2 = std::next(I1); I2 != E; ) {
        MachineInstr *MI2 = *I2;

        // Check any possible interference.
        auto interferes = [&](MachineBasicBlock::iterator From,
                              MachineBasicBlock::iterator To) -> bool {

          assert(MDT.dominates(&*To, &*From));

          auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
            const MachineBasicBlock *MBBFrom = From->getParent();
            const MachineBasicBlock *MBBTo = To->getParent();
            bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
            bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
            if (!MayClobberFrom && !MayClobberTo)
              return false;
            if ((MayClobberFrom && !MayClobberTo) ||
                (!MayClobberFrom && MayClobberTo))
              return true;
            // Both may be clobbered. This is not an interference only if both
            // are dominated by Clobber and belong to the same block, or if
            // Clobber properly dominates To. Given that To >> From, Clobber
            // then dominates both and is located in a common dominator.
            return !((MBBFrom == MBBTo &&
                      MDT.dominates(Clobber, &*From) &&
                      MDT.dominates(Clobber, &*To)) ||
                     MDT.properlyDominates(Clobber->getParent(), MBBTo));
          };

          return (llvm::any_of(Clobbers, interferes)) ||
                 (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
                    return C.first != Init.first &&
                           llvm::any_of(C.second, interferes);
                  }));
        };

        if (MDT.dominates(MI1, MI2)) {
          if (!interferes(MI2, MI1)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI2->getParent()) << " " << *MI2);
            MergedInstrs.insert(MI2);
            Changed = true;
            ++I2;
            continue;
          }
        } else if (MDT.dominates(MI2, MI1)) {
          if (!interferes(MI1, MI2)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        } else {
          auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
                                                     MI2->getParent());
          if (!MBB) {
            ++I2;
            continue;
          }

          MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
          if (!interferes(MI1, I) && !interferes(MI2, I)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1
                       << " and moving from "
                       << printMBBReference(*MI2->getParent()) << " to "
                       << printMBBReference(*I->getParent()) << " " << *MI2);
            I->getParent()->splice(I, MI2->getParent(), MI2);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        }
        ++I2;
      }
      ++I1;
    }
  }

  // Remove initializations that were merged into another.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    auto I = Defs.begin();
    while (I != Defs.end()) {
      if (MergedInstrs.count(*I)) {
        (*I)->eraseFromParent();
        I = Defs.erase(I);
      } else
        ++I;
    }
  }

  // Try to schedule SGPR initializations as early as possible in the MBB.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    for (auto *MI : Defs) {
      auto *MBB = MI->getParent();
      MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
      MachineBasicBlock::reverse_iterator B(BoundaryMI);
      // Check if B should actually be a boundary. If not, set the previous
      // instruction as the boundary instead.
      if (!TII->isBasicBlockPrologue(*B))
        B++;

      auto R = std::next(MI->getReverseIterator());
      const unsigned Threshold = 50;
      // Search until B or Threshold for a place to insert the initialization.
      for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
        if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
            TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
          break;

      // Move to directly after R.
      if (&*--R != MI)
        MBB->splice(*R, MBB, MI);
    }
  }

  if (Changed)
    MRI.clearKillFlags(Reg);

  return Changed;
}

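// Pass entry point. In rough order:
//  1. Walk every instruction once: classify COPYs (lowering special cases
//     immediately and scoring the remaining VGPR->SGPR copies), insert SGPR
//     copies for vector inputs of SGPR-defining WQM/INSERT_SUBREG/PHI/
//     REG_SEQUENCE instructions, and enforce the single-SGPR rule for
//     V_WRITELANE_B32.
//  2. Lower the scored VGPR->SGPR copies (moveToVALU or v_readfirstlane_b32).
//  3. Post-process: fix SCC copies, retry SGPR->VGPR retyping, fold copies
//     into REG_SEQUENCEs, legalize PHIs, and optionally hoist/merge M0
//     initializations.
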
bool SIFixSGPRCopies::run(MachineFunction &MF) {
  // Only need to run this in the SelectionDAG path.
  if (MF.getProperties().hasSelected())
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  for (MachineBasicBlock &MBB : MF) {
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      default:
        continue;
      case AMDGPU::COPY: {
        const TargetRegisterClass *SrcRC, *DstRC;
        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

        if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
          // Since SGPR to VGPR copies affect the VGPR to SGPR copy score and,
          // hence, the lowering decision, try to get rid of them as early as
          // possible.
          if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII))
            continue;

          // Collect those not changed to try them after VGPR to SGPR copies
          // lowering as there will be more opportunities.
          S2VCopies.push_back(&MI);
        }
        if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
          continue;
        if (lowerSpecialCase(MI, I))
          continue;

        analyzeVGPRToSGPRCopy(&MI);

        break;
      }
      case AMDGPU::WQM:
      case AMDGPU::STRICT_WQM:
      case AMDGPU::SOFT_WQM:
      case AMDGPU::STRICT_WWM:
      case AMDGPU::INSERT_SUBREG:
      case AMDGPU::PHI:
      case AMDGPU::REG_SEQUENCE: {
        if (TRI->isSGPRClass(TII->getOpRegClass(MI, 0))) {
          for (MachineOperand &MO : MI.operands()) {
            if (!MO.isReg() || !MO.getReg().isVirtual())
              continue;
            const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg());
            if (SrcRC == &AMDGPU::VReg_1RegClass)
              continue;

            if (TRI->hasVectorRegisters(SrcRC)) {
              const TargetRegisterClass *DestRC =
                  TRI->getEquivalentSGPRClass(SrcRC);
              Register NewDst = MRI->createVirtualRegister(DestRC);
              MachineBasicBlock *BlockToInsertCopy =
                  MI.isPHI() ? MI.getOperand(MO.getOperandNo() + 1).getMBB()
                             : &MBB;
              MachineBasicBlock::iterator PointToInsertCopy =
                  MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I;

              const DebugLoc &DL = MI.getDebugLoc();
              if (!tryMoveVGPRConstToSGPR(MO, NewDst, BlockToInsertCopy,
                                          PointToInsertCopy, DL)) {
                MachineInstr *NewCopy =
                    BuildMI(*BlockToInsertCopy, PointToInsertCopy, DL,
                            TII->get(AMDGPU::COPY), NewDst)
                        .addReg(MO.getReg());
                MO.setReg(NewDst);
                analyzeVGPRToSGPRCopy(NewCopy);
                PHISources.insert(NewCopy);
              }
            }
          }
        }

        if (MI.isPHI())
          PHINodes.push_back(&MI);
        else if (MI.isRegSequence())
          RegSequences.push_back(&MI);

        break;
      }

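      // A hypothetical example of the rewrite performed by the case below,
      // for a V_WRITELANE_B32 that ended up with two SGPR sources:
      //
      //   %vdst = V_WRITELANE_B32 %sgpr_val, %sgpr_lane, %vdst_in
      // ==>
      //   $m0 = COPY %sgpr_lane
      //   %vdst = V_WRITELANE_B32 %sgpr_val, $m0, %vdst_in
      //
      // M0 as the lane selector does not count against the constant bus, so
      // the single-SGPR rule is satisfied.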
      case AMDGPU::V_WRITELANE_B32: {
        // Some architectures allow more than one constant bus access without
        // the SGPR restriction.
        if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
          break;

        // Writelane is special in that it can use SGPR and M0 (which would
        // normally count as using the constant bus twice - but in this case it
        // is allowed since the lane selector doesn't count as a use of the
        // constant bus). However, it is still required to abide by the 1 SGPR
        // rule. Apply a fix here as we might have multiple SGPRs after
        // legalizing VGPRs to SGPRs.
        int Src0Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
        int Src1Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
        MachineOperand &Src0 = MI.getOperand(Src0Idx);
        MachineOperand &Src1 = MI.getOperand(Src1Idx);

        // Check to see if the instruction violates the 1 SGPR rule.
        if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
             Src0.getReg() != AMDGPU::M0) &&
            (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
             Src1.getReg() != AMDGPU::M0)) {

          // Check for a trivially easy constant prop into one of the operands.
          // If this is the case, then perform the propagation now to resolve
          // the SGPR issue. If we don't do that here, we will always insert a
          // mov to m0 that can't be resolved by the later operand-folding
          // pass.
          bool Resolved = false;
          for (MachineOperand *MO : {&Src0, &Src1}) {
            if (MO->getReg().isVirtual()) {
              MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
              if (DefMI && TII->isFoldableCopy(*DefMI)) {
                const MachineOperand &Def = DefMI->getOperand(0);
                if (Def.isReg() &&
                    MO->getReg() == Def.getReg() &&
                    MO->getSubReg() == Def.getSubReg()) {
                  const MachineOperand &Copied = DefMI->getOperand(1);
                  if (Copied.isImm() &&
                      TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
                    MO->ChangeToImmediate(Copied.getImm());
                    Resolved = true;
                    break;
                  }
                }
              }
            }
          }

          if (!Resolved) {
            // We haven't managed to resolve this by replacing an SGPR with an
            // immediate, so move src1 into M0.
            BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::COPY), AMDGPU::M0)
                .add(Src1);
            Src1.ChangeToRegister(AMDGPU::M0, false);
          }
        }
        break;
      }
      }
    }
  }

  lowerVGPR2SGPRCopies(MF);
  // Postprocessing.
  fixSCCCopies(MF);
  for (auto *MI : S2VCopies) {
    // Check if it is still valid.
    if (MI->isCopy()) {
      const TargetRegisterClass *SrcRC, *DstRC;
      std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI);
      if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
        tryChangeVGPRtoSGPRinCopy(*MI, TRI, TII);
    }
  }
  for (auto *MI : RegSequences) {
    // Check if it is still valid.
    if (MI->isRegSequence())
      foldVGPRCopyIntoRegSequence(*MI, TRI, TII, *MRI);
  }
  for (auto *MI : PHINodes) {
    processPHINode(*MI);
  }
  if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge)
    hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);

  SiblingPenalty.clear();
  V2SCopies.clear();
  SCCCopies.clear();
  RegSequences.clear();
  PHINodes.clear();
  S2VCopies.clear();
  PHISources.clear();

  return true;
}

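// Legalize a PHI whose result feeds scalar or vector values. If every
// transitive use of the PHI result (looking through copies and
// REG_SEQUENCEs) is an AGPR, the PHI is retyped to the equivalent AGPR
// class; otherwise, vector-valued PHIs are handed to
// SIInstrInfo::legalizeOperands. The retyping is then propagated into PHI
// operands that are themselves PHIs.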
void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
  bool AllAGPRUses = true;
  SetVector<const MachineInstr *> worklist;
  SmallSet<const MachineInstr *, 4> Visited;
  SetVector<MachineInstr *> PHIOperands;
  worklist.insert(&MI);
  Visited.insert(&MI);
  // HACK to make MIR tests with no uses happy.
  bool HasUses = false;
  while (!worklist.empty()) {
    const MachineInstr *Instr = worklist.pop_back_val();
    Register Reg = Instr->getOperand(0).getReg();
    for (const auto &Use : MRI->use_operands(Reg)) {
      HasUses = true;
      const MachineInstr *UseMI = Use.getParent();
      AllAGPRUses &= (UseMI->isCopy() &&
                      TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
                     TRI->isAGPR(*MRI, Use.getReg());
      if (UseMI->isCopy() || UseMI->isRegSequence()) {
        if (Visited.insert(UseMI).second)
          worklist.insert(UseMI);

        continue;
      }
    }
  }

  Register PHIRes = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
  if (HasUses && AllAGPRUses && !TRI->isAGPRClass(RC0)) {
    LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
    MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
    for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
      MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
      if (DefMI && DefMI->isPHI())
        PHIOperands.insert(DefMI);
    }
  }

  if (TRI->isVectorRegister(*MRI, PHIRes) ||
      RC0 == &AMDGPU::VReg_1RegClass) {
    LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
    TII->legalizeOperands(MI, MDT);
  }

  // Propagate register class back to PHI operands which are PHI themselves.
  while (!PHIOperands.empty()) {
    processPHINode(*PHIOperands.pop_back_val());
  }
}

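// For example (illustrative MIR), given a PHI input that is really a VGPR
// immediate
//
//   %0:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
//   %1:sreg_32 = PHI %0, %bb.0, ...
//
// tryMoveVGPRConstToSGPR rewrites the operand to use a fresh SGPR constant
//
//   %2:sreg_32 = S_MOV_B32 5
//   %1:sreg_32 = PHI %2, %bb.0, ...
//
// and deletes the V_MOV if it had no other uses.
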
bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR(
    MachineOperand &MaybeVGPRConstMO, Register DstReg,
    MachineBasicBlock *BlockToInsertTo,
    MachineBasicBlock::iterator PointToInsertTo, const DebugLoc &DL) {

  MachineInstr *DefMI = MRI->getVRegDef(MaybeVGPRConstMO.getReg());
  if (!DefMI || !DefMI->isMoveImmediate())
    return false;

  MachineOperand *SrcConst = TII->getNamedOperand(*DefMI, AMDGPU::OpName::src0);
  if (SrcConst->isReg())
    return false;

  const TargetRegisterClass *SrcRC =
      MRI->getRegClass(MaybeVGPRConstMO.getReg());
  unsigned MoveSize = TRI->getRegSizeInBits(*SrcRC);
  unsigned MoveOp = MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
  BuildMI(*BlockToInsertTo, PointToInsertTo, DL, TII->get(MoveOp), DstReg)
      .add(*SrcConst);
  if (MRI->hasOneUse(MaybeVGPRConstMO.getReg()))
    DefMI->eraseFromParent();
  MaybeVGPRConstMO.setReg(DstReg);
  return true;
}

bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
                                       MachineBasicBlock::iterator &I) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  if (!DstReg.isVirtual()) {
    // If the destination register is a physical register there isn't
    // really much we can do to fix this.
    // Some special instructions use M0 as an input. Some even only use
    // the first lane. Insert a readfirstlane and hope for the best.
    if (DstReg == AMDGPU::M0 &&
        TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) {
      Register TmpReg =
          MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
          .add(MI.getOperand(1));
      MI.getOperand(1).setReg(TmpReg);
    } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(),
                                      MI, MI.getDebugLoc())) {
      I = std::next(I);
      MI.eraseFromParent();
    }
    return true;
  }
  if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
    SIInstrWorklist worklist;
    worklist.insert(&MI);
    TII->moveToVALU(worklist, MDT);
    return true;
  }

  unsigned SMovOp;
  int64_t Imm;
  // If we are just copying an immediate, we can replace the copy with
  // s_mov_b32.
  if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
    MI.getOperand(1).ChangeToImmediate(Imm);
    MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
    MI.setDesc(TII->get(SMovOp));
    return true;
  }
  return false;
}

void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
  if (PHISources.contains(MI))
    return;
  Register DstReg = MI->getOperand(0).getReg();
  const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);

  V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI,
                   TRI->getRegSizeInBits(*DstRC));
  SmallVector<MachineInstr *, 8> AnalysisWorklist;
  // Needed because the SSA is not a tree but a graph and may have
  // forks and joins, so we should not walk the same path twice.
  DenseSet<MachineInstr *> Visited;
  AnalysisWorklist.push_back(Info.Copy);
  while (!AnalysisWorklist.empty()) {

    MachineInstr *Inst = AnalysisWorklist.pop_back_val();

    if (!Visited.insert(Inst).second)
      continue;

    // Copies and REG_SEQUENCE do not contribute to the final assembly,
    // so skip them, but take care of the SGPR to VGPR copy bookkeeping.
    if (Inst->isCopy() || Inst->isRegSequence()) {
      if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
        if (!Inst->isCopy() ||
            !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
          Info.NumSVCopies++;
          continue;
        }
      }
    }

    SiblingPenalty[Inst].insert(Info.ID);

    SmallVector<MachineInstr *, 4> Users;
    if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
        (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
      auto I = Inst->getIterator();
      auto E = Inst->getParent()->end();
      while (++I != E &&
             !I->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) {
        if (I->readsRegister(AMDGPU::SCC, /*TRI=*/nullptr))
          Users.push_back(&*I);
      }
    } else if (Inst->getNumExplicitDefs() != 0) {
      Register Reg = Inst->getOperand(0).getReg();
      if (Reg.isVirtual() && TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst)) {
        for (auto &U : MRI->use_instructions(Reg))
          Users.push_back(&U);
      }
    }
    for (auto *U : Users) {
      if (TII->isSALU(*U))
        Info.SChain.insert(U);
      AnalysisWorklist.push_back(U);
    }
  }
  V2SCopies[Info.ID] = Info;
}

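// A worked example of the sibling penalty computed below (made-up numbers):
// two copies of different subregs of the same 64-bit source count as two
// distinct (register, subreg) pairs, so each copy sees SiblingPenalty = 1
// from the other; siblings copying the same (register, subreg) pair are
// expected to be coalesced by regalloc and are counted only once.
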
// The main function: computes the VGPR to SGPR copy score and decides how
// the copy should be lowered, via v_readfirstlane_b32 or moveToVALU.
bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) {
  if (Info->SChain.empty()) {
    Info->Score = 0;
    return true;
  }
  Info->Siblings = SiblingPenalty[*llvm::max_element(
      Info->SChain, [&](MachineInstr *A, MachineInstr *B) -> bool {
        return SiblingPenalty[A].size() < SiblingPenalty[B].size();
      })];
  Info->Siblings.remove_if([&](unsigned ID) { return ID == Info->ID; });
  // The loop below computes the number of other VGPR to SGPR copies that
  // contribute to the current copy's SALU chain. We assume that all the
  // V2SCopies with the same source virtual register will be squashed to one
  // by regalloc. We also take care of the V2SCopies of the different subregs
  // of the same register.
  SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
  for (auto J : Info->Siblings) {
    auto *InfoIt = V2SCopies.find(J);
    if (InfoIt != V2SCopies.end()) {
      MachineInstr *SiblingCopy = InfoIt->second.Copy;
      if (SiblingCopy->isImplicitDef())
        // The COPY has already been MoveToVALUed.
        continue;

      SrcRegs.insert(std::pair(SiblingCopy->getOperand(1).getReg(),
                               SiblingCopy->getOperand(1).getSubReg()));
    }
  }
  Info->SiblingPenalty = SrcRegs.size();

  unsigned Penalty =
      Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes;
  unsigned Profit = Info->SChain.size();
  Info->Score = Penalty > Profit ? 0 : Profit - Penalty;
  Info->NeedToBeConvertedToVALU = Info->Score < 3;
  return Info->NeedToBeConvertedToVALU;
}

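// For wide copies kept on the SALU, lowerVGPR2SGPRCopies splits the source
// into 32-bit channels, e.g. a 64-bit copy becomes (illustrative MIR, with
// %lo/%hi as made-up names):
//
//   %lo:sreg_32_xm0 = V_READFIRSTLANE_B32 %src.sub0
//   %hi:sreg_32_xm0 = V_READFIRSTLANE_B32 %src.sub1
//   %dst:sreg_64    = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
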
void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {

  SmallVector<unsigned, 8> LoweringWorklist;
  for (auto &C : V2SCopies) {
    if (needToBeConvertedToVALU(&C.second))
      LoweringWorklist.push_back(C.second.ID);
  }

  // Store all the V2S copy instructions that need to be moved to VALU
  // in the Copies worklist.
  SIInstrWorklist Copies;

  while (!LoweringWorklist.empty()) {
    unsigned CurID = LoweringWorklist.pop_back_val();
    auto *CurInfoIt = V2SCopies.find(CurID);
    if (CurInfoIt != V2SCopies.end()) {
      V2SCopyInfo C = CurInfoIt->second;
      LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
      for (auto S : C.Siblings) {
        auto *SibInfoIt = V2SCopies.find(S);
        if (SibInfoIt != V2SCopies.end()) {
          V2SCopyInfo &SI = SibInfoIt->second;
          LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
          if (!SI.NeedToBeConvertedToVALU) {
            SI.SChain.set_subtract(C.SChain);
            if (needToBeConvertedToVALU(&SI))
              LoweringWorklist.push_back(SI.ID);
          }
          SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; });
        }
      }
      LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
                        << " is being turned to VALU\n");
      // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if
      // instead.
      V2SCopies.erase(C.ID);
      Copies.insert(C.Copy);
    }
  }

  TII->moveToVALU(Copies, MDT);
  Copies.clear();

  // Now do the actual lowering.
  for (auto C : V2SCopies) {
    MachineInstr *MI = C.second.Copy;
    MachineBasicBlock *MBB = MI->getParent();
    // We decided to turn the V2S copy into v_readfirstlane_b32;
    // remove it from V2SCopies and from all of its siblings' lists.
    LLVM_DEBUG(dbgs() << "V2S copy " << *MI
                      << " is being turned to v_readfirstlane_b32"
                      << " Score: " << C.second.Score << "\n");
    Register DstReg = MI->getOperand(0).getReg();
    MRI->constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);

    Register SrcReg = MI->getOperand(1).getReg();
    unsigned SubReg = MI->getOperand(1).getSubReg();
    const TargetRegisterClass *SrcRC =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
    size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
    if (SrcSize == 16) {
      assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
             "We do not expect to see 16-bit copies from VGPR to SGPR unless "
             "we have 16-bit VGPRs");
      assert(MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
             MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
      // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32
      // bits.
      MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
      Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      const DebugLoc &DL = MI->getDebugLoc();
      Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32)
          .addReg(SrcReg, 0, SubReg)
          .addImm(AMDGPU::lo16)
          .addReg(Undef)
          .addImm(AMDGPU::hi16);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
          .addReg(VReg32);
    } else if (SrcSize == 32) {
      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
      MIB.addReg(SrcReg, 0, SubReg);
    } else {
      auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
                            TII->get(AMDGPU::REG_SEQUENCE), DstReg);
      int N = TRI->getRegSizeInBits(*SrcRC) / 32;
      for (int i = 0; i < N; i++) {
        Register PartialSrc = TII->buildExtractSubReg(
            Result, *MRI, MI->getOperand(1), SrcRC,
            TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
        Register PartialDst =
            MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        BuildMI(*MBB, *Result, Result->getDebugLoc(),
                TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
            .addReg(PartialSrc);
        Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
      }
    }
    MI->eraseFromParent();
  }
}

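// fixSCCCopies rewrites copies in and out of SCC, which cannot be copied
// directly. Rough sketches (illustrative MIR, wave64):
//
//   %b:sreg_64 = COPY $scc
// ==>
//   %m:sreg_64 = S_CSELECT_B64 -1, 0   ; materialize the wave mask
//   %b:sreg_64 = COPY %m
//
//   $scc = COPY %s
// ==>
//   %t:sreg_64 = S_AND_B64 %s, $exec   ; sets SCC as a side effect
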
void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
  bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32();
  for (MachineBasicBlock &MBB : MF) {
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      MachineInstr &MI = *I;
      // May already have been lowered.
      if (!MI.isCopy())
        continue;
      Register SrcReg = MI.getOperand(1).getReg();
      Register DstReg = MI.getOperand(0).getReg();
      if (SrcReg == AMDGPU::SCC) {
        Register SCCCopy =
            MRI->createVirtualRegister(TRI->getWaveMaskRegClass());
        I = BuildMI(*MI.getParent(),
                    std::next(MachineBasicBlock::iterator(MI)),
                    MI.getDebugLoc(),
                    TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32
                                      : AMDGPU::S_CSELECT_B64),
                    SCCCopy)
                .addImm(-1)
                .addImm(0);
        I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
                    TII->get(AMDGPU::COPY), DstReg)
                .addReg(SCCCopy);
        MI.eraseFromParent();
        continue;
      }
      if (DstReg == AMDGPU::SCC) {
        unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
        Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
        Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
        I = BuildMI(*MI.getParent(),
                    std::next(MachineBasicBlock::iterator(MI)),
                    MI.getDebugLoc(), TII->get(Opcode))
                .addReg(Tmp, getDefRegState(true))
                .addReg(SrcReg)
                .addReg(Exec);
        MI.eraseFromParent();
      }
    }
  }
}

PreservedAnalyses
SIFixSGPRCopiesPass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
  SIFixSGPRCopies Impl(&MDT);
  bool Changed = Impl.run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  // TODO: We could detect CFG changed.
  auto PA = getMachineFunctionPassPreservedAnalyses();
  return PA;
}