//===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Copies from VGPR to SGPR registers are illegal and the register coalescer
/// will sometimes generate these illegal copies in situations like this:
///
///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   %1 <vsrc> = COPY %0 <sgpr>
///    ...
///    BRANCH %cond BB1, BB2
///  BB1:
///    %2 <vgpr> = VECTOR_INST
///    %3 <vsrc> = COPY %2 <vgpr>
///  BB2:
///    %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
///    %5 <vgpr> = VECTOR_INST %4 <vsrc>
///
///
/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
/// code will look like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///    ...
///    BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is forced to constrain the register class of %3 to
/// <sgpr> so we end up with final code like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///    ...
///    BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <sgpr> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now this code contains an illegal copy from a VGPR to an SGPR.
///
/// In order to avoid this problem, this pass searches for PHI instructions
/// which define a <vsrc> register and constrains their definition class to
/// <vgpr> if a user of the PHI's definition register is a vector instruction.
/// If the PHI's definition class is constrained to <vgpr> then the coalescer
/// will be unable to perform the COPY removal from the above example which
/// ultimately led to the creation of an illegal COPY.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-fix-sgpr-copies"

static cl::opt<bool> EnableM0Merge(
  "amdgpu-enable-merge-m0",
  cl::desc("Merge and hoist M0 initializations"),
  cl::init(true));

namespace {
class SIFixSGPRCopies : public MachineFunctionPass {
  MachineDominatorTree *MDT;
  unsigned NextVGPRToSGPRCopyID;

public:
  static char ID;

  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {}

  bool runOnMachineFunction(MachineFunction &MF) override;
  unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
  void lowerVGPR2SGPRCopies(MachineFunction &MF);
  // Handles copies whose source register is:
  // 1. Physical register
  // 2. AGPR
  // 3. Defined by an instruction that merely moves an immediate
  bool lowerSpecialCase(MachineInstr &MI);

  MachineBasicBlock *processPHINode(MachineInstr &MI);

  StringRef getPassName() const override { return "SI Fix SGPR copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
                      "SI Fix SGPR copies", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
                    "SI Fix SGPR copies", false, false)

char SIFixSGPRCopies::ID = 0;

char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;

FunctionPass *llvm::createSIFixSGPRCopiesPass() {
  return new SIFixSGPRCopies();
}

static bool hasVectorOperands(const MachineInstr &MI,
                              const SIRegisterInfo *TRI) {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg() || !MO.getReg().isVirtual())
      continue;

    if (TRI->hasVectorRegisters(MRI.getRegClass(MO.getReg())))
      return true;
  }
  return false;
}

static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
                  const SIRegisterInfo &TRI,
                  const MachineRegisterInfo &MRI) {
  Register DstReg = Copy.getOperand(0).getReg();
  Register SrcReg = Copy.getOperand(1).getReg();

  const TargetRegisterClass *SrcRC = SrcReg.isVirtual()
                                         ? MRI.getRegClass(SrcReg)
                                         : TRI.getPhysRegClass(SrcReg);

  // We don't really care about the subregister here.
  // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());

  const TargetRegisterClass *DstRC = DstReg.isVirtual()
                                         ? MRI.getRegClass(DstReg)
                                         : TRI.getPhysRegClass(DstReg);

  return std::make_pair(SrcRC, DstRC);
}

static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
         TRI.hasVectorRegisters(SrcRC);
}

static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
         TRI.hasVectorRegisters(DstRC);
}

static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
                                      const SIRegisterInfo *TRI,
                                      const SIInstrInfo *TII) {
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  auto &Src = MI.getOperand(1);
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = Src.getReg();
  if (!SrcReg.isVirtual() || !DstReg.isVirtual())
    return false;

  for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
    const auto *UseMI = MO.getParent();
    if (UseMI == &MI)
      continue;
    if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
        UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
      return false;

    unsigned OpIdx = UseMI->getOperandNo(&MO);
    if (OpIdx >= UseMI->getDesc().getNumOperands() ||
        !TII->isOperandLegal(*UseMI, OpIdx, &Src))
      return false;
  }
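  // Every user of DstReg accepts an SGPR operand at this point, so the
  // destination can safely be made scalar. Illustrative example (hypothetical
  // MIR, not taken from a real test): if all users of
  //   %1:vgpr_32 = COPY %0:sgpr_32
  // tolerate an SGPR operand, the copy becomes
  //   %1:sgpr_32 = COPY %0:sgpr_32
  // which the coalescer can later remove.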
  // Change VGPR to SGPR destination.
  MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
  return true;
}

// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
// SGPRx = ...
// SGPRy = REG_SEQUENCE SGPRx, sub0 ...
// VGPRz = COPY SGPRy
//
// ==>
//
// VGPRx = COPY SGPRx
// VGPRz = REG_SEQUENCE VGPRx, sub0
//
// This exposes immediate folding opportunities when materializing 64-bit
// immediates.
static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
                                        const SIRegisterInfo *TRI,
                                        const SIInstrInfo *TII,
                                        MachineRegisterInfo &MRI) {
  assert(MI.isRegSequence());

  Register DstReg = MI.getOperand(0).getReg();
  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
    return false;

  if (!MRI.hasOneUse(DstReg))
    return false;

  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
  if (!CopyUse.isCopy())
    return false;

  // It is illegal to have vreg inputs to a physreg defining reg_sequence.
  if (CopyUse.getOperand(0).getReg().isPhysical())
    return false;

  const TargetRegisterClass *SrcRC, *DstRC;
  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);

  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
    return false;

  if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
    return true;

  // TODO: Could have multiple extracts?
  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
  if (SubReg != AMDGPU::NoSubRegister)
    return false;

  MRI.setRegClass(DstReg, DstRC);

  // SGPRx = ...
  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
  // VGPRz = COPY SGPRy

  // =>
  // VGPRx = COPY SGPRx
  // VGPRz = REG_SEQUENCE VGPRx, sub0

  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
  bool IsAGPR = TRI->isAGPRClass(DstRC);

  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
    Register SrcReg = MI.getOperand(I).getReg();
    unsigned SrcSubReg = MI.getOperand(I).getSubReg();

    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
    assert(TRI->isSGPRClass(SrcRC) &&
           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");

    SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);

    Register TmpReg = MRI.createVirtualRegister(NewSrcRC);

    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            TmpReg)
        .add(MI.getOperand(I));

    if (IsAGPR) {
      const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
      Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
        AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY;
      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
              TmpAReg)
          .addReg(TmpReg, RegState::Kill);
      TmpReg = TmpAReg;
    }

    MI.getOperand(I).setReg(TmpReg);
  }

  CopyUse.eraseFromParent();
  return true;
}

static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                    const MachineInstr *MoveImm,
                                    const SIInstrInfo *TII,
                                    unsigned &SMovOp,
                                    int64_t &Imm) {
  if (Copy->getOpcode() != AMDGPU::COPY)
    return false;

  if (!MoveImm->isMoveImmediate())
    return false;

  const MachineOperand *ImmOp =
      TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
  if (!ImmOp->isImm())
    return false;

  // FIXME: Handle copies with sub-regs.
  if (Copy->getOperand(0).getSubReg())
    return false;
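  // Illustrative example (hypothetical MIR) of a foldable pair:
  //   %0:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
  //   %1:sgpr_32 = COPY %0
  // The caller can then rewrite the COPY as %1:sgpr_32 = S_MOV_B32 42.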
  switch (MoveImm->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_MOV_B32_e32:
    SMovOp = AMDGPU::S_MOV_B32;
    break;
  case AMDGPU::V_MOV_B64_PSEUDO:
    SMovOp = AMDGPU::S_MOV_B64;
    break;
  }
  Imm = ImmOp->getImm();
  return true;
}

template <class UnaryPredicate>
bool searchPredecessors(const MachineBasicBlock *MBB,
                        const MachineBasicBlock *CutOff,
                        UnaryPredicate Predicate) {
  if (MBB == CutOff)
    return false;

  DenseSet<const MachineBasicBlock *> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(MBB->predecessors());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (!Visited.insert(MBB).second)
      continue;
    if (MBB == CutOff)
      continue;
    if (Predicate(MBB))
      return true;

    Worklist.append(MBB->pred_begin(), MBB->pred_end());
  }

  return false;
}

// Checks if there is a potential path from the From instruction to the To
// instruction. If CutOff is specified and sits on that path, we ignore the
// portion of the path above it and report To as not reachable.
static bool isReachable(const MachineInstr *From,
                        const MachineInstr *To,
                        const MachineBasicBlock *CutOff,
                        MachineDominatorTree &MDT) {
  if (MDT.dominates(From, To))
    return true;

  const MachineBasicBlock *MBBFrom = From->getParent();
  const MachineBasicBlock *MBBTo = To->getParent();

  // Do predecessor search.
  // We should almost never get here since we do not usually produce M0 stores
  // other than -1.
  return searchPredecessors(MBBTo, CutOff, [MBBFrom]
           (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
}

// Return the first non-prologue instruction in the block.
static MachineBasicBlock::iterator
getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
  MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
  while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
    ++I;

  return I;
}

// Hoist and merge identical SGPR initializations into a common predecessor.
// This is intended to combine M0 initializations, but can work with any
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
// execution.
static bool hoistAndMergeSGPRInits(unsigned Reg,
                                   const MachineRegisterInfo &MRI,
                                   const TargetRegisterInfo *TRI,
                                   MachineDominatorTree &MDT,
                                   const TargetInstrInfo *TII) {
  // List of inits by immediate value.
  using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
  InitListMap Inits;
  // List of clobbering instructions.
  SmallVector<MachineInstr*, 8> Clobbers;
  // List of instructions marked for deletion.
  SmallSet<MachineInstr*, 8> MergedInstrs;

  bool Changed = false;

  for (auto &MI : MRI.def_instructions(Reg)) {
    MachineOperand *Imm = nullptr;
    for (auto &MO : MI.operands()) {
      if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
          (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
        Imm = nullptr;
        break;
      } else if (MO.isImm())
        Imm = &MO;
    }
    if (Imm)
      Inits[Imm->getImm()].push_front(&MI);
    else
      Clobbers.push_back(&MI);
  }

  for (auto &Init : Inits) {
    auto &Defs = Init.second;

    for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
      MachineInstr *MI1 = *I1;

      for (auto I2 = std::next(I1); I2 != E; ) {
        MachineInstr *MI2 = *I2;

        // Check any possible interference.
        auto interferes = [&](MachineBasicBlock::iterator From,
                              MachineBasicBlock::iterator To) -> bool {

          assert(MDT.dominates(&*To, &*From));

          auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
            const MachineBasicBlock *MBBFrom = From->getParent();
            const MachineBasicBlock *MBBTo = To->getParent();
            bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
            bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
            if (!MayClobberFrom && !MayClobberTo)
              return false;
            if ((MayClobberFrom && !MayClobberTo) ||
                (!MayClobberFrom && MayClobberTo))
              return true;
            // Both can clobber, this is not an interference only if both are
            // dominated by Clobber and belong to the same block or if Clobber
            // properly dominates To, given that To >> From, so it dominates
            // both and is located in a common dominator.
            return !((MBBFrom == MBBTo &&
                      MDT.dominates(Clobber, &*From) &&
                      MDT.dominates(Clobber, &*To)) ||
                     MDT.properlyDominates(Clobber->getParent(), MBBTo));
          };

          return (llvm::any_of(Clobbers, interferes)) ||
                 (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
                    return C.first != Init.first &&
                           llvm::any_of(C.second, interferes);
                  }));
        };

        if (MDT.dominates(MI1, MI2)) {
          if (!interferes(MI2, MI1)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI2->getParent()) << " " << *MI2);
            MergedInstrs.insert(MI2);
            Changed = true;
            ++I2;
            continue;
          }
        } else if (MDT.dominates(MI2, MI1)) {
          if (!interferes(MI1, MI2)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        } else {
          auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
                                                     MI2->getParent());
          if (!MBB) {
            ++I2;
            continue;
          }

          MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
          if (!interferes(MI1, I) && !interferes(MI2, I)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1
                       << "and moving from "
                       << printMBBReference(*MI2->getParent()) << " to "
                       << printMBBReference(*I->getParent()) << " " << *MI2);
            I->getParent()->splice(I, MI2->getParent(), MI2);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        }
        ++I2;
      }
      ++I1;
    }
  }

  // Remove initializations that were merged into another.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    auto I = Defs.begin();
    while (I != Defs.end()) {
      if (MergedInstrs.count(*I)) {
        (*I)->eraseFromParent();
        I = Defs.erase(I);
      } else
        ++I;
    }
  }

  // Try to schedule SGPR initializations as early as possible in the MBB.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    for (auto MI : Defs) {
      auto MBB = MI->getParent();
      MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
      MachineBasicBlock::reverse_iterator B(BoundaryMI);
      // Check if B should actually be a boundary. If not, set the previous
      // instruction as the boundary instead.
      if (!TII->isBasicBlockPrologue(*B))
        B++;

      auto R = std::next(MI->getReverseIterator());
      const unsigned Threshold = 50;
      // Search until B or Threshold for a place to insert the initialization.
      for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
        if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
            TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
          break;

      // Move to directly after R.
      if (&*--R != MI)
        MBB->splice(*R, MBB, MI);
    }
  }

  if (Changed)
    MRI.clearKillFlags(Reg);

  return Changed;
}

bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
  // Only need to run this in the SelectionDAG path.
  if (MF.getProperties().hasProperty(
        MachineFunctionProperties::Property::Selected))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();
  MDT = &getAnalysis<MachineDominatorTree>();

  // We have to lower VGPR to SGPR copies before the main loop because the
  // REG_SEQUENCE and PHI lowering in the main loop converts the def-use
  // chains to VALU and closes the opportunities for keeping them scalar.
  // TODO: REG_SEQUENCE and PHIs are semantically copies. The next patch
  // addresses their lowering and unifies the processing in one main loop.
  lowerVGPR2SGPRCopies(MF);

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock *MBB = &*BI;
    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
         ++I) {
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      default:
        continue;
      case AMDGPU::COPY:
      case AMDGPU::WQM:
      case AMDGPU::STRICT_WQM:
      case AMDGPU::SOFT_WQM:
      case AMDGPU::STRICT_WWM: {
        Register DstReg = MI.getOperand(0).getReg();
        const TargetRegisterClass *SrcRC, *DstRC;
        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

        if (MI.isCopy()) {
          Register SrcReg = MI.getOperand(1).getReg();
          if (SrcReg == AMDGPU::SCC) {
            Register SCCCopy = MRI->createVirtualRegister(
                TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
            I = BuildMI(*MI.getParent(),
                        std::next(MachineBasicBlock::iterator(MI)),
                        MI.getDebugLoc(),
                        TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                               : AMDGPU::S_CSELECT_B64),
                        SCCCopy)
                    .addImm(-1)
                    .addImm(0);
            I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
                        TII->get(AMDGPU::COPY), DstReg)
                    .addReg(SCCCopy);
            MI.eraseFromParent();
            continue;
          } else if (DstReg == AMDGPU::SCC) {
            unsigned Opcode =
                ST.isWave64() ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
            Register Exec = ST.isWave64() ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
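            // The copy into SCC is implemented as an S_AND of the source with
            // EXEC into a scratch register; SCC itself is produced by the
            // implicit SCC def of the S_AND.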
            Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
            I = BuildMI(*MI.getParent(),
                        std::next(MachineBasicBlock::iterator(MI)),
                        MI.getDebugLoc(), TII->get(Opcode))
                    .addReg(Tmp, getDefRegState(true))
                    .addReg(SrcReg)
                    .addReg(Exec);
            MI.eraseFromParent();
            continue;
          }
        }

        if (!DstReg.isVirtual()) {
          // If the destination register is a physical register there isn't
          // really much we can do to fix this.
          // Some special instructions use M0 as an input. Some even only use
          // the first lane. Insert a readfirstlane and hope for the best.
          if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
            Register TmpReg
              = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

            BuildMI(*MBB, MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
                .add(MI.getOperand(1));
            MI.getOperand(1).setReg(TmpReg);
          }

          continue;
        }

        if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
          tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
        }

        break;
      }
      case AMDGPU::PHI: {
        MachineBasicBlock *NewBB = processPHINode(MI);
        if (NewBB && NewBB != MBB) {
          MBB = NewBB;
          E = MBB->end();
          BI = MachineFunction::iterator(MBB);
          BE = MF.end();
        }
        assert((!NewBB || NewBB == I->getParent()) &&
               "moveToVALU did not return the right basic block");
        break;
      }
      case AMDGPU::REG_SEQUENCE: {
        if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
            !hasVectorOperands(MI, TRI)) {
          foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
          continue;
        }

        LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);

        MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
        if (NewBB && NewBB != MBB) {
          MBB = NewBB;
          E = MBB->end();
          BI = MachineFunction::iterator(MBB);
          BE = MF.end();
        }
        assert((!NewBB || NewBB == I->getParent()) &&
               "moveToVALU did not return the right basic block");
        break;
      }
      case AMDGPU::INSERT_SUBREG: {
        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
        DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
        Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
        Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
        if (TRI->isSGPRClass(DstRC) &&
            (TRI->hasVectorRegisters(Src0RC) ||
             TRI->hasVectorRegisters(Src1RC))) {
          LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
          MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
          if (NewBB && NewBB != MBB) {
            MBB = NewBB;
            E = MBB->end();
            BI = MachineFunction::iterator(MBB);
            BE = MF.end();
          }
          assert((!NewBB || NewBB == I->getParent()) &&
                 "moveToVALU did not return the right basic block");
        }
        break;
      }
      case AMDGPU::V_WRITELANE_B32: {
        // Some architectures allow more than one constant bus access without
        // SGPR restriction.
        if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
          break;

        // Writelane is special in that it can use SGPR and M0 (which would
        // normally count as using the constant bus twice - but in this case it
        // is allowed since the lane selector doesn't count as a use of the
        // constant bus). However, it is still required to abide by the 1 SGPR
        // rule. Apply a fix here as we might have multiple SGPRs after
        // legalizing VGPRs to SGPRs.
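        // For instance (illustrative MIR, not taken from a real test):
        //   V_WRITELANE_B32 %vdst, %sgpr0, %sgpr1
        // uses two distinct SGPRs and violates the rule; the code below either
        // folds an inline-constant definition into one of the sources or
        // routes src1 through m0:
        //   $m0 = COPY %sgpr1
        //   V_WRITELANE_B32 %vdst, %sgpr0, $m0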
        int Src0Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
        int Src1Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
        MachineOperand &Src0 = MI.getOperand(Src0Idx);
        MachineOperand &Src1 = MI.getOperand(Src1Idx);

        // Check to see if the instruction violates the 1 SGPR rule.
        if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
             Src0.getReg() != AMDGPU::M0) &&
            (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
             Src1.getReg() != AMDGPU::M0)) {

          // Check for a trivially easy constant prop into one of the operands.
          // If this is the case then perform the operation now to resolve the
          // SGPR issue. If we don't do that here we will always insert a mov
          // to m0 that can't be resolved in a later operand folding pass.
          bool Resolved = false;
          for (MachineOperand *MO : {&Src0, &Src1}) {
            if (MO->getReg().isVirtual()) {
              MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
              if (DefMI && TII->isFoldableCopy(*DefMI)) {
                const MachineOperand &Def = DefMI->getOperand(0);
                if (Def.isReg() &&
                    MO->getReg() == Def.getReg() &&
                    MO->getSubReg() == Def.getSubReg()) {
                  const MachineOperand &Copied = DefMI->getOperand(1);
                  if (Copied.isImm() &&
                      TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
                    MO->ChangeToImmediate(Copied.getImm());
                    Resolved = true;
                    break;
                  }
                }
              }
            }
          }

          if (!Resolved) {
            // Haven't managed to resolve by replacing an SGPR with an
            // immediate, so move src1 to be in M0.
            BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::COPY), AMDGPU::M0)
                .add(Src1);
            Src1.ChangeToRegister(AMDGPU::M0, false);
          }
        }
        break;
      }
      }
    }
  }

  if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
    hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);

  return true;
}

MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
  unsigned numVGPRUses = 0;
  bool AllAGPRUses = true;
  SetVector<const MachineInstr *> worklist;
  SmallSet<const MachineInstr *, 4> Visited;
  SetVector<MachineInstr *> PHIOperands;
  MachineBasicBlock *CreatedBB = nullptr;
  worklist.insert(&MI);
  Visited.insert(&MI);
  while (!worklist.empty()) {
    const MachineInstr *Instr = worklist.pop_back_val();
    Register Reg = Instr->getOperand(0).getReg();
    for (const auto &Use : MRI->use_operands(Reg)) {
      const MachineInstr *UseMI = Use.getParent();
      AllAGPRUses &= (UseMI->isCopy() &&
                      TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
                     TRI->isAGPR(*MRI, Use.getReg());
      if (UseMI->isCopy() || UseMI->isRegSequence()) {
        if (UseMI->isCopy() &&
            UseMI->getOperand(0).getReg().isPhysical() &&
            !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
          numVGPRUses++;
        }
        if (Visited.insert(UseMI).second)
          worklist.insert(UseMI);

        continue;
      }

      if (UseMI->isPHI()) {
        const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
        if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
            UseRC != &AMDGPU::VReg_1RegClass)
          numVGPRUses++;
        continue;
      }

      const TargetRegisterClass *OpRC =
          TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
      if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
          OpRC != &AMDGPU::VS_64RegClass) {
        numVGPRUses++;
      }
    }
  }

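  // At this point numVGPRUses counts the uses reached from the PHI that want a
  // vector value, and AllAGPRUses says whether every use goes through AGPRs;
  // together with the inputs scanned below, they decide the register class of
  // the PHI result.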
  Register PHIRes = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
  if (AllAGPRUses && numVGPRUses && !TRI->isAGPRClass(RC0)) {
    LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
    MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
    for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
      MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
      if (DefMI && DefMI->isPHI())
        PHIOperands.insert(DefMI);
    }
  }

  bool hasVGPRInput = false;
  for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
    Register InputReg = MI.getOperand(i).getReg();
    MachineInstr *Def = MRI->getVRegDef(InputReg);
    if (TRI->isVectorRegister(*MRI, InputReg)) {
      if (Def->isCopy()) {
        Register SrcReg = Def->getOperand(1).getReg();
        const TargetRegisterClass *RC =
            TRI->getRegClassForReg(*MRI, SrcReg);
        if (TRI->isSGPRClass(RC))
          continue;
      }
      hasVGPRInput = true;
      break;
    }
    else if (Def->isCopy() &&
             TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
      Register SrcReg = Def->getOperand(1).getReg();
      MachineInstr *SrcDef = MRI->getVRegDef(SrcReg);
      unsigned SMovOp;
      int64_t Imm;
      if (!isSafeToFoldImmIntoCopy(Def, SrcDef, TII, SMovOp, Imm)) {
        hasVGPRInput = true;
        break;
      } else {
        // Formally, if we did not do this right away it would be done on the
        // next iteration of the runOnMachineFunction main loop. But why not
        // do it now, if we can?
        MachineFunction *MF = MI.getParent()->getParent();
        Def->getOperand(1).ChangeToImmediate(Imm);
        Def->addImplicitDefUseOperands(*MF);
        Def->setDesc(TII->get(SMovOp));
      }
    }
  }

  if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
       RC0 != &AMDGPU::VReg_1RegClass) &&
      (hasVGPRInput || numVGPRUses > 1)) {
    LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
    CreatedBB = TII->moveToVALU(MI);
  }
  else {
    LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
    TII->legalizeOperands(MI, MDT);
  }

  // Propagate the register class back to PHI operands which are PHIs
  // themselves.
  while (!PHIOperands.empty()) {
    processPHINode(*PHIOperands.pop_back_val());
  }
  return CreatedBB;
}

bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();
  const TargetRegisterClass *SrcRC, *DstRC;
  std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

  // We return true to indicate that no further processing is needed.
  if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
    return true;

  Register SrcReg = MI.getOperand(1).getReg();
  if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
    TII->moveToVALU(MI, MDT);
    return true;
  }

  unsigned SMovOp;
  int64_t Imm;
  // If we are just copying an immediate, we can replace the copy with
  // s_mov_b32.
  if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
    MI.getOperand(1).ChangeToImmediate(Imm);
    MI.addImplicitDefUseOperands(*MBB->getParent());
    MI.setDesc(TII->get(SMovOp));
    return true;
  }
  return false;
}

class V2SCopyInfo {
public:
  // VGPR to SGPR copy being processed
  MachineInstr *Copy;
  // All SALU instructions reachable from this copy in the SSA graph
  DenseSet<MachineInstr *> SChain;
  // Number of SGPR to VGPR copies that are used to put the SALU computation
  // results back to VALU.
  unsigned NumSVCopies;

  unsigned Score;
  // Actual count of v_readfirstlane_b32 instructions that need to be inserted
  // to keep SChain SALU.
  unsigned NumReadfirstlanes;
  // Current score state, used to speed up the selection of V2SCopyInfos for
  // processing.
  bool NeedToBeConvertedToVALU = false;
  // Unique ID. Used as a key for mapping to keep permanent order.
  unsigned ID;

  // Count of other VGPR to SGPR copies that contribute to the current copy's
  // SChain.
  unsigned SiblingPenalty = 0;
  SetVector<unsigned> Siblings;
  V2SCopyInfo() : Copy(nullptr), ID(0){};
  V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
      : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump() {
    dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
           << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
           << "\nScore: " << Score << "\n";
  }
#endif
};

void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {

  DenseMap<unsigned, V2SCopyInfo> Copies;
  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;

  // The main function that computes the VGPR to SGPR copy score and determines
  // how the copy is further lowered: v_readfirstlane_b32 or moveToVALU.
  auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool {
    if (I->SChain.empty())
      return true;
    I->Siblings = SiblingPenalty[*std::max_element(
        I->SChain.begin(), I->SChain.end(),
        [&](MachineInstr *A, MachineInstr *B) -> bool {
          return SiblingPenalty[A].size() < SiblingPenalty[B].size();
        })];
    I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
    // The loop below computes the number of other VGPR to SGPR copies which
    // contribute to the current copy's SALU chain. We assume that all the
    // copies with the same source virtual register will be squashed to one by
    // regalloc. We also take care of the copies of the different subregs of
    // the same register.
    SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
    for (auto J : I->Siblings) {
      auto InfoIt = Copies.find(J);
      if (InfoIt != Copies.end()) {
        MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
        if (SiblingCopy->isImplicitDef())
          // The COPY has already been moved to VALU.
          continue;

        SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(),
                                      SiblingCopy->getOperand(1).getSubReg()));
      }
    }
    I->SiblingPenalty = SrcRegs.size();

    unsigned Penalty =
        I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes;
    unsigned Profit = I->SChain.size();
    I->Score = Penalty > Profit ? 0 : Profit - Penalty;
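    // Worked example with made-up numbers: a copy whose SALU chain has 5
    // instructions (Profit = 5), needs one readfirstlane (NumReadfirstlanes =
    // 1), one copy back to VALU (NumSVCopies = 1) and has no siblings gives
    // Penalty = 2 and Score = 3, so it stays scalar; a score below 3 sends the
    // copy to moveToVALU instead.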
    I->NeedToBeConvertedToVALU = I->Score < 3;
    return I->NeedToBeConvertedToVALU;
  };

  auto needProcessing = [](MachineInstr &MI) -> bool {
    switch (MI.getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::WQM:
    case AMDGPU::STRICT_WQM:
    case AMDGPU::SOFT_WQM:
    case AMDGPU::STRICT_WWM:
      return true;
    default:
      return false;
    }
  };

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock *MBB = &*BI;
    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
         ++I) {
      MachineInstr &MI = *I;
      if (!needProcessing(MI))
        continue;
      if (lowerSpecialCase(MI))
        continue;

      // Compute the COPY width to pass it to the V2SCopyInfo constructor.
      Register DstReg = MI.getOperand(0).getReg();

      const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg);

      V2SCopyInfo In(getNextVGPRToSGPRCopyId(), &MI,
                     TRI->getRegSizeInBits(*DstRC));

      SmallVector<MachineInstr *, 8> AnalysisWorklist;
      // Needed because the SSA is not a tree but a graph and may have forks
      // and joins. We should not walk the same way twice.
      DenseSet<MachineInstr *> Visited;
      AnalysisWorklist.push_back(&MI);
      while (!AnalysisWorklist.empty()) {

        MachineInstr *Inst = AnalysisWorklist.pop_back_val();

        if (!Visited.insert(Inst).second)
          continue;

        // Copies and REG_SEQUENCE do not contribute to the final assembly, so
        // skip them but take care of the SGPR to VGPR copy bookkeeping.
        if (Inst->isCopy() || Inst->isRegSequence()) {
          if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
            if (!Inst->isCopy() ||
                !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
              In.NumSVCopies++;
              continue;
            }
          }
        }

        SiblingPenalty[Inst].insert(In.ID);

        SmallVector<MachineInstr *, 4> Users;
        if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
            (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
          auto I = Inst->getIterator();
          auto E = Inst->getParent()->end();
          while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) {
            if (I->readsRegister(AMDGPU::SCC))
              Users.push_back(&*I);
          }
        } else if (Inst->getNumExplicitDefs() != 0) {
          Register Reg = Inst->getOperand(0).getReg();
          if (TRI->isSGPRReg(*MRI, Reg))
            for (auto &U : MRI->use_instructions(Reg))
              Users.push_back(&U);
        }
        for (auto U : Users) {
          if (TII->isSALU(*U))
            In.SChain.insert(U);
          AnalysisWorklist.push_back(U);
        }
      }
      Copies[In.ID] = In;
    }
  }

  SmallVector<unsigned, 8> LoweringWorklist;
  for (auto &C : Copies) {
    if (needToBeConvertedToVALU(&C.second))
      LoweringWorklist.push_back(C.second.ID);
  }

  while (!LoweringWorklist.empty()) {
    unsigned CurID = LoweringWorklist.pop_back_val();
    auto CurInfoIt = Copies.find(CurID);
    if (CurInfoIt != Copies.end()) {
      V2SCopyInfo C = CurInfoIt->getSecond();
      LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
      for (auto S : C.Siblings) {
        auto SibInfoIt = Copies.find(S);
        if (SibInfoIt != Copies.end()) {
          V2SCopyInfo &SI = SibInfoIt->getSecond();
          LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
          if (!SI.NeedToBeConvertedToVALU) {
            set_subtract(SI.SChain, C.SChain);
            if (needToBeConvertedToVALU(&SI))
              LoweringWorklist.push_back(SI.ID);
          }
          SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; });
        }
      }
      LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
                        << " is being turned to VALU\n");
      Copies.erase(C.ID);
      TII->moveToVALU(*C.Copy, MDT);
    }
  }

  // Now do the actual lowering.
  for (auto C : Copies) {
    MachineInstr *MI = C.second.Copy;
    MachineBasicBlock *MBB = MI->getParent();
    // We decided to turn the V2S copy into v_readfirstlane_b32; remove it
    // from the V2SCopies and from all of its siblings.
    LLVM_DEBUG(dbgs() << "V2S copy " << *MI
                      << " is being turned to v_readfirstlane_b32"
                      << " Score: " << C.second.Score << "\n");
    Register DstReg = MI->getOperand(0).getReg();
    Register SrcReg = MI->getOperand(1).getReg();
    unsigned SubReg = MI->getOperand(1).getSubReg();
    const TargetRegisterClass *SrcRC = TRI->getRegClassForReg(*MRI, SrcReg);
    SrcRC = TRI->getSubRegClass(SrcRC, SubReg);
    size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
    if (SrcSize == 16) {
      // HACK to handle a possible 16-bit VGPR source.
      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
      MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
    } else if (SrcSize == 32) {
      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
      MIB.addReg(SrcReg, 0, SubReg);
    } else {
      auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
                            TII->get(AMDGPU::REG_SEQUENCE), DstReg);
      int N = TRI->getRegSizeInBits(*SrcRC) / 32;
      for (int i = 0; i < N; i++) {
        Register PartialSrc = TII->buildExtractSubReg(
            Result, *MRI, MI->getOperand(1), SrcRC,
            TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
        Register PartialDst =
            MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*MBB, *Result, Result->getDebugLoc(),
                TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
            .addReg(PartialSrc);
        Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
      }
    }
    MI->eraseFromParent();
  }
}