//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static unsigned createLaneMaskReg(MachineFunction &MF);
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);

namespace {

class SILowerI1Copies : public MachineFunctionPass {
public:
  static char ID;

private:
  bool IsWave32 = false;
  MachineFunction *MF = nullptr;
  MachineDominatorTree *DT = nullptr;
  MachinePostDominatorTree *PDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;

  unsigned ExecReg;
  unsigned MovOp;
  unsigned AndOp;
  unsigned OrOp;
  unsigned XorOp;
  unsigned AndN2Op;
  unsigned OrN2Op;

  DenseSet<unsigned> ConstrainRegs;

public:
  SILowerI1Copies() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  void lowerCopiesFromI1();
  void lowerPhis();
  void lowerCopiesToI1();
  bool isConstantLaneMask(unsigned Reg, bool &Val) const;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
  MachineBasicBlock::iterator
  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;

  bool isVreg1(unsigned Reg) const {
    return Register::isVirtualRegister(Reg) &&
           MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }

  bool isLaneMaskReg(unsigned Reg) const {
    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
               ST->getWavefrontSize();
  }
};

/// Helper class that examines the control flow graph around the incoming
/// blocks of a phi to determine where an incoming value can simply be taken
/// as a scalar lane mask as-is, and where it needs to be merged with another,
/// previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock,
               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (MachineBasicBlock *MBB : IncomingBlocks) {
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
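      //
      // Example: suppose incoming block B ends in a divergent branch whose
      // successors are C and the def block D:
      //
      //     B --> C --> D (= DefBlock)
      //      \_________/
      //
      // C is reachable before the def block, so the value incoming from C
      // cannot be taken as-is: lanes that branched from B straight to D never
      // executed C and must keep their bits via a merge. B itself has no
      // reachable predecessor here and remains a source.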
      bool Divergent = false;
      for (MachineInstr &MI : MBB->terminators()) {
        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
            MI.getOpcode() == AMDGPU::SI_IF ||
            MI.getOpcode() == AMDGPU::SI_ELSE ||
            MI.getOpcode() == AMDGPU::SI_LOOP) {
          Divergent = true;
          break;
        }
      }

      if (Divergent && PDT.dominates(&DefBlock, MBB)) {
        for (MachineBasicBlock *Succ : MBB->successors())
          Stack.push_back(Succ);
      }
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      for (MachineBasicBlock *Succ : MBB->successors())
        Stack.push_back(Succ);
    }

    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        ReachableMap[MBB] = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
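  // advanceLevel() moves it one step up the post-dominator tree per level.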
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (MachineBasicBlock *MBB : Blocks)
      Dom = DT.findNearestCommonDominator(Dom, MBB);

    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
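      // Only predecessors outside the loop level (and outside the given
      // blocks) are seeded with an undef value; predecessors inside the
      // region are left to the values produced by the merge logic.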
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<MachineBasicBlock *> Blocks) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    if (llvm::find(Blocks, &MBB) != Blocks.end())
      return true;

    return false;
  }

  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                    false)

char SILowerI1Copies::ID = 0;

char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

FunctionPass *llvm::createSILowerI1CopiesPass() {
  return new SILowerI1Copies();
}

static unsigned createLaneMaskReg(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
                                                 : &AMDGPU::SReg_64RegClass);
}

static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned UndefReg = createLaneMaskReg(MF);
  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
  MF = &TheMF;
  MRI = &MF->getRegInfo();
  DT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }

  lowerCopiesFromI1();
  lowerPhis();
  lowerCopiesToI1();

  for (unsigned Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return true;
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

void SILowerI1Copies::lowerCopiesFromI1() {
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

void SILowerI1Copies::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT);
  SmallVector<MachineInstr *, 4> Vreg1Phis;
  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
  SmallVector<unsigned, 4> IncomingRegs;
  SmallVector<unsigned, 4> IncomingUpdated;
#ifndef NDEBUG
  DenseSet<unsigned> PhiRegisters;
#endif

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      if (isVreg1(MI.getOperand(0).getReg()))
        Vreg1Phis.push_back(&MI);
    }
  }

  MachineBasicBlock *PrevMBB = nullptr;
  for (MachineInstr *MI : Vreg1Phis) {
    MachineBasicBlock &MBB = *MI->getParent();
    if (&MBB != PrevMBB) {
      LF.initialize(MBB);
      PrevMBB = &MBB;
    }

    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

    Register DstReg = MI->getOperand(0).getReg();
    MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                      : &AMDGPU::SReg_64RegClass);

    // Collect incoming values.
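    // Machine PHI operands come in (value, predecessor block) pairs starting
    // at operand 1; operand 0 is the result.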
    for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
      assert(i + 1 < MI->getNumOperands());
      Register IncomingReg = MI->getOperand(i).getReg();
      MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
      MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

      if (IncomingDef->getOpcode() == AMDGPU::COPY) {
        IncomingReg = IncomingDef->getOperand(1).getReg();
        assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
        assert(!IncomingDef->getOperand(1).getSubReg());
      } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
        continue;
      } else {
        assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
      }

      IncomingBlocks.push_back(IncomingMBB);
      IncomingRegs.push_back(IncomingReg);
    }

#ifndef NDEBUG
    PhiRegisters.insert(DstReg);
#endif

    // Phis in a loop that are observed outside the loop receive a simple but
    // conservatively correct treatment.
    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
    for (MachineInstr &Use : MRI->use_instructions(DstReg))
      DomBlocks.push_back(Use.getParent());

    MachineBasicBlock *PostDomBound =
        PDT->findNearestCommonDominator(DomBlocks);
    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

    SSAUpdater.Initialize(DstReg);

    if (FoundLoopLevel) {
      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        IncomingUpdated.push_back(createLaneMaskReg(*MF));
        SSAUpdater.AddAvailableValue(IncomingBlocks[i],
                                     IncomingUpdated.back());
      }

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
      }
    } else {
      // The phi is not observed from outside a loop. Use a more accurate
      // lowering.
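      //
      // Incoming blocks that PIA identifies as sources can feed their lane
      // mask to the SSAUpdater unchanged; every other incoming value is
      // merged with the lane mask that reaches its block.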
      PIA.analyze(MBB, IncomingBlocks);

      for (MachineBasicBlock *MBB : PIA.predecessors())
        SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        if (PIA.isSource(IMBB)) {
          IncomingUpdated.push_back(0);
          SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
        } else {
          IncomingUpdated.push_back(createLaneMaskReg(*MF));
          SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
        }
      }

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        if (!IncomingUpdated[i])
          continue;

        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
      }
    }

    unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
    if (NewReg != DstReg) {
      MRI->replaceRegWith(NewReg, DstReg);
      MI->eraseFromParent();
    }

    IncomingBlocks.clear();
    IncomingRegs.clear();
    IncomingUpdated.clear();
  }
}

void SILowerI1Copies::lowerCopiesToI1() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                        : &AMDGPU::SReg_64RegClass);
      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!Register::isVirtualRegister(SrcReg) ||
          (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        unsigned TmpReg = createLaneMaskReg(*MF);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
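      // The merge emitted below (see buildMergeLaneMasks) combines the value
      // from the previous iteration with the newly computed one under the
      // current EXEC mask, so lanes that already left the loop keep their
      // bits.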
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Register::isVirtualRegister(Reg))
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
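///
/// If no terminator reads SCC, the first terminator is a safe insertion
/// point. Otherwise the returned point is moved back to just before the
/// instruction that defines the SCC value consumed by the terminator, since
/// the lane-mask SALU instructions inserted there may clobber SCC themselves.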
MachineBasicBlock::iterator
SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}

void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          const DebugLoc &DL, unsigned DstReg,
                                          unsigned PrevReg, unsigned CurReg) {
  bool PrevVal;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  unsigned PrevMaskedReg = 0;
  unsigned CurMaskedReg = 0;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}
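
// Example (wave64, both inputs non-constant): buildMergeLaneMasks above
// expands a merge of the running mask %prev with the newly computed mask
// %cur into roughly the following sequence (register names are illustrative):
//
//   %prev_masked = S_ANDN2_B64 %prev, $exec   ; keep bits of inactive lanes
//   %cur_masked  = S_AND_B64   %cur,  $exec   ; take bits of active lanes
//   %dst         = S_OR_B64    %prev_masked, %cur_masked
//
// The constant-input special cases simply fold one or both masking steps away.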