//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static unsigned createLaneMaskReg(MachineFunction &MF);
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);

namespace {

class SILowerI1Copies : public MachineFunctionPass {
public:
  static char ID;

private:
  bool IsWave32 = false;
  MachineFunction *MF = nullptr;
  MachineDominatorTree *DT = nullptr;
  MachinePostDominatorTree *PDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;

  unsigned ExecReg;
  unsigned MovOp;
  unsigned AndOp;
  unsigned OrOp;
  unsigned XorOp;
  unsigned AndN2Op;
  unsigned OrN2Op;

  DenseSet<unsigned> ConstrainRegs;

public:
  SILowerI1Copies() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  bool lowerCopiesFromI1();
  bool lowerPhis();
  bool lowerCopiesToI1();
  bool isConstantLaneMask(Register Reg, bool &Val) const;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
  MachineBasicBlock::iterator
  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;

  bool isVreg1(Register Reg) const {
    return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }

  bool isLaneMaskReg(unsigned Reg) const {
    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
               ST->getWavefrontSize();
  }
};

/// Helper class that analyzes the relationship between the incoming values of
/// a phi in the control flow graph to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock,
               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (MachineBasicBlock *MBB : IncomingBlocks) {
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      bool Divergent = false;
      for (MachineInstr &MI : MBB->terminators()) {
        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
            MI.getOpcode() == AMDGPU::SI_IF ||
            MI.getOpcode() == AMDGPU::SI_ELSE ||
            MI.getOpcode() == AMDGPU::SI_LOOP) {
          Divergent = true;
          break;
        }
      }

      if (Divergent && PDT.dominates(&DefBlock, MBB))
        append_range(Stack, MBB->successors());
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      append_range(Stack, MBB->successors());
    }
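
    // Classify each reachable block: a block with no predecessor inside the
    // reachable set is a source of the induced subgraph, so an incoming value
    // defined there can be taken as-is. Predecessors outside the set are
    // collected in Predecessors so the caller can seed the SSAUpdater with
    // undef lane masks there.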
    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        ReachableMap[MBB] = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (!llvm::is_contained(Predecessors, UnreachablePred))
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (MachineBasicBlock *MBB : Blocks)
      Dom = DT.findNearestCommonDominator(Dom, MBB);

    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<MachineBasicBlock *> Blocks) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    if (llvm::is_contained(Blocks, &MBB))
      return true;

    return false;
  }
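
  // Expand the visited region by one level: advance VisitedPostDom along the
  // post-dominator chain of the def block, walk the newly covered blocks,
  // record the lowest level at which a backward edge into the def block is
  // seen, and remember the nearest common dominator of each level for use as
  // an SSAUpdater seed.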
  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                    false)

char SILowerI1Copies::ID = 0;

char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

FunctionPass *llvm::createSILowerI1CopiesPass() {
  return new SILowerI1Copies();
}

static unsigned createLaneMaskReg(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
                                                 : &AMDGPU::SReg_64RegClass);
}

static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned UndefReg = createLaneMaskReg(MF);
  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
  // Only need to run this in the SelectionDAG path.
  if (TheMF.getProperties().hasProperty(
          MachineFunctionProperties::Property::Selected))
    return false;

  MF = &TheMF;
  MRI = &MF->getRegInfo();
  DT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();
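
  // Lane masks (and EXEC) are 32 bits wide in wave32 and 64 bits wide in
  // wave64, so select the matching exec register and scalar bitwise opcodes.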
  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }

  bool Changed = false;
  Changed |= lowerCopiesFromI1();
  Changed |= lowerPhis();
  Changed |= lowerCopiesToI1();

  assert(Changed || ConstrainRegs.empty());
  for (unsigned Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return Changed;
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

bool SILowerI1Copies::lowerCopiesFromI1() {
  bool Changed = false;
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      Changed = true;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
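      // Per-lane select: write -1 to every lane whose bit is set in SrcReg and
      // 0 to all others, materializing the i1 value in the vector register.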
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

bool SILowerI1Copies::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT);
  SmallVector<MachineInstr *, 4> Vreg1Phis;
  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
  SmallVector<unsigned, 4> IncomingRegs;
  SmallVector<unsigned, 4> IncomingUpdated;
#ifndef NDEBUG
  DenseSet<unsigned> PhiRegisters;
#endif

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      if (isVreg1(MI.getOperand(0).getReg()))
        Vreg1Phis.push_back(&MI);
    }
  }
  if (Vreg1Phis.empty())
    return false;

  MachineBasicBlock *PrevMBB = nullptr;
  for (MachineInstr *MI : Vreg1Phis) {
    MachineBasicBlock &MBB = *MI->getParent();
    if (&MBB != PrevMBB) {
      LF.initialize(MBB);
      PrevMBB = &MBB;
    }

    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

    Register DstReg = MI->getOperand(0).getReg();
    MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                      : &AMDGPU::SReg_64RegClass);

    // Collect incoming values.
    for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
      assert(i + 1 < MI->getNumOperands());
      Register IncomingReg = MI->getOperand(i).getReg();
      MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
      MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

      if (IncomingDef->getOpcode() == AMDGPU::COPY) {
        IncomingReg = IncomingDef->getOperand(1).getReg();
        assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
        assert(!IncomingDef->getOperand(1).getSubReg());
      } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
        continue;
      } else {
        assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
      }

      IncomingBlocks.push_back(IncomingMBB);
      IncomingRegs.push_back(IncomingReg);
    }

#ifndef NDEBUG
    PhiRegisters.insert(DstReg);
#endif

    // Phis in a loop that are observed outside the loop receive a simple but
    // conservatively correct treatment.
    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
    for (MachineInstr &Use : MRI->use_instructions(DstReg))
      DomBlocks.push_back(Use.getParent());

    MachineBasicBlock *PostDomBound =
        PDT->findNearestCommonDominator(DomBlocks);

    // FIXME: This fails to find irreducible cycles. If we have a def (other
    // than a constant) in a pair of blocks that end up looping back to each
    // other, it will be mishandled. Due to structurization this shouldn't
    // occur in practice.
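
    // If a backward edge into this block is reachable without passing the
    // common post-dominator of all uses, the phi's value can be observed from
    // a later loop iteration, so each incoming value must be merged into a
    // running lane mask in its incoming block.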
    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

    SSAUpdater.Initialize(DstReg);

    if (FoundLoopLevel) {
      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        IncomingUpdated.push_back(createLaneMaskReg(*MF));
        SSAUpdater.AddAvailableValue(IncomingBlocks[i],
                                     IncomingUpdated.back());
      }

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
      }
    } else {
      // The phi is not observed from outside a loop. Use a more accurate
      // lowering.
      PIA.analyze(MBB, IncomingBlocks);

      for (MachineBasicBlock *MBB : PIA.predecessors())
        SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        if (PIA.isSource(IMBB)) {
          IncomingUpdated.push_back(0);
          SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
        } else {
          IncomingUpdated.push_back(createLaneMaskReg(*MF));
          SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
        }
      }

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        if (!IncomingUpdated[i])
          continue;

        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
      }
    }

    Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
    if (NewReg != DstReg) {
      MRI->replaceRegWith(NewReg, DstReg);
      MI->eraseFromParent();
    }

    IncomingBlocks.clear();
    IncomingRegs.clear();
    IncomingUpdated.clear();
  }
  return true;
}

bool SILowerI1Copies::lowerCopiesToI1() {
  bool Changed = false;
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      Changed = true;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                        : &AMDGPU::SReg_64RegClass);
      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());
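
      // If the source is not already a lane mask, convert the 32-bit value by
      // comparing it against zero: the result has one bit per active lane, set
      // when that lane's value is nonzero.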
      if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        unsigned TmpReg = createLaneMaskReg(*MF);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
      return true;

    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Reg.isVirtual())
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}
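
// Merge the lane mask CurReg into PrevReg for the currently active lanes;
// conceptually DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC). The special
// cases below skip the masking when either input is a known constant mask.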
void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          const DebugLoc &DL, unsigned DstReg,
                                          unsigned PrevReg, unsigned CurReg) {
  bool PrevVal = false;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal = false;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  unsigned PrevMaskedReg = 0;
  unsigned CurMaskedReg = 0;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }
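
  // Combine the (possibly masked) previous and current values; constant
  // inputs collapse the OR into a plain COPY or an OR-NOT with EXEC.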
  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}