//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
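// Where a lowered value has to be combined with a lane mask defined on another
// path or on a previous loop iteration, the pass emits a bitwise merge that
// conceptually computes
//
//   DstMask = (PrevMask & ~EXEC) | (CurMask & EXEC)
//
// using S_ANDN2 / S_AND / S_OR (the _B32 forms in wave32 mode, the _B64 forms
// in wave64 mode); see buildMergeLaneMasks().
//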
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static unsigned createLaneMaskReg(MachineFunction &MF);
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);

namespace {

class SILowerI1Copies : public MachineFunctionPass {
public:
  static char ID;

private:
  bool IsWave32 = false;
  MachineFunction *MF = nullptr;
  MachineDominatorTree *DT = nullptr;
  MachinePostDominatorTree *PDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;

  unsigned ExecReg;
  unsigned MovOp;
  unsigned AndOp;
  unsigned OrOp;
  unsigned XorOp;
  unsigned AndN2Op;
  unsigned OrN2Op;

  DenseSet<unsigned> ConstrainRegs;

public:
  SILowerI1Copies() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  void lowerCopiesFromI1();
  void lowerPhis();
  void lowerCopiesToI1();
  bool isConstantLaneMask(unsigned Reg, bool &Val) const;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
  MachineBasicBlock::iterator
  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;

  bool isVreg1(unsigned Reg) const {
    return Register::isVirtualRegister(Reg) &&
           MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }

  bool isLaneMaskReg(unsigned Reg) const {
    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
               ST->getWavefrontSize();
  }
};

/// Helper class that analyzes the relationship between the incoming values of
/// a phi in the control flow graph to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

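  /// Compute the set of blocks a wave may reach from \p IncomingBlocks before
  /// entering \p DefBlock, mark every reachable block without a reachable
  /// predecessor as a source, and record the remaining predecessors from
  /// outside the set for the caller (see predecessors()).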
  void analyze(MachineBasicBlock &DefBlock,
               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (MachineBasicBlock *MBB : IncomingBlocks) {
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      bool Divergent = false;
      for (MachineInstr &MI : MBB->terminators()) {
        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
            MI.getOpcode() == AMDGPU::SI_IF ||
            MI.getOpcode() == AMDGPU::SI_ELSE ||
            MI.getOpcode() == AMDGPU::SI_LOOP) {
          Divergent = true;
          break;
        }
      }

      if (Divergent && PDT.dominates(&DefBlock, MBB)) {
        for (MachineBasicBlock *Succ : MBB->successors())
          Stack.push_back(Succ);
      }
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      for (MachineBasicBlock *Succ : MBB->successors())
        Stack.push_back(Succ);
    }

    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        ReachableMap[MBB] = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

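  /// Reset all cached traversal state and make \p MBB the def block for
  /// subsequent findLoop() queries.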
  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (MachineBasicBlock *MBB : Blocks)
      Dom = DT.findNearestCommonDominator(Dom, MBB);

    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<MachineBasicBlock *> Blocks) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    if (llvm::find(Blocks, &MBB) != Blocks.end())
      return true;

    return false;
  }

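  /// Advance the traversal by one level: move the visited post-dominator one
  /// step up the post-dominator tree, visit the blocks that are now covered by
  /// it, tag them with the new level, update the nearest common dominator of
  /// everything visited so far, and record the level at which a backward edge
  /// to the def block is first found.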
  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                    false)

char SILowerI1Copies::ID = 0;

char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

FunctionPass *llvm::createSILowerI1CopiesPass() {
  return new SILowerI1Copies();
}

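// Create a fresh virtual register of the lane mask register class: SReg_32 on
// wave32 subtargets, SReg_64 on wave64 subtargets.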
static unsigned createLaneMaskReg(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
                                                 : &AMDGPU::SReg_64RegClass);
}

static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned UndefReg = createLaneMaskReg(MF);
  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
  // Only need to run this in the SelectionDAG path.
  if (TheMF.getProperties().hasProperty(
          MachineFunctionProperties::Property::Selected))
    return false;

  MF = &TheMF;
  MRI = &MF->getRegInfo();
  DT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }

  lowerCopiesFromI1();
  lowerPhis();
  lowerCopiesToI1();

  for (unsigned Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return true;
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

void SILowerI1Copies::lowerCopiesFromI1() {
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

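/// Lower all PHIs of vreg_1 registers: give each result the lane mask register
/// class and rewrite the incoming values. Phis that are observed from outside
/// a loop get a simple, conservatively correct lowering that merges every
/// incoming value with the mask of the previous iteration; otherwise
/// PhiIncomingAnalysis decides which incoming values can be used as-is and
/// which need to be merged.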
void SILowerI1Copies::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT);
  SmallVector<MachineInstr *, 4> Vreg1Phis;
  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
  SmallVector<unsigned, 4> IncomingRegs;
  SmallVector<unsigned, 4> IncomingUpdated;
#ifndef NDEBUG
  DenseSet<unsigned> PhiRegisters;
#endif

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      if (isVreg1(MI.getOperand(0).getReg()))
        Vreg1Phis.push_back(&MI);
    }
  }

  MachineBasicBlock *PrevMBB = nullptr;
  for (MachineInstr *MI : Vreg1Phis) {
    MachineBasicBlock &MBB = *MI->getParent();
    if (&MBB != PrevMBB) {
      LF.initialize(MBB);
      PrevMBB = &MBB;
    }

    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

    Register DstReg = MI->getOperand(0).getReg();
    MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                      : &AMDGPU::SReg_64RegClass);

    // Collect incoming values.
    for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
      assert(i + 1 < MI->getNumOperands());
      Register IncomingReg = MI->getOperand(i).getReg();
      MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
      MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

      if (IncomingDef->getOpcode() == AMDGPU::COPY) {
        IncomingReg = IncomingDef->getOperand(1).getReg();
        assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
        assert(!IncomingDef->getOperand(1).getSubReg());
      } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
        continue;
      } else {
        assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
      }

      IncomingBlocks.push_back(IncomingMBB);
      IncomingRegs.push_back(IncomingReg);
    }

#ifndef NDEBUG
    PhiRegisters.insert(DstReg);
#endif

    // Phis in a loop that are observed outside the loop receive a simple but
    // conservatively correct treatment.
    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
    for (MachineInstr &Use : MRI->use_instructions(DstReg))
      DomBlocks.push_back(Use.getParent());

    MachineBasicBlock *PostDomBound =
        PDT->findNearestCommonDominator(DomBlocks);
    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

    SSAUpdater.Initialize(DstReg);

    if (FoundLoopLevel) {
      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        IncomingUpdated.push_back(createLaneMaskReg(*MF));
        SSAUpdater.AddAvailableValue(IncomingBlocks[i],
                                     IncomingUpdated.back());
      }

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
      }
    } else {
      // The phi is not observed from outside a loop. Use a more accurate
      // lowering.
      PIA.analyze(MBB, IncomingBlocks);

      for (MachineBasicBlock *MBB : PIA.predecessors())
        SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        if (PIA.isSource(IMBB)) {
          IncomingUpdated.push_back(0);
          SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
        } else {
          IncomingUpdated.push_back(createLaneMaskReg(*MF));
          SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
        }
      }

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        if (!IncomingUpdated[i])
          continue;

        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
      }
    }

    unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
    if (NewReg != DstReg) {
      MRI->replaceRegWith(NewReg, DstReg);
      MI->eraseFromParent();
    }

    IncomingBlocks.clear();
    IncomingRegs.clear();
    IncomingUpdated.clear();
  }
}

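/// Lower all COPYs and IMPLICIT_DEFs that define a vreg_1 register: switch the
/// destination to the lane mask register class, turn 32-bit non-lane-mask
/// sources into lane masks with V_CMP_NE_U32, and merge defs that are observed
/// from outside a loop with the mask of previous iterations.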
void SILowerI1Copies::lowerCopiesToI1() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                        : &AMDGPU::SReg_64RegClass);
      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!Register::isVirtualRegister(SrcReg) ||
          (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        unsigned TmpReg = createLaneMaskReg(*MF);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

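/// Look through copies to the instruction that defines \p Reg and check
/// whether it moves a constant 0 or -1 into a lane mask register. If so,
/// return true and set \p Val to the corresponding boolean value.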
bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Register::isVirtualRegister(Reg))
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}

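/// Merge two lane masks at the insertion point \p I: lanes that are currently
/// active in EXEC take their value from \p CurReg, all other lanes keep the
/// value from \p PrevReg, i.e. DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC).
/// Constant all-zero / all-one inputs are folded to avoid redundant masking.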
void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          const DebugLoc &DL, unsigned DstReg,
                                          unsigned PrevReg, unsigned CurReg) {
  bool PrevVal;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  unsigned PrevMaskedReg = 0;
  unsigned CurMaskedReg = 0;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}