//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static unsigned createLaneMaskReg(MachineFunction &MF);
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);

namespace {

class SILowerI1Copies : public MachineFunctionPass {
public:
  static char ID;

private:
  bool IsWave32 = false;
  MachineFunction *MF = nullptr;
  MachineDominatorTree *DT = nullptr;
  MachinePostDominatorTree *PDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;

  unsigned ExecReg;
  unsigned MovOp;
  unsigned AndOp;
  unsigned OrOp;
  unsigned XorOp;
  unsigned AndN2Op;
  unsigned OrN2Op;

  DenseSet<unsigned> ConstrainRegs;

public:
  SILowerI1Copies() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  void lowerCopiesFromI1();
  void lowerPhis();
  void lowerCopiesToI1();
  bool isConstantLaneMask(unsigned Reg, bool &Val) const;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
  MachineBasicBlock::iterator
  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;

  bool isVreg1(unsigned Reg) const {
    return Register::isVirtualRegister(Reg) &&
           MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }

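  // A lane mask lives in a wave-sized SGPR: 32 bits for wave32 and 64 bits
  // for wave64, one bit per lane.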
  bool isLaneMaskReg(unsigned Reg) const {
    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
               ST->getWavefrontSize();
  }
};

/// Helper class that analyzes the relationship between the incoming values of
/// a phi in the control flow graph, to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

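  /// Compute which blocks a wave may reach from the incoming blocks before
  /// entering \p DefBlock, mark the incoming blocks that are sources of this
  /// induced subgraph, and collect the outside predecessors that will need an
  /// invented available value for the SSAUpdater.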
  void analyze(MachineBasicBlock &DefBlock,
               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (MachineBasicBlock *MBB : IncomingBlocks) {
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      bool Divergent = false;
      for (MachineInstr &MI : MBB->terminators()) {
        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
            MI.getOpcode() == AMDGPU::SI_IF ||
            MI.getOpcode() == AMDGPU::SI_ELSE ||
            MI.getOpcode() == AMDGPU::SI_LOOP) {
          Divergent = true;
          break;
        }
      }

      if (Divergent && PDT.dominates(&DefBlock, MBB)) {
        for (MachineBasicBlock *Succ : MBB->successors())
          Stack.push_back(Succ);
      }
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      for (MachineBasicBlock *Succ : MBB->successors())
        Stack.push_back(Succ);
    }

    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        ReachableMap[MBB] = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
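/// Within this pass, findLoop() is queried with the nearest common
/// post-dominator of the def block and all blocks containing uses of the def
/// (see lowerPhis() and lowerCopiesToI1()); a non-zero loop level means the
/// value must be merged across iterations via buildMergeLaneMasks().
///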
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (MachineBasicBlock *MBB : Blocks)
      Dom = DT.findNearestCommonDominator(Dom, MBB);

    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<MachineBasicBlock *> Blocks) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    if (llvm::find(Blocks, &MBB) != Blocks.end())
      return true;

    return false;
  }

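  // Expand the traversal by one level: advance VisitedPostDom (starting at the
  // def block) up the post-dominator tree, tag newly visited blocks with the
  // current level, record backward edges to the def block in FoundLoopLevel,
  // and append the level's nearest common dominator to CommonDominators.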
  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                    false)

char SILowerI1Copies::ID = 0;

char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

FunctionPass *llvm::createSILowerI1CopiesPass() {
  return new SILowerI1Copies();
}

static unsigned createLaneMaskReg(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
                                                 : &AMDGPU::SReg_64RegClass);
}

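// Insert an IMPLICIT_DEF of a fresh lane-mask register before the first
// terminator of \p MBB and return the new register. Used to give the
// SSAUpdater a dominating "undef" value.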
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned UndefReg = createLaneMaskReg(MF);
  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
  MF = &TheMF;
  MRI = &MF->getRegInfo();
  DT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }

  lowerCopiesFromI1();
  lowerPhis();
  lowerCopiesToI1();

  for (unsigned Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return true;
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

void SILowerI1Copies::lowerCopiesFromI1() {
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

void SILowerI1Copies::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT);
  SmallVector<MachineInstr *, 4> DeadPhis;
  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
  SmallVector<unsigned, 4> IncomingRegs;
  SmallVector<unsigned, 4> IncomingUpdated;
#ifndef NDEBUG
  DenseSet<unsigned> PhiRegisters;
#endif

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB.phis()) {
      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);

      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                        : &AMDGPU::SReg_64RegClass);

      // Collect incoming values.
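      // PHI operands come in (value, predecessor block) pairs starting at
      // operand 1; operand 0 is the result.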
      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
        assert(i + 1 < MI.getNumOperands());
        Register IncomingReg = MI.getOperand(i).getReg();
        MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
        MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

        if (IncomingDef->getOpcode() == AMDGPU::COPY) {
          IncomingReg = IncomingDef->getOperand(1).getReg();
          assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
          assert(!IncomingDef->getOperand(1).getSubReg());
        } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
          continue;
        } else {
          assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
        }

        IncomingBlocks.push_back(IncomingMBB);
        IncomingRegs.push_back(IncomingReg);
      }

#ifndef NDEBUG
      PhiRegisters.insert(DstReg);
#endif

      // Phis in a loop that are observed outside the loop receive a simple but
      // conservatively correct treatment.
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

      SSAUpdater.Initialize(DstReg);

      if (FoundLoopLevel) {
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          IncomingUpdated.push_back(createLaneMaskReg(*MF));
          SSAUpdater.AddAvailableValue(IncomingBlocks[i],
                                       IncomingUpdated.back());
        }

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          buildMergeLaneMasks(
              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
        }
      } else {
        // The phi is not observed from outside a loop. Use a more accurate
        // lowering.
        PIA.analyze(MBB, IncomingBlocks);

        for (MachineBasicBlock *MBB : PIA.predecessors())
          SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          if (PIA.isSource(IMBB)) {
            IncomingUpdated.push_back(0);
            SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
          } else {
            IncomingUpdated.push_back(createLaneMaskReg(*MF));
            SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
          }
        }

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          if (!IncomingUpdated[i])
            continue;

          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          buildMergeLaneMasks(
              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
        }
      }

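      // Query the SSAUpdater for the merged lane-mask value at the phi; this
      // may be one of the incoming values directly, or a lane-mask phi that
      // the updater inserted.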
      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
      if (NewReg != DstReg) {
        MRI->replaceRegWith(NewReg, DstReg);

        // Ensure that DstReg has a single def and mark the old PHI node for
        // deletion.
        MI.getOperand(0).setReg(NewReg);
        DeadPhis.push_back(&MI);
      }

      IncomingBlocks.clear();
      IncomingRegs.clear();
      IncomingUpdated.clear();
    }

    for (MachineInstr *MI : DeadPhis)
      MI->eraseFromParent();
    DeadPhis.clear();
  }
}

void SILowerI1Copies::lowerCopiesToI1() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                        : &AMDGPU::SReg_64RegClass);
      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!Register::isVirtualRegister(SrcReg) ||
          (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        unsigned TmpReg = createLaneMaskReg(*MF);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Register::isVirtualRegister(Reg))
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}

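// Build DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC): lanes active in EXEC
// take their bit from CurReg, inactive lanes keep the bit from PrevReg. In
// the general (non-constant) case this emits, e.g. for wave64:
//
//   %prev_masked = S_ANDN2_B64 %prev, $exec
//   %cur_masked  = S_AND_B64 %cur, $exec
//   %dst         = S_OR_B64 %prev_masked, %cur_masked
//
// Constant inputs (all-zeros / all-ones lane masks) are folded into simpler
// sequences using COPY, S_XOR, or S_ORN2.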
void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          const DebugLoc &DL, unsigned DstReg,
                                          unsigned PrevReg, unsigned CurReg) {
  bool PrevVal;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  unsigned PrevMaskedReg = 0;
  unsigned CurMaskedReg = 0;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}