//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static unsigned createLaneMaskReg(MachineFunction &MF);
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);

namespace {

class SILowerI1Copies : public MachineFunctionPass {
public:
  static char ID;

private:
  bool IsWave32 = false;
  MachineFunction *MF = nullptr;
  MachineDominatorTree *DT = nullptr;
  MachinePostDominatorTree *PDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;

  unsigned ExecReg;
  unsigned MovOp;
  unsigned AndOp;
  unsigned OrOp;
  unsigned XorOp;
  unsigned AndN2Op;
  unsigned OrN2Op;

  DenseSet<unsigned> ConstrainRegs;

public:
  SILowerI1Copies() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  void lowerCopiesFromI1();
  void lowerPhis();
  void lowerCopiesToI1();
  bool isConstantLaneMask(Register Reg, bool &Val) const;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
  MachineBasicBlock::iterator
  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;

  bool isVreg1(Register Reg) const {
    return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }

  bool isLaneMaskReg(unsigned Reg) const {
    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
               ST->getWavefrontSize();
  }
};

/// Helper class that analyzes the relationship between incoming values of a
/// phi in the control flow graph to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock,
               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (MachineBasicBlock *MBB : IncomingBlocks) {
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
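      // In that case, this block's successors seed the reachability traversal
      // below.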
      bool Divergent = false;
      for (MachineInstr &MI : MBB->terminators()) {
        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
            MI.getOpcode() == AMDGPU::SI_IF ||
            MI.getOpcode() == AMDGPU::SI_ELSE ||
            MI.getOpcode() == AMDGPU::SI_LOOP) {
          Divergent = true;
          break;
        }
      }

      if (Divergent && PDT.dominates(&DefBlock, MBB))
        append_range(Stack, MBB->successors());
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      append_range(Stack, MBB->successors());
    }

    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        ReachableMap[MBB] = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (!llvm::is_contained(Predecessors, UnreachablePred))
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
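  // ~0u means that no backward edge to the def block has been found yet.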
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (MachineBasicBlock *MBB : Blocks)
      Dom = DT.findNearestCommonDominator(Dom, MBB);

    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
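      // Seed every predecessor that lies outside the loop level and outside
      // the given blocks with its own undef lane mask.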
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<MachineBasicBlock *> Blocks) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    if (llvm::is_contained(Blocks, &MBB))
      return true;

    return false;
  }

  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                    false)

char SILowerI1Copies::ID = 0;

char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

FunctionPass *llvm::createSILowerI1CopiesPass() {
  return new SILowerI1Copies();
}

static unsigned createLaneMaskReg(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
                                                 : &AMDGPU::SReg_64RegClass);
}

static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned UndefReg = createLaneMaskReg(MF);
  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers.
/// Phi nodes are lowered before all others, because phi lowering looks through
/// copies and can therefore often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
  // Only need to run this in the SelectionDAG path.
  if (TheMF.getProperties().hasProperty(
          MachineFunctionProperties::Property::Selected))
    return false;

  MF = &TheMF;
  MRI = &MF->getRegInfo();
  DT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }

  lowerCopiesFromI1();
  lowerPhis();
  lowerCopiesToI1();

  for (unsigned Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return true;
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

void SILowerI1Copies::lowerCopiesFromI1() {
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      // Copy into a 32-bit vector register.
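      // V_CNDMASK_B32 writes -1 to the lanes whose bit is set in the mask and
      // 0 to all other lanes. The mask source is recorded in ConstrainRegs so
      // that it can be constrained to SReg_1_XEXEC once lowering is complete.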
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

void SILowerI1Copies::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT);
  SmallVector<MachineInstr *, 4> Vreg1Phis;
  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
  SmallVector<unsigned, 4> IncomingRegs;
  SmallVector<unsigned, 4> IncomingUpdated;
#ifndef NDEBUG
  DenseSet<unsigned> PhiRegisters;
#endif

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      if (isVreg1(MI.getOperand(0).getReg()))
        Vreg1Phis.push_back(&MI);
    }
  }

  MachineBasicBlock *PrevMBB = nullptr;
  for (MachineInstr *MI : Vreg1Phis) {
    MachineBasicBlock &MBB = *MI->getParent();
    if (&MBB != PrevMBB) {
      LF.initialize(MBB);
      PrevMBB = &MBB;
    }

    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

    Register DstReg = MI->getOperand(0).getReg();
    MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                      : &AMDGPU::SReg_64RegClass);

    // Collect incoming values.
    for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
      assert(i + 1 < MI->getNumOperands());
      Register IncomingReg = MI->getOperand(i).getReg();
      MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
      MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

      if (IncomingDef->getOpcode() == AMDGPU::COPY) {
        IncomingReg = IncomingDef->getOperand(1).getReg();
        assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
        assert(!IncomingDef->getOperand(1).getSubReg());
      } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
        continue;
      } else {
        assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
      }

      IncomingBlocks.push_back(IncomingMBB);
      IncomingRegs.push_back(IncomingReg);
    }

#ifndef NDEBUG
    PhiRegisters.insert(DstReg);
#endif

    // Phis in a loop that are observed outside the loop receive a simple but
    // conservatively correct treatment.
    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
    for (MachineInstr &Use : MRI->use_instructions(DstReg))
      DomBlocks.push_back(Use.getParent());

    MachineBasicBlock *PostDomBound =
        PDT->findNearestCommonDominator(DomBlocks);

    // FIXME: This fails to find irreducible cycles. If we have a def (other
    // than a constant) in a pair of blocks that end up looping back to each
    // other, it will be mishandled. Due to structurization this shouldn't
    // occur in practice.
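    // A nonzero level means a backward edge to this block is reachable without
    // passing through PostDomBound, so the phi may be observed outside its
    // loop and the incoming values have to be merged across iterations.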
    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

    SSAUpdater.Initialize(DstReg);

    if (FoundLoopLevel) {
      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        IncomingUpdated.push_back(createLaneMaskReg(*MF));
        SSAUpdater.AddAvailableValue(IncomingBlocks[i],
                                     IncomingUpdated.back());
      }

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
      }
    } else {
      // The phi is not observed from outside a loop. Use a more accurate
      // lowering.
      PIA.analyze(MBB, IncomingBlocks);

      for (MachineBasicBlock *MBB : PIA.predecessors())
        SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        if (PIA.isSource(IMBB)) {
          IncomingUpdated.push_back(0);
          SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
        } else {
          IncomingUpdated.push_back(createLaneMaskReg(*MF));
          SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
        }
      }

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        if (!IncomingUpdated[i])
          continue;

        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
      }
    }

    Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
    if (NewReg != DstReg) {
      MRI->replaceRegWith(NewReg, DstReg);
      MI->eraseFromParent();
    }

    IncomingBlocks.clear();
    IncomingRegs.clear();
    IncomingUpdated.clear();
  }
}

void SILowerI1Copies::lowerCopiesToI1() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                        : &AMDGPU::SReg_64RegClass);
      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        unsigned TmpReg = createLaneMaskReg(*MF);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
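      // As in lowerPhis, the loop search is bounded by the nearest common
      // post-dominator of the def block and all blocks that use the value.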
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
      return true;

    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Reg.isVirtual())
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
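/// If a terminator reads SCC, the returned iterator points at the last
/// preceding instruction that defines SCC, so that lane-mask operations
/// inserted there (which clobber SCC) execute before that def rather than
/// between it and the terminator.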
MachineBasicBlock::iterator
SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}

void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          const DebugLoc &DL, unsigned DstReg,
                                          unsigned PrevReg, unsigned CurReg) {
  bool PrevVal = false;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal = false;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  unsigned PrevMaskedReg = 0;
  unsigned CurMaskedReg = 0;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}