//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static unsigned createLaneMaskReg(MachineFunction &MF);
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);

namespace {

class SILowerI1Copies : public MachineFunctionPass {
public:
  static char ID;

private:
  bool IsWave32 = false;
  MachineFunction *MF = nullptr;
  MachineDominatorTree *DT = nullptr;
  MachinePostDominatorTree *PDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;

  unsigned ExecReg;
  unsigned MovOp;
  unsigned AndOp;
  unsigned OrOp;
  unsigned XorOp;
  unsigned AndN2Op;
  unsigned OrN2Op;

  DenseSet<unsigned> ConstrainRegs;

public:
  SILowerI1Copies() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  bool lowerCopiesFromI1();
  bool lowerPhis();
  bool lowerCopiesToI1();
  bool isConstantLaneMask(Register Reg, bool &Val) const;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
  MachineBasicBlock::iterator
  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;

  bool isVreg1(Register Reg) const {
    return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }

  bool isLaneMaskReg(unsigned Reg) const {
    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
               ST->getWavefrontSize();
  }
};

/// Helper class that determines the relationship between incoming values of a
/// phi in the control flow graph and decides where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;
  const SIInstrInfo *TII;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT, const SIInstrInfo *TII)
      : PDT(PDT), TII(TII) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock,
               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (MachineBasicBlock *MBB : IncomingBlocks) {
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      if (TII->hasDivergentBranch(MBB) && PDT.dominates(&DefBlock, MBB))
        append_range(Stack, MBB->successors());
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      append_range(Stack, MBB->successors());
    }

    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        ReachableMap[MBB] = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (!llvm::is_contained(Predecessors, UnreachablePred))
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (MachineBasicBlock *MBB : Blocks)
      Dom = DT.findNearestCommonDominator(Dom, MBB);

    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<MachineBasicBlock *> Blocks) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    if (llvm::is_contained(Blocks, &MBB))
      return true;

    return false;
  }

  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                    false)

char SILowerI1Copies::ID = 0;

char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

FunctionPass *llvm::createSILowerI1CopiesPass() {
  return new SILowerI1Copies();
}

static unsigned createLaneMaskReg(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
                                                 : &AMDGPU::SReg_64RegClass);
}

static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned UndefReg = createLaneMaskReg(MF);
  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers.
/// Phi nodes are lowered before all others, because phi lowering looks
/// through copies and can therefore often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
  // Only need to run this in the SelectionDAG path.
  if (TheMF.getProperties().hasProperty(
          MachineFunctionProperties::Property::Selected))
    return false;

  MF = &TheMF;
  MRI = &MF->getRegInfo();
  DT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }

  bool Changed = false;
  Changed |= lowerCopiesFromI1();
  Changed |= lowerPhis();
  Changed |= lowerCopiesToI1();

  assert(Changed || ConstrainRegs.empty());
  for (unsigned Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return Changed;
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

bool SILowerI1Copies::lowerCopiesFromI1() {
  bool Changed = false;
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      Changed = true;

      // Copy into a 32-bit vector register.
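      // V_CNDMASK_B32 with sources 0 and -1 writes -1 to every lane whose bit
      // is set in the lane mask and 0 to every other lane, materializing the
      // i1 value per lane in the VGPR.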
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

bool SILowerI1Copies::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT, TII);
  SmallVector<MachineInstr *, 4> Vreg1Phis;
  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
  SmallVector<unsigned, 4> IncomingRegs;
  SmallVector<unsigned, 4> IncomingUpdated;
#ifndef NDEBUG
  DenseSet<unsigned> PhiRegisters;
#endif

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      if (isVreg1(MI.getOperand(0).getReg()))
        Vreg1Phis.push_back(&MI);
    }
  }
  if (Vreg1Phis.empty())
    return false;

  MachineBasicBlock *PrevMBB = nullptr;
  for (MachineInstr *MI : Vreg1Phis) {
    MachineBasicBlock &MBB = *MI->getParent();
    if (&MBB != PrevMBB) {
      LF.initialize(MBB);
      PrevMBB = &MBB;
    }

    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

    Register DstReg = MI->getOperand(0).getReg();
    MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                      : &AMDGPU::SReg_64RegClass);

    // Collect incoming values.
    for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
      assert(i + 1 < MI->getNumOperands());
      Register IncomingReg = MI->getOperand(i).getReg();
      MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
      MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

      if (IncomingDef->getOpcode() == AMDGPU::COPY) {
        IncomingReg = IncomingDef->getOperand(1).getReg();
        assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
        assert(!IncomingDef->getOperand(1).getSubReg());
      } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
        continue;
      } else {
        assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
      }

      IncomingBlocks.push_back(IncomingMBB);
      IncomingRegs.push_back(IncomingReg);
    }

#ifndef NDEBUG
    PhiRegisters.insert(DstReg);
#endif

    // Phis in a loop that are observed outside the loop receive a simple but
    // conservatively correct treatment.
    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
    for (MachineInstr &Use : MRI->use_instructions(DstReg))
      DomBlocks.push_back(Use.getParent());

    MachineBasicBlock *PostDomBound =
        PDT->findNearestCommonDominator(DomBlocks);

    // FIXME: This fails to find irreducible cycles. If we have a def (other
    // than a constant) in a pair of blocks that end up looping back to each
    // other, it will be mishandled. Due to structurization this shouldn't
    // occur in practice.
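
    // A non-zero level means a backward edge to this block is reachable
    // without passing through PostDomBound, so the phi is observed from
    // outside its loop and incoming values must be merged across iterations.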
    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

    SSAUpdater.Initialize(DstReg);

    if (FoundLoopLevel) {
      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        IncomingUpdated.push_back(createLaneMaskReg(*MF));
        SSAUpdater.AddAvailableValue(IncomingBlocks[i],
                                     IncomingUpdated.back());
      }

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
      }
    } else {
      // The phi is not observed from outside a loop. Use a more accurate
      // lowering.
      PIA.analyze(MBB, IncomingBlocks);

      for (MachineBasicBlock *MBB : PIA.predecessors())
        SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        if (PIA.isSource(IMBB)) {
          IncomingUpdated.push_back(0);
          SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
        } else {
          IncomingUpdated.push_back(createLaneMaskReg(*MF));
          SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
        }
      }

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        if (!IncomingUpdated[i])
          continue;

        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
      }
    }

    Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
    if (NewReg != DstReg) {
      MRI->replaceRegWith(NewReg, DstReg);
      MI->eraseFromParent();
    }

    IncomingBlocks.clear();
    IncomingRegs.clear();
    IncomingUpdated.clear();
  }
  return true;
}

bool SILowerI1Copies::lowerCopiesToI1() {
  bool Changed = false;
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      Changed = true;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                        : &AMDGPU::SReg_64RegClass);
      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        unsigned TmpReg = createLaneMaskReg(*MF);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
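      // As in phi lowering, bound the backward-edge search by the nearest
      // common post-dominator of this block and all blocks that use the def.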
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
      return true;

    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Reg.isVirtual())
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
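///
/// If a terminator reads SCC, the returned iterator points at the last
/// instruction that defines SCC before the terminators; SALU instructions
/// inserted there (which clobber SCC) execute before that def, so the SCC
/// value consumed by the terminator is preserved.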
MachineBasicBlock::iterator
SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY.
  llvm_unreachable("SCC used by terminator but no def in block");
}

void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          const DebugLoc &DL, unsigned DstReg,
                                          unsigned PrevReg, unsigned CurReg) {
  // Merge the previous and current lane masks under the current EXEC:
  //   DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC)
  // folding the operation when either input is a known constant (0 or -1).
  bool PrevVal = false;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal = false;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  unsigned PrevMaskedReg = 0;
  unsigned CurMaskedReg = 0;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}