//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//

#include "SILowerI1Copies.h"
#include "AMDGPU.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);

namespace {

class Vreg1LoweringHelper : public PhiLoweringHelper {
public:
  Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                      MachinePostDominatorTree *PDT);

private:
  DenseSet<Register> ConstrainRegs;

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

  bool lowerCopiesFromI1();
  bool lowerCopiesToI1();
  bool cleanConstrainRegs(bool Changed);
  bool isVreg1(Register Reg) const {
    return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }
};

Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
                                         MachineDominatorTree *DT,
                                         MachinePostDominatorTree *PDT)
    : PhiLoweringHelper(MF, DT, PDT) {}

bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
  assert(Changed || ConstrainRegs.empty());
  for (Register Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return Changed;
}

/// Helper class that examines the relationship between the incoming values of
/// a phi in the control flow graph to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
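/// As an illustration (a sketched CFG, not taken from any particular kernel):
/// suppose the def block D has incoming blocks B and C, and B ends in a
/// divergent branch to both C and D:
///
///  B-+
///  | |
///  C |
///  | |
///  D-+
///
/// B has no predecessors in the reachable set, so its incoming value can be
/// used as-is. C does have one (namely B), so the value incoming from C must
/// be merged with a previously defined lane mask: part of the wave may travel
/// directly from B to D while the rest passes through C first.
///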
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;
  const SIInstrInfo *TII;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  MapVector<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT, const SIInstrInfo *TII)
      : PDT(PDT), TII(TII) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock, ArrayRef<Incoming> Incomings) {
    assert(Stack.empty());
    ReachableMap.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);

    for (auto Incoming : Incomings) {
      MachineBasicBlock *MBB = Incoming.Block;
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      if (TII->hasDivergentBranch(MBB) && PDT.dominates(&DefBlock, MBB))
        append_range(Stack, MBB->successors());
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (ReachableMap.try_emplace(MBB, false).second)
        append_range(Stack, MBB->successors());
    }

    for (auto &[MBB, Reachable] : ReachableMap) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        Reachable = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (!llvm::is_contained(Predecessors, UnreachablePred))
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
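/// Applied to the diagram above with a def in B and a use in C: an edge back
/// to B (A->B) can be reached from B via B->A without passing through C, the
/// point where the wave re-converges, so the def in B requires the bitwise
/// lowering.
///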
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      MachineRegisterInfo &MRI,
                      MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs,
                      ArrayRef<Incoming> Incomings = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (auto &Incoming : Incomings)
      Dom = DT.findNearestCommonDominator(Dom, Incoming.Block);

    if (!inLoopLevel(*Dom, LoopLevel, Incomings)) {
      SSAUpdater.AddAvailableValue(
          Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Incomings))
          SSAUpdater.AddAvailableValue(
              Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<Incoming> Incomings) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    for (auto &Incoming : Incomings)
      if (Incoming.Block == &MBB)
        return true;

    return false;
  }

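  // Advance the traversal by one level: move VisitedPostDom up by one
  // post-dominator (starting at the def block itself for level 0), visit the
  // newly reachable blocks, record any edge found back to the def block in
  // FoundLoopLevel, and append the nearest common dominator of all visited
  // blocks to CommonDominators.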
  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

Register
llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
                        MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  return MRI->createVirtualRegister(LaneMaskRegAttrs);
}

static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  MachineFunction &MF = *MBB->getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  Register UndefReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

bool Vreg1LoweringHelper::lowerCopiesFromI1() {
  bool Changed = false;
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      Changed = true;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
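      // A sketch of the emitted select (register names are illustrative; the
      // immediates are src0/src1 with their modifiers, the mask is src2):
      //   %dst:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %src
      // Each lane of %dst becomes -1 if its bit in %src is set, 0 otherwise.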
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF,
                                     MachineDominatorTree *DT,
                                     MachinePostDominatorTree *PDT)
    : MF(MF), DT(DT), PDT(PDT) {
  MRI = &MF->getRegInfo();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }
}

bool PhiLoweringHelper::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT, TII);
  SmallVector<MachineInstr *, 4> Vreg1Phis;
  SmallVector<Incoming, 4> Incomings;

  getCandidatesForLowering(Vreg1Phis);
  if (Vreg1Phis.empty())
    return false;

  DT->updateDFSNumbers();
  MachineBasicBlock *PrevMBB = nullptr;
  for (MachineInstr *MI : Vreg1Phis) {
    MachineBasicBlock &MBB = *MI->getParent();
    if (&MBB != PrevMBB) {
      LF.initialize(MBB);
      PrevMBB = &MBB;
    }

    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

    Register DstReg = MI->getOperand(0).getReg();
    markAsLaneMask(DstReg);
    initializeLaneMaskRegisterAttributes(DstReg);

    collectIncomingValuesFromPhi(MI, Incomings);

    // Sort the incomings such that incoming values that dominate other
    // incoming values are sorted earlier. This allows us to do some amount
    // of on-the-fly constant folding.
    // Incomings with smaller DFSNumIn go first; DFSNumIn is 0 for the entry
    // block.
    llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) {
      return DT->getNode(LHS.Block)->getDFSNumIn() <
             DT->getNode(RHS.Block)->getDFSNumIn();
    });

#ifndef NDEBUG
    PhiRegisters.insert(DstReg);
#endif

    // Phis in a loop that are observed outside the loop receive a simple but
    // conservatively correct treatment.
    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
    for (MachineInstr &Use : MRI->use_instructions(DstReg))
      DomBlocks.push_back(Use.getParent());

    MachineBasicBlock *PostDomBound =
        PDT->findNearestCommonDominator(DomBlocks);

    // FIXME: This fails to find irreducible cycles. If we have a def (other
    // than a constant) in a pair of blocks that end up looping back to each
    // other, it will be mishandled. Due to structurization this shouldn't
    // occur in practice.
    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

    SSAUpdater.Initialize(DstReg);

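    // A loop was found, i.e. the phi's value can be observed from outside the
    // loop: seed the SSAUpdater with undef at the loop entries and merge each
    // incoming value into the running lane mask at the end of its block.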
    if (FoundLoopLevel) {
      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs,
                        Incomings);

      for (auto &Incoming : Incomings) {
        Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
        SSAUpdater.AddAvailableValue(Incoming.Block, Incoming.UpdatedReg);
      }

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
      }
    } else {
      // The phi is not observed from outside a loop. Use a more accurate
      // lowering.
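      // Incoming values from source blocks of the reachable subgraph can be
      // used as-is; all other incoming values are merged at the end of their
      // blocks, and unreachable predecessors are seeded with undef lane
      // masks.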
      PIA.analyze(MBB, Incomings);

      for (MachineBasicBlock *MBB : PIA.predecessors())
        SSAUpdater.AddAvailableValue(
            MBB, insertUndefLaneMask(MBB, MRI, LaneMaskRegAttrs));

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        if (PIA.isSource(IMBB)) {
          constrainAsLaneMask(Incoming);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
        } else {
          Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.UpdatedReg);
        }
      }

      for (auto &Incoming : Incomings) {
        if (!Incoming.UpdatedReg.isValid())
          continue;

        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
      }
    }

    Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
    if (NewReg != DstReg) {
      replaceDstReg(NewReg, DstReg, &MBB);
      MI->eraseFromParent();
    }

    Incomings.clear();
  }
  return true;
}

bool Vreg1LoweringHelper::lowerCopiesToI1() {
  bool Changed = false;
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      Changed = true;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      markAsLaneMask(DstReg);
      initializeLaneMaskRegisterAttributes(DstReg);

      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
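        // The source is a plain 32-bit value; materialize a lane mask from it
        // with a compare. A sketch of the emitted instruction (register names
        // are illustrative):
        //   %tmp = V_CMP_NE_U32_e64 %src, 0
        // Each active lane sets its bit in %tmp if its value is non-zero.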
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      } else {
        // SrcReg needs to be live beyond the copy.
        MI.getOperand(1).setIsKill(false);
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

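/// Determine whether \p Reg holds a constant lane mask by looking through
/// COPYs to the defining instruction. IMPLICIT_DEF counts as constant (the
/// value is arbitrary, so \p Val is left untouched); otherwise the def must
/// be an S_MOV of immediate 0 (all lanes false) or -1 (all lanes true).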
bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
      return true;

    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Reg.isVirtual())
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
PhiLoweringHelper::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}

// VReg_1 -> SReg_32 or SReg_64
void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const {
  MRI->setRegClass(DstReg, ST->getBoolRC());
}

void Vreg1LoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      if (isVreg1(MI.getOperand(0).getReg()))
        Vreg1Phis.push_back(&MI);
    }
  }
}

void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    assert(i + 1 < MI->getNumOperands());
    Register IncomingReg = MI->getOperand(i).getReg();
    MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
    MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

    if (IncomingDef->getOpcode() == AMDGPU::COPY) {
      IncomingReg = IncomingDef->getOperand(1).getReg();
      assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
      assert(!IncomingDef->getOperand(1).getSubReg());
    } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
      continue;
    } else {
      assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
    }

    Incomings.emplace_back(IncomingReg, IncomingMBB, Register());
  }
}

void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                        MachineBasicBlock *MBB) {
  MRI->replaceRegWith(NewReg, OldReg);
}

void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                              MachineBasicBlock::iterator I,
                                              const DebugLoc &DL,
                                              Register DstReg, Register PrevReg,
                                              Register CurReg) {
  bool PrevVal = false;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal = false;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  Register PrevMaskedReg;
  Register CurMaskedReg;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}

void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
static bool runFixI1Copies(MachineFunction &MF, MachineDominatorTree &MDT,
                           MachinePostDominatorTree &MPDT) {
  // Only need to run this in the SelectionDAG path.
  if (MF.getProperties().hasSelected())
    return false;

  Vreg1LoweringHelper Helper(&MF, &MDT, &MPDT);
  bool Changed = false;
  Changed |= Helper.lowerCopiesFromI1();
  Changed |= Helper.lowerPhis();
  Changed |= Helper.lowerCopiesToI1();
  return Helper.cleanConstrainRegs(Changed);
}

PreservedAnalyses
SILowerI1CopiesPass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
  MachinePostDominatorTree &MPDT =
      MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
  bool Changed = runFixI1Copies(MF, MDT, MPDT);
  if (!Changed)
    return PreservedAnalyses::all();

  // TODO: Probably preserves most.
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

class SILowerI1CopiesLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILowerI1CopiesLegacy() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

bool SILowerI1CopiesLegacy::runOnMachineFunction(MachineFunction &MF) {
  MachineDominatorTree &MDT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &MPDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  return runFixI1Copies(MF, MDT, MPDT);
}

INITIALIZE_PASS_BEGIN(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                    false, false)

char SILowerI1CopiesLegacy::ID = 0;

char &llvm::SILowerI1CopiesLegacyID = SILowerI1CopiesLegacy::ID;

FunctionPass *llvm::createSILowerI1CopiesLegacyPass() {
  return new SILowerI1CopiesLegacy();
}