//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static unsigned createLaneMaskReg(MachineFunction &MF);
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);

namespace {

class SILowerI1Copies : public MachineFunctionPass {
public:
  static char ID;

private:
  bool IsWave32 = false;
  MachineFunction *MF = nullptr;
  MachineDominatorTree *DT = nullptr;
  MachinePostDominatorTree *PDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;

  unsigned ExecReg;
  unsigned MovOp;
  unsigned AndOp;
  unsigned OrOp;
  unsigned XorOp;
  unsigned AndN2Op;
  unsigned OrN2Op;

  DenseSet<unsigned> ConstrainRegs;

public:
  SILowerI1Copies() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  void lowerCopiesFromI1();
  void lowerPhis();
  void lowerCopiesToI1();
  bool isConstantLaneMask(Register Reg, bool &Val) const;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
  MachineBasicBlock::iterator
  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;

  bool isVreg1(Register Reg) const {
    return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }

  bool isLaneMaskReg(unsigned Reg) const {
    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
               ST->getWavefrontSize();
  }
};

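// Note: a lane mask holds one bit per lane of the wave, so an i1 value that is
// true for exactly the currently active lanes is a plain copy of EXEC, and an
// all-false value is a zero mask. Values that are live across divergent
// control flow are combined per lane by buildMergeLaneMasks below.
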
/// Helper class that analyzes the relationship between incoming values of a
/// phi in the control flow graph to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock,
               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (MachineBasicBlock *MBB : IncomingBlocks) {
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
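      // Only the structured control-flow pseudos checked below are treated as
      // divergent branches here; any other terminator is assumed to be
      // uniform.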
      bool Divergent = false;
      for (MachineInstr &MI : MBB->terminators()) {
        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
            MI.getOpcode() == AMDGPU::SI_IF ||
            MI.getOpcode() == AMDGPU::SI_ELSE ||
            MI.getOpcode() == AMDGPU::SI_LOOP) {
          Divergent = true;
          break;
        }
      }

      if (Divergent && PDT.dominates(&DefBlock, MBB))
        append_range(Stack, MBB->successors());
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      append_range(Stack, MBB->successors());
    }

    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        ReachableMap[MBB] = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (!llvm::is_contained(Predecessors, UnreachablePred))
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
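/// Illustrative note on the example above: taking C as the nearest common
/// post-dominator of B and its uses (the point where the wave reconverges),
/// the backward edge A -> B can be reached from B via B -> A without passing
/// through C, so a def in B that is used in C gets the bitwise lowering.
///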
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }
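  // Note: a return value of 0 means that every cycle through the def block
  // also passes through PostDom; the callers below then skip the bitwise
  // merging of lane masks across loop iterations.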

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (MachineBasicBlock *MBB : Blocks)
      Dom = DT.findNearestCommonDominator(Dom, MBB);

    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<MachineBasicBlock *> Blocks) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    if (llvm::is_contained(Blocks, &MBB))
      return true;

    return false;
  }

  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                    false)

char SILowerI1Copies::ID = 0;

char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

FunctionPass *llvm::createSILowerI1CopiesPass() {
  return new SILowerI1Copies();
}

static unsigned createLaneMaskReg(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
                                                 : &AMDGPU::SReg_64RegClass);
}

static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned UndefReg = createLaneMaskReg(MF);
  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

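// Note: an undef lane mask created this way only serves as the invented
// "available value" for the MachineSSAUpdater described in the
// PhiIncomingAnalysis comment above; seeding 0 instead would be the
// alternative choice, with different consequences for how values are merged.
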
/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
  // Only need to run this in the SelectionDAG path.
  if (TheMF.getProperties().hasProperty(
          MachineFunctionProperties::Property::Selected))
    return false;

  MF = &TheMF;
  MRI = &MF->getRegInfo();
  DT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }

  lowerCopiesFromI1();
  lowerPhis();
  lowerCopiesToI1();

  for (unsigned Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return true;
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

void SILowerI1Copies::lowerCopiesFromI1() {
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      // Copy into a 32-bit vector register.
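      // V_CNDMASK_B32 writes 0 or -1 into each lane of the destination,
      // depending on whether the corresponding bit of the SrcReg lane mask is
      // clear or set.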
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

void SILowerI1Copies::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT);
  SmallVector<MachineInstr *, 4> Vreg1Phis;
  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
  SmallVector<unsigned, 4> IncomingRegs;
  SmallVector<unsigned, 4> IncomingUpdated;
#ifndef NDEBUG
  DenseSet<unsigned> PhiRegisters;
#endif

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      if (isVreg1(MI.getOperand(0).getReg()))
        Vreg1Phis.push_back(&MI);
    }
  }

  MachineBasicBlock *PrevMBB = nullptr;
  for (MachineInstr *MI : Vreg1Phis) {
    MachineBasicBlock &MBB = *MI->getParent();
    if (&MBB != PrevMBB) {
      LF.initialize(MBB);
      PrevMBB = &MBB;
    }

    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

    Register DstReg = MI->getOperand(0).getReg();
    MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                      : &AMDGPU::SReg_64RegClass);

    // Collect incoming values.
    for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
      assert(i + 1 < MI->getNumOperands());
      Register IncomingReg = MI->getOperand(i).getReg();
      MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
      MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

      if (IncomingDef->getOpcode() == AMDGPU::COPY) {
        IncomingReg = IncomingDef->getOperand(1).getReg();
        assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
        assert(!IncomingDef->getOperand(1).getSubReg());
      } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
        continue;
      } else {
        assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
      }

      IncomingBlocks.push_back(IncomingMBB);
      IncomingRegs.push_back(IncomingReg);
    }

#ifndef NDEBUG
    PhiRegisters.insert(DstReg);
#endif

    // Phis in a loop that are observed outside the loop receive a simple but
    // conservatively correct treatment.
    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
    for (MachineInstr &Use : MRI->use_instructions(DstReg))
      DomBlocks.push_back(Use.getParent());

    MachineBasicBlock *PostDomBound =
        PDT->findNearestCommonDominator(DomBlocks);
    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

    SSAUpdater.Initialize(DstReg);

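    // A non-zero loop level means that a backward edge into this block is
    // reachable without passing through PostDomBound, so incoming values must
    // be merged with the running lane mask across loop iterations. Otherwise,
    // PhiIncomingAnalysis provides a more precise per-incoming-block lowering.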
    if (FoundLoopLevel) {
      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        IncomingUpdated.push_back(createLaneMaskReg(*MF));
        SSAUpdater.AddAvailableValue(IncomingBlocks[i],
                                     IncomingUpdated.back());
      }

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
      }
    } else {
      // The phi is not observed from outside a loop. Use a more accurate
      // lowering.
      PIA.analyze(MBB, IncomingBlocks);

      for (MachineBasicBlock *MBB : PIA.predecessors())
        SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        if (PIA.isSource(IMBB)) {
          IncomingUpdated.push_back(0);
          SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
        } else {
          IncomingUpdated.push_back(createLaneMaskReg(*MF));
          SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
        }
      }

      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
        if (!IncomingUpdated[i])
          continue;

        MachineBasicBlock &IMBB = *IncomingBlocks[i];
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
      }
    }

    Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
    if (NewReg != DstReg) {
      MRI->replaceRegWith(NewReg, DstReg);
      MI->eraseFromParent();
    }

    IncomingBlocks.clear();
    IncomingRegs.clear();
    IncomingUpdated.clear();
  }
}

void SILowerI1Copies::lowerCopiesToI1() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                        : &AMDGPU::SReg_64RegClass);
      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

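      // The source is not already a lane mask (it is either a physical
      // register or a 32-bit virtual register): build a lane mask that has a
      // bit set for every lane whose source value is non-zero.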
      if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        unsigned TmpReg = createLaneMaskReg(*MF);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Reg.isVirtual())
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}

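// Merge the lane mask CurReg, computed in the current block or iteration, into
// the previously defined lane mask PrevReg. In the general case this computes
//   DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC)
// using S_ANDN2 / S_AND / S_OR; when either input is a constant all-zeros or
// all-ones mask, the result folds to a COPY, an XOR with the exec mask, or an
// ORN2, as handled below.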
void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          const DebugLoc &DL, unsigned DstReg,
                                          unsigned PrevReg, unsigned CurReg) {
  bool PrevVal;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  unsigned PrevMaskedReg = 0;
  unsigned CurMaskedReg = 0;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}