//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

class SIOptimizeExecMasking : public MachineFunctionPass {
  MachineFunction *MF = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  MCRegister Exec;

  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;

  Register isCopyFromExec(const MachineInstr &MI) const;
  Register isCopyToExec(const MachineInstr &MI) const;
  bool removeTerminatorBit(MachineInstr &MI) const;
  MachineBasicBlock::reverse_iterator
  fixTerminators(MachineBasicBlock &MBB) const;
  MachineBasicBlock::reverse_iterator
  findExecCopy(MachineBasicBlock &MBB,
               MachineBasicBlock::reverse_iterator I) const;

  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
  MachineInstr *findInstrBackwards(MachineInstr &Origin,
                                   std::function<bool(MachineInstr *)> Pred,
                                   ArrayRef<MCRegister> NonModifiableRegs,
                                   unsigned MaxInstructions = 20) const;
  bool optimizeExecSequence();
  void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                    MachineInstr &VCmp, MCRegister Exec) const;

  void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
  bool optimizeOrSaveexecXorSequences();

public:
  static char ID;

  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMasking::ID = 0;

char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;

/// If \p MI is a copy from exec, return the register copied to.
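/// For example, both of the following copy exec and would return %sgpr0_sgpr1:
///   %sgpr0_sgpr1 = COPY %exec
///   %sgpr0_sgpr1 = S_MOV_B64 %exec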
Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == Exec)
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}

/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}

static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
    return false;
  }
}

// Turn all pseudoterminators in the block into their equivalent non-terminator
// instructions. Returns the reverse iterator to the first non-terminator
// instruction in the block.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}
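// Scan backwards from I, looking for a copy from exec; gives up after
// InstLimit instructions and returns MBB.rend() if no such copy is found.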
MachineBasicBlock::reverse_iterator SIOptimizeExecMasking::findExecCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I) const {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

// XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
// report the register as unavailable because a super-register with a lane mask
// is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}

// Backwards-iterate from Origin (for n=MaxInstructions iterations) until
// either the beginning of the BB is reached or Pred evaluates to true - which
// can be an arbitrary condition based on the current MachineInstr, for
// instance a target instruction. Breaks prematurely by returning nullptr if
// one of the registers given in NonModifiableRegs is modified by the current
// instruction.
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
    ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;

    if (Pred(&*A))
      return &*A;

    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;
    }

    ++CurrentIteration;
  }

  return nullptr;
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..Start].
// It does so by calculating liveness backwards from the end of the BB until
// either Stop or the beginning of the BB is reached.
// After liveness is calculated, we can determine if Reg is still in use and
// not defined in between the instructions.
bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
                                                   MachineInstr &Start,
                                                   MCRegister Reg,
                                                   bool UseLiveOuts,
                                                   bool IgnoreStart) const {
  LivePhysRegs LR(*TRI);
  if (UseLiveOuts)
    LR.addLiveOuts(*Stop.getParent());

  MachineBasicBlock::reverse_iterator A(Start);

  if (IgnoreStart)
    ++A;

  for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
    LR.stepBackward(*A);
  }

  return !LR.available(*MRI, Reg);
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..BB.end].
bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
                                                 MCRegister Reg) const {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}

// Optimize sequences emitted for control flow lowering. They are originally
// emitted as separate operations because spill code may need to be
// inserted for the saved copy of exec.
//
// x = copy exec
// z = s_<op>_b64 x, y
// exec = copy z
// =>
// x = s_<op>_saveexec_b64 y
//
bool SIOptimizeExecMasking::optimizeExecSequence() {
  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy. This
    // can happen if control flow pseudos had their outputs used by phis.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(Exec);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
        Changed = true;
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        } else {
          LLVM_DEBUG(dbgs()
                     << "Instruction does not read exec copy: " << *J << '\n');
          break;
        }
      } else if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy, before it
        // will be rewritten by the saveexec, i.e. hasOneUse. There may have
        // been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
        .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
                                    *TRI);
    }

    Changed = true;
  }

  return Changed;
}

// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());

  if (NewOpcode == -1)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();

  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit dst as V_CMPX is implicitly writing to EXEC.
  // Add dummy src and clamp modifiers, if needed.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](unsigned OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*Src0);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*Src1);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);

  // The kill flags may no longer be correct.
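  // The new V_CMPX reads Src0 and Src1 at the position of the erased
  // s_and_saveexec, later than the original V_CMP did, so any kill flags on
  // them could now be stale.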
  if (Src0->isReg())
    MRI->clearKillFlags(Src0->getReg());
  if (Src1->isReg())
    MRI->clearKillFlags(Src1->getReg());

  SaveExecInstr.eraseFromParent();
  VCmp.eraseFromParent();

  return true;
}

// Record (on GFX10.3 and later) occurrences of
// v_cmp_* SGPR, IMM, VGPR
// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
// to be replaced with
// s_mov_b32 EXEC_SGPR_DEST, exec_lo
// v_cmpx_* IMM, VGPR
// to reduce pipeline stalls.
void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
    MachineInstr &MI) {
  if (!ST->hasGFX10_3Insts())
    return;

  const unsigned AndSaveExecOpcode =
      ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

  if (MI.getOpcode() != AndSaveExecOpcode)
    return;

  Register SaveExecDest = MI.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return;

  MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return;

  // Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec
  // sequence by looking at an instance of an s_and_saveexec instruction.
  // Returns a pointer to the v_cmp instruction if it is safe to replace the
  // sequence (see the conditions in the function body). This is after register
  // allocation, so some checks on operand dependencies need to be considered.
  MachineInstr *VCmp = nullptr;

  // Try to find the last v_cmp instruction that defs the saveexec input
  // operand without any write to Exec or the saveexec input operand in
  // between.
  VCmp = findInstrBackwards(
      MI,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()});

  if (!VCmp)
    return;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // Check if any of the v_cmp source operands is written by the saveexec.
  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
      MI.modifiesRegister(Src0->getReg(), TRI))
    return;

  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
      MI.modifiesRegister(Src1->getReg(), TRI))
    return;

  // Don't do the transformation if the destination operand is included in
  // its MBB live-outs, meaning it's used in any of its successors, leading
  // to incorrect code if the v_cmp and therefore the def of
  // the dest operand is removed.
  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
    return;

  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after
  // the s_and_saveexec, skip the optimization.
  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
      isRegisterInUseAfter(MI, VCmpDest->getReg()))
    return;

  // Try to determine if there is a write to any of the VCmp
  // operands between the saveexec and the vcmp.
  // If yes, additional VGPR spilling might need to be inserted. In this case,
  // it's not worth replacing the instruction sequence.
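  // Collect the V_CMP source registers that must not be redefined; if
  // findInstrBackwards does not reach the V_CMP again without seeing a def of
  // one of them, skip the rewrite.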
  SmallVector<MCRegister, 2> NonDefRegs;
  if (Src0->isReg())
    NonDefRegs.push_back(Src0->getReg());

  if (Src1->isReg())
    NonDefRegs.push_back(Src1->getReg());

  if (!findInstrBackwards(
          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs))
    return;

  if (VCmp)
    SaveExecVCmpMapping[&MI] = VCmp;
}

// Record occurrences of
// s_or_saveexec s_o, s_i
// s_xor exec, exec, s_o
// to be replaced with
// s_andn2_saveexec s_o, s_i.
void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
  const unsigned XorOpcode =
      ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;

  if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
    const MachineOperand &XorDst = MI.getOperand(0);
    const MachineOperand &XorSrc0 = MI.getOperand(1);
    const MachineOperand &XorSrc1 = MI.getOperand(2);

    if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
        XorSrc1.isReg() &&
        (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
      const unsigned OrSaveexecOpcode = ST->isWave32()
                                            ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64;

      // Peek at the previous instruction and check if this is a relevant
      // s_or_saveexec instruction.
      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
      if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
        return;

      const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
      const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
      if (OrDst.isReg() && OrSrc0.isReg()) {
        if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
            (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
          OrXors.emplace_back(&PossibleOrSaveexec, &MI);
        }
      }
    }
  }
}
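// Rewrite each s_or_saveexec / s_xor pair recorded by
// tryRecordOrSaveexecXorSequence into a single s_andn2_saveexec and erase the
// original pair.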
bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
  if (OrXors.empty()) {
    return false;
  }

  bool Changed = false;
  const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
                                              : AMDGPU::S_ANDN2_SAVEEXEC_B64;

  for (const auto &Pair : OrXors) {
    MachineInstr *Or = nullptr;
    MachineInstr *Xor = nullptr;
    std::tie(Or, Xor) = Pair;
    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
            TII->get(Andn2Opcode), Or->getOperand(0).getReg())
        .addReg(Or->getOperand(1).getReg());

    Or->eraseFromParent();
    Xor->eraseFromParent();

    Changed = true;
  }

  return Changed;
}

bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  ST = &MF.getSubtarget<GCNSubtarget>();
  TRI = ST->getRegisterInfo();
  TII = ST->getInstrInfo();
  MRI = &MF.getRegInfo();
  Exec = TRI->getExec();

  bool Changed = optimizeExecSequence();

  OrXors.clear();
  SaveExecVCmpMapping.clear();
  static unsigned SearchWindow = 10;
  for (MachineBasicBlock &MBB : MF) {
    unsigned SearchCount = 0;

    for (auto &MI : llvm::reverse(MBB)) {
      if (MI.isDebugInstr())
        continue;

      if (SearchCount >= SearchWindow) {
        break;
      }

      tryRecordOrSaveexecXorSequence(MI);
      tryRecordVCmpxAndSaveexecSequence(MI);

      if (MI.modifiesRegister(Exec, TRI)) {
        break;
      }

      ++SearchCount;
    }
  }

  Changed |= optimizeOrSaveexecXorSequences();
  for (const auto &Entry : SaveExecVCmpMapping) {
    MachineInstr *SaveExecInstr = Entry.getFirst();
    MachineInstr *VCmpInstr = Entry.getSecond();

    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
  }

  return Changed;
}