1 //===-- SIOptimizeExecMasking.cpp -----------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "AMDGPU.h" 10 #include "GCNSubtarget.h" 11 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 12 #include "SIRegisterInfo.h" 13 #include "llvm/CodeGen/LivePhysRegs.h" 14 #include "llvm/CodeGen/MachineFunctionPass.h" 15 #include "llvm/CodeGen/MachineOperand.h" 16 #include "llvm/CodeGen/TargetRegisterInfo.h" 17 #include "llvm/InitializePasses.h" 18 19 using namespace llvm; 20 21 #define DEBUG_TYPE "si-optimize-exec-masking" 22 23 namespace { 24 25 class SIOptimizeExecMasking : public MachineFunctionPass { 26 MachineFunction *MF = nullptr; 27 const GCNSubtarget *ST = nullptr; 28 const SIRegisterInfo *TRI = nullptr; 29 const SIInstrInfo *TII = nullptr; 30 const MachineRegisterInfo *MRI = nullptr; 31 MCRegister Exec; 32 33 DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping; 34 SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors; 35 36 Register isCopyFromExec(const MachineInstr &MI) const; 37 Register isCopyToExec(const MachineInstr &MI) const; 38 bool removeTerminatorBit(MachineInstr &MI) const; 39 MachineBasicBlock::reverse_iterator 40 fixTerminators(MachineBasicBlock &MBB) const; 41 MachineBasicBlock::reverse_iterator 42 findExecCopy(MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I, 43 unsigned CopyToExec) const; 44 45 bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start, 46 MCRegister Reg, bool UseLiveOuts = false, 47 bool IgnoreStart = false) const; 48 bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const; 49 MachineInstr *findInstrBackwards(MachineInstr &Origin, 50 std::function<bool(MachineInstr *)> Pred, 51 ArrayRef<MCRegister> NonModifiableRegs, 52 unsigned MaxInstructions = 20) const; 53 bool optimizeExecSequence(); 54 void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI); 55 bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr, 56 MachineInstr &VCmp, MCRegister Exec) const; 57 58 void tryRecordOrSaveexecXorSequence(MachineInstr &MI); 59 bool optimizeOrSaveexecXorSequences(); 60 61 public: 62 static char ID; 63 64 SIOptimizeExecMasking() : MachineFunctionPass(ID) { 65 initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry()); 66 } 67 68 bool runOnMachineFunction(MachineFunction &MF) override; 69 70 StringRef getPassName() const override { 71 return "SI optimize exec mask operations"; 72 } 73 74 void getAnalysisUsage(AnalysisUsage &AU) const override { 75 AU.setPreservesCFG(); 76 MachineFunctionPass::getAnalysisUsage(AU); 77 } 78 }; 79 80 } // End anonymous namespace. 81 82 INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE, 83 "SI optimize exec mask operations", false, false) 84 INITIALIZE_PASS_DEPENDENCY(LiveIntervals) 85 INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE, 86 "SI optimize exec mask operations", false, false) 87 88 char SIOptimizeExecMasking::ID = 0; 89 90 char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID; 91 92 /// If \p MI is a copy from exec, return the register copied to. 93 Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const { 94 switch (MI.getOpcode()) { 95 case AMDGPU::COPY: 96 case AMDGPU::S_MOV_B64: 97 case AMDGPU::S_MOV_B64_term: 98 case AMDGPU::S_MOV_B32: 99 case AMDGPU::S_MOV_B32_term: { 100 const MachineOperand &Src = MI.getOperand(1); 101 if (Src.isReg() && Src.getReg() == Exec) 102 return MI.getOperand(0).getReg(); 103 } 104 } 105 106 return AMDGPU::NoRegister; 107 } 108 109 /// If \p MI is a copy to exec, return the register copied from. 110 Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const { 111 switch (MI.getOpcode()) { 112 case AMDGPU::COPY: 113 case AMDGPU::S_MOV_B64: 114 case AMDGPU::S_MOV_B32: { 115 const MachineOperand &Dst = MI.getOperand(0); 116 if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg()) 117 return MI.getOperand(1).getReg(); 118 break; 119 } 120 case AMDGPU::S_MOV_B64_term: 121 case AMDGPU::S_MOV_B32_term: 122 llvm_unreachable("should have been replaced"); 123 } 124 125 return Register(); 126 } 127 128 /// If \p MI is a logical operation on an exec value, 129 /// return the register copied to. 130 static Register isLogicalOpOnExec(const MachineInstr &MI) { 131 switch (MI.getOpcode()) { 132 case AMDGPU::S_AND_B64: 133 case AMDGPU::S_OR_B64: 134 case AMDGPU::S_XOR_B64: 135 case AMDGPU::S_ANDN2_B64: 136 case AMDGPU::S_ORN2_B64: 137 case AMDGPU::S_NAND_B64: 138 case AMDGPU::S_NOR_B64: 139 case AMDGPU::S_XNOR_B64: { 140 const MachineOperand &Src1 = MI.getOperand(1); 141 if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC) 142 return MI.getOperand(0).getReg(); 143 const MachineOperand &Src2 = MI.getOperand(2); 144 if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC) 145 return MI.getOperand(0).getReg(); 146 break; 147 } 148 case AMDGPU::S_AND_B32: 149 case AMDGPU::S_OR_B32: 150 case AMDGPU::S_XOR_B32: 151 case AMDGPU::S_ANDN2_B32: 152 case AMDGPU::S_ORN2_B32: 153 case AMDGPU::S_NAND_B32: 154 case AMDGPU::S_NOR_B32: 155 case AMDGPU::S_XNOR_B32: { 156 const MachineOperand &Src1 = MI.getOperand(1); 157 if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO) 158 return MI.getOperand(0).getReg(); 159 const MachineOperand &Src2 = MI.getOperand(2); 160 if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO) 161 return MI.getOperand(0).getReg(); 162 break; 163 } 164 } 165 166 return AMDGPU::NoRegister; 167 } 168 169 static unsigned getSaveExecOp(unsigned Opc) { 170 switch (Opc) { 171 case AMDGPU::S_AND_B64: 172 return AMDGPU::S_AND_SAVEEXEC_B64; 173 case AMDGPU::S_OR_B64: 174 return AMDGPU::S_OR_SAVEEXEC_B64; 175 case AMDGPU::S_XOR_B64: 176 return AMDGPU::S_XOR_SAVEEXEC_B64; 177 case AMDGPU::S_ANDN2_B64: 178 return AMDGPU::S_ANDN2_SAVEEXEC_B64; 179 case AMDGPU::S_ORN2_B64: 180 return AMDGPU::S_ORN2_SAVEEXEC_B64; 181 case AMDGPU::S_NAND_B64: 182 return AMDGPU::S_NAND_SAVEEXEC_B64; 183 case AMDGPU::S_NOR_B64: 184 return AMDGPU::S_NOR_SAVEEXEC_B64; 185 case AMDGPU::S_XNOR_B64: 186 return AMDGPU::S_XNOR_SAVEEXEC_B64; 187 case AMDGPU::S_AND_B32: 188 return AMDGPU::S_AND_SAVEEXEC_B32; 189 case AMDGPU::S_OR_B32: 190 return AMDGPU::S_OR_SAVEEXEC_B32; 191 case AMDGPU::S_XOR_B32: 192 return AMDGPU::S_XOR_SAVEEXEC_B32; 193 case AMDGPU::S_ANDN2_B32: 194 return AMDGPU::S_ANDN2_SAVEEXEC_B32; 195 case AMDGPU::S_ORN2_B32: 196 return AMDGPU::S_ORN2_SAVEEXEC_B32; 197 case AMDGPU::S_NAND_B32: 198 return AMDGPU::S_NAND_SAVEEXEC_B32; 199 case AMDGPU::S_NOR_B32: 200 return AMDGPU::S_NOR_SAVEEXEC_B32; 201 case AMDGPU::S_XNOR_B32: 202 return AMDGPU::S_XNOR_SAVEEXEC_B32; 203 default: 204 return AMDGPU::INSTRUCTION_LIST_END; 205 } 206 } 207 208 // These are only terminators to get correct spill code placement during 209 // register allocation, so turn them back into normal instructions. 210 bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const { 211 switch (MI.getOpcode()) { 212 case AMDGPU::S_MOV_B32_term: { 213 bool RegSrc = MI.getOperand(1).isReg(); 214 MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); 215 return true; 216 } 217 case AMDGPU::S_MOV_B64_term: { 218 bool RegSrc = MI.getOperand(1).isReg(); 219 MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64)); 220 return true; 221 } 222 case AMDGPU::S_XOR_B64_term: { 223 // This is only a terminator to get the correct spill code placement during 224 // register allocation. 225 MI.setDesc(TII->get(AMDGPU::S_XOR_B64)); 226 return true; 227 } 228 case AMDGPU::S_XOR_B32_term: { 229 // This is only a terminator to get the correct spill code placement during 230 // register allocation. 231 MI.setDesc(TII->get(AMDGPU::S_XOR_B32)); 232 return true; 233 } 234 case AMDGPU::S_OR_B64_term: { 235 // This is only a terminator to get the correct spill code placement during 236 // register allocation. 237 MI.setDesc(TII->get(AMDGPU::S_OR_B64)); 238 return true; 239 } 240 case AMDGPU::S_OR_B32_term: { 241 // This is only a terminator to get the correct spill code placement during 242 // register allocation. 243 MI.setDesc(TII->get(AMDGPU::S_OR_B32)); 244 return true; 245 } 246 case AMDGPU::S_ANDN2_B64_term: { 247 // This is only a terminator to get the correct spill code placement during 248 // register allocation. 249 MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64)); 250 return true; 251 } 252 case AMDGPU::S_ANDN2_B32_term: { 253 // This is only a terminator to get the correct spill code placement during 254 // register allocation. 255 MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32)); 256 return true; 257 } 258 case AMDGPU::S_AND_B64_term: { 259 // This is only a terminator to get the correct spill code placement during 260 // register allocation. 261 MI.setDesc(TII->get(AMDGPU::S_AND_B64)); 262 return true; 263 } 264 case AMDGPU::S_AND_B32_term: { 265 // This is only a terminator to get the correct spill code placement during 266 // register allocation. 267 MI.setDesc(TII->get(AMDGPU::S_AND_B32)); 268 return true; 269 } 270 default: 271 return false; 272 } 273 } 274 275 // Turn all pseudoterminators in the block into their equivalent non-terminator 276 // instructions. Returns the reverse iterator to the first non-terminator 277 // instruction in the block. 278 MachineBasicBlock::reverse_iterator 279 SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const { 280 MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); 281 282 bool Seen = false; 283 MachineBasicBlock::reverse_iterator FirstNonTerm = I; 284 for (; I != E; ++I) { 285 if (!I->isTerminator()) 286 return Seen ? FirstNonTerm : I; 287 288 if (removeTerminatorBit(*I)) { 289 if (!Seen) { 290 FirstNonTerm = I; 291 Seen = true; 292 } 293 } 294 } 295 296 return FirstNonTerm; 297 } 298 299 MachineBasicBlock::reverse_iterator 300 SIOptimizeExecMasking::findExecCopy(MachineBasicBlock &MBB, 301 MachineBasicBlock::reverse_iterator I, 302 unsigned CopyToExec) const { 303 const unsigned InstLimit = 25; 304 305 auto E = MBB.rend(); 306 for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) { 307 Register CopyFromExec = isCopyFromExec(*I); 308 if (CopyFromExec.isValid()) 309 return I; 310 } 311 312 return E; 313 } 314 315 // XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly 316 // report the register as unavailable because a super-register with a lane mask 317 // is unavailable. 318 static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) { 319 for (MachineBasicBlock *Succ : MBB.successors()) { 320 if (Succ->isLiveIn(Reg)) 321 return true; 322 } 323 324 return false; 325 } 326 327 // Backwards-iterate from Origin (for n=MaxInstructions iterations) until either 328 // the beginning of the BB is reached or Pred evaluates to true - which can be 329 // an arbitrary condition based on the current MachineInstr, for instance an 330 // target instruction. Breaks prematurely by returning nullptr if one of the 331 // registers given in NonModifiableRegs is modified by the current instruction. 332 MachineInstr *SIOptimizeExecMasking::findInstrBackwards( 333 MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred, 334 ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const { 335 MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(), 336 E = Origin.getParent()->rend(); 337 unsigned CurrentIteration = 0; 338 339 for (++A; CurrentIteration < MaxInstructions && A != E; ++A) { 340 if (A->isDebugInstr()) 341 continue; 342 343 if (Pred(&*A)) 344 return &*A; 345 346 for (MCRegister Reg : NonModifiableRegs) { 347 if (A->modifiesRegister(Reg, TRI)) 348 return nullptr; 349 } 350 351 ++CurrentIteration; 352 } 353 354 return nullptr; 355 } 356 357 // Determine if a register Reg is not re-defined and still in use 358 // in the range (Stop..Start]. 359 // It does so by backwards calculating liveness from the end of the BB until 360 // either Stop or the beginning of the BB is reached. 361 // After liveness is calculated, we can determine if Reg is still in use and not 362 // defined inbetween the instructions. 363 bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop, 364 MachineInstr &Start, 365 MCRegister Reg, 366 bool UseLiveOuts, 367 bool IgnoreStart) const { 368 LivePhysRegs LR(*TRI); 369 if (UseLiveOuts) 370 LR.addLiveOuts(*Stop.getParent()); 371 372 MachineBasicBlock::reverse_iterator A(Start); 373 MachineBasicBlock::reverse_iterator E(Stop); 374 375 if (IgnoreStart) 376 ++A; 377 378 for (; A != Stop.getParent()->rend() && A != Stop; ++A) { 379 LR.stepBackward(*A); 380 } 381 382 return !LR.available(*MRI, Reg); 383 } 384 385 // Determine if a register Reg is not re-defined and still in use 386 // in the range (Stop..BB.end]. 387 bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop, 388 MCRegister Reg) const { 389 return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true); 390 } 391 392 // Optimize sequences emitted for control flow lowering. They are originally 393 // emitted as the separate operations because spill code may need to be 394 // inserted for the saved copy of exec. 395 // 396 // x = copy exec 397 // z = s_<op>_b64 x, y 398 // exec = copy z 399 // => 400 // x = s_<op>_saveexec_b64 y 401 // 402 bool SIOptimizeExecMasking::optimizeExecSequence() { 403 bool Changed = false; 404 for (MachineBasicBlock &MBB : *MF) { 405 MachineBasicBlock::reverse_iterator I = fixTerminators(MBB); 406 MachineBasicBlock::reverse_iterator E = MBB.rend(); 407 if (I == E) 408 continue; 409 410 // It's possible to see other terminator copies after the exec copy. This 411 // can happen if control flow pseudos had their outputs used by phis. 412 Register CopyToExec; 413 414 unsigned SearchCount = 0; 415 const unsigned SearchLimit = 5; 416 while (I != E && SearchCount++ < SearchLimit) { 417 CopyToExec = isCopyToExec(*I); 418 if (CopyToExec) 419 break; 420 ++I; 421 } 422 423 if (!CopyToExec) 424 continue; 425 426 // Scan backwards to find the def. 427 auto *CopyToExecInst = &*I; 428 auto CopyFromExecInst = findExecCopy(MBB, I, CopyToExec); 429 if (CopyFromExecInst == E) { 430 auto PrepareExecInst = std::next(I); 431 if (PrepareExecInst == E) 432 continue; 433 // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec 434 if (CopyToExecInst->getOperand(1).isKill() && 435 isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) { 436 LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst); 437 438 PrepareExecInst->getOperand(0).setReg(Exec); 439 440 LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n'); 441 442 CopyToExecInst->eraseFromParent(); 443 Changed = true; 444 } 445 446 continue; 447 } 448 449 if (isLiveOut(MBB, CopyToExec)) { 450 // The copied register is live out and has a second use in another block. 451 LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n"); 452 continue; 453 } 454 455 Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg(); 456 MachineInstr *SaveExecInst = nullptr; 457 SmallVector<MachineInstr *, 4> OtherUseInsts; 458 459 for (MachineBasicBlock::iterator 460 J = std::next(CopyFromExecInst->getIterator()), 461 JE = I->getIterator(); 462 J != JE; ++J) { 463 if (SaveExecInst && J->readsRegister(Exec, TRI)) { 464 LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n'); 465 // Make sure this is inserted after any VALU ops that may have been 466 // scheduled in between. 467 SaveExecInst = nullptr; 468 break; 469 } 470 471 bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI); 472 473 if (J->modifiesRegister(CopyToExec, TRI)) { 474 if (SaveExecInst) { 475 LLVM_DEBUG(dbgs() << "Multiple instructions modify " 476 << printReg(CopyToExec, TRI) << '\n'); 477 SaveExecInst = nullptr; 478 break; 479 } 480 481 unsigned SaveExecOp = getSaveExecOp(J->getOpcode()); 482 if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END) 483 break; 484 485 if (ReadsCopyFromExec) { 486 SaveExecInst = &*J; 487 LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n'); 488 continue; 489 } else { 490 LLVM_DEBUG(dbgs() 491 << "Instruction does not read exec copy: " << *J << '\n'); 492 break; 493 } 494 } else if (ReadsCopyFromExec && !SaveExecInst) { 495 // Make sure no other instruction is trying to use this copy, before it 496 // will be rewritten by the saveexec, i.e. hasOneUse. There may have 497 // been another use, such as an inserted spill. For example: 498 // 499 // %sgpr0_sgpr1 = COPY %exec 500 // spill %sgpr0_sgpr1 501 // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1 502 // 503 LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J 504 << '\n'); 505 break; 506 } 507 508 if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) { 509 assert(SaveExecInst != &*J); 510 OtherUseInsts.push_back(&*J); 511 } 512 } 513 514 if (!SaveExecInst) 515 continue; 516 517 LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n'); 518 519 MachineOperand &Src0 = SaveExecInst->getOperand(1); 520 MachineOperand &Src1 = SaveExecInst->getOperand(2); 521 522 MachineOperand *OtherOp = nullptr; 523 524 if (Src0.isReg() && Src0.getReg() == CopyFromExec) { 525 OtherOp = &Src1; 526 } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) { 527 if (!SaveExecInst->isCommutable()) 528 break; 529 530 OtherOp = &Src0; 531 } else 532 llvm_unreachable("unexpected"); 533 534 CopyFromExecInst->eraseFromParent(); 535 536 auto InsPt = SaveExecInst->getIterator(); 537 const DebugLoc &DL = SaveExecInst->getDebugLoc(); 538 539 BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())), 540 CopyFromExec) 541 .addReg(OtherOp->getReg()); 542 SaveExecInst->eraseFromParent(); 543 544 CopyToExecInst->eraseFromParent(); 545 546 for (MachineInstr *OtherInst : OtherUseInsts) { 547 OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister, 548 *TRI); 549 } 550 551 Changed = true; 552 } 553 554 return Changed; 555 } 556 557 // Inserts the optimized s_mov_b32 / v_cmpx sequence based on the 558 // operands extracted from a v_cmp ..., s_and_saveexec pattern. 559 bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence( 560 MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const { 561 const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode()); 562 563 if (NewOpcode == -1) 564 return false; 565 566 MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0); 567 MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1); 568 569 Register MoveDest = SaveExecInstr.getOperand(0).getReg(); 570 571 MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator(); 572 if (!SaveExecInstr.uses().empty()) { 573 bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32; 574 unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 575 BuildMI(*SaveExecInstr.getParent(), InsertPosIt, 576 SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest) 577 .addReg(Exec); 578 } 579 580 // Omit dst as V_CMPX is implicitly writing to EXEC. 581 // Add dummy src and clamp modifiers, if needed. 582 auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt), 583 VCmp.getDebugLoc(), TII->get(NewOpcode)); 584 585 auto TryAddImmediateValueFromNamedOperand = 586 [&](unsigned OperandName) -> void { 587 if (auto *Mod = TII->getNamedOperand(VCmp, OperandName)) 588 Builder.addImm(Mod->getImm()); 589 }; 590 591 TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers); 592 Builder.add(*Src0); 593 594 TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers); 595 Builder.add(*Src1); 596 597 TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp); 598 599 // The kill flags may no longer be correct. 600 if (Src0->isReg()) 601 MRI->clearKillFlags(Src0->getReg()); 602 if (Src1->isReg()) 603 MRI->clearKillFlags(Src1->getReg()); 604 605 SaveExecInstr.eraseFromParent(); 606 VCmp.eraseFromParent(); 607 608 return true; 609 } 610 611 // Record (on GFX10.3 and later) occurences of 612 // v_cmp_* SGPR, IMM, VGPR 613 // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR 614 // to be replaced with 615 // s_mov_b32 EXEC_SGPR_DEST, exec_lo 616 // v_cmpx_* IMM, VGPR 617 // to reduce pipeline stalls. 618 void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence( 619 MachineInstr &MI) { 620 if (!ST->hasGFX10_3Insts()) 621 return; 622 623 const unsigned AndSaveExecOpcode = 624 ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; 625 626 if (MI.getOpcode() != AndSaveExecOpcode) 627 return; 628 629 Register SaveExecDest = MI.getOperand(0).getReg(); 630 if (!TRI->isSGPRReg(*MRI, SaveExecDest)) 631 return; 632 633 MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 634 if (!SaveExecSrc0->isReg()) 635 return; 636 637 // Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec 638 // sequence by looking at an instance of a s_and_saveexec instruction. Returns 639 // a pointer to the v_cmp instruction if it is safe to replace the sequence 640 // (see the conditions in the function body). This is after register 641 // allocation, so some checks on operand dependencies need to be considered. 642 MachineInstr *VCmp = nullptr; 643 644 // Try to find the last v_cmp instruction that defs the saveexec input 645 // operand without any write to Exec or the saveexec input operand inbetween. 646 VCmp = findInstrBackwards( 647 MI, 648 [&](MachineInstr *Check) { 649 return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 && 650 Check->modifiesRegister(SaveExecSrc0->getReg(), TRI); 651 }, 652 {Exec, SaveExecSrc0->getReg()}); 653 654 if (!VCmp) 655 return; 656 657 MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst); 658 assert(VCmpDest && "Should have an sdst operand!"); 659 660 // Check if any of the v_cmp source operands is written by the saveexec. 661 MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0); 662 if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) && 663 MI.modifiesRegister(Src0->getReg(), TRI)) 664 return; 665 666 MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1); 667 if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) && 668 MI.modifiesRegister(Src1->getReg(), TRI)) 669 return; 670 671 // Don't do the transformation if the destination operand is included in 672 // it's MBB Live-outs, meaning it's used in any of it's successors, leading 673 // to incorrect code if the v_cmp and therefore the def of 674 // the dest operand is removed. 675 if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg())) 676 return; 677 678 // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the 679 // s_and_saveexec, skip the optimization. 680 if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) || 681 isRegisterInUseAfter(MI, VCmpDest->getReg())) 682 return; 683 684 // Try to determine if there is a write to any of the VCmp 685 // operands between the saveexec and the vcmp. 686 // If yes, additional VGPR spilling might need to be inserted. In this case, 687 // it's not worth replacing the instruction sequence. 688 SmallVector<MCRegister, 2> NonDefRegs; 689 if (Src0->isReg()) 690 NonDefRegs.push_back(Src0->getReg()); 691 692 if (Src1->isReg()) 693 NonDefRegs.push_back(Src1->getReg()); 694 695 if (!findInstrBackwards( 696 MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs)) 697 return; 698 699 if (VCmp) 700 SaveExecVCmpMapping[&MI] = VCmp; 701 } 702 703 // Record occurences of 704 // s_or_saveexec s_o, s_i 705 // s_xor exec, exec, s_o 706 // to be replaced with 707 // s_andn2_saveexec s_o, s_i. 708 void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) { 709 const unsigned XorOpcode = 710 ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64; 711 712 if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) { 713 const MachineOperand &XorDst = MI.getOperand(0); 714 const MachineOperand &XorSrc0 = MI.getOperand(1); 715 const MachineOperand &XorSrc1 = MI.getOperand(2); 716 717 if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() && 718 XorSrc1.isReg() && 719 (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) { 720 const unsigned OrSaveexecOpcode = ST->isWave32() 721 ? AMDGPU::S_OR_SAVEEXEC_B32 722 : AMDGPU::S_OR_SAVEEXEC_B64; 723 724 // Peek at the previous instruction and check if this is a relevant 725 // s_or_saveexec instruction. 726 MachineInstr &PossibleOrSaveexec = *MI.getPrevNode(); 727 if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode) 728 return; 729 730 const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0); 731 const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1); 732 if (OrDst.isReg() && OrSrc0.isReg()) { 733 if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) || 734 (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) { 735 OrXors.emplace_back(&PossibleOrSaveexec, &MI); 736 } 737 } 738 } 739 } 740 } 741 742 bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() { 743 if (OrXors.empty()) { 744 return false; 745 } 746 747 bool Changed = false; 748 const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32 749 : AMDGPU::S_ANDN2_SAVEEXEC_B64; 750 751 for (const auto &Pair : OrXors) { 752 MachineInstr *Or = nullptr; 753 MachineInstr *Xor = nullptr; 754 std::tie(Or, Xor) = Pair; 755 BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(), 756 TII->get(Andn2Opcode), Or->getOperand(0).getReg()) 757 .addReg(Or->getOperand(1).getReg()); 758 759 Or->eraseFromParent(); 760 Xor->eraseFromParent(); 761 762 Changed = true; 763 } 764 765 return Changed; 766 } 767 768 bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { 769 if (skipFunction(MF.getFunction())) 770 return false; 771 772 this->MF = &MF; 773 ST = &MF.getSubtarget<GCNSubtarget>(); 774 TRI = ST->getRegisterInfo(); 775 TII = ST->getInstrInfo(); 776 MRI = &MF.getRegInfo(); 777 Exec = TRI->getExec(); 778 779 bool Changed = optimizeExecSequence(); 780 781 OrXors.clear(); 782 SaveExecVCmpMapping.clear(); 783 static unsigned SearchWindow = 10; 784 for (MachineBasicBlock &MBB : MF) { 785 unsigned SearchCount = 0; 786 787 for (auto &MI : llvm::reverse(MBB)) { 788 if (MI.isDebugInstr()) 789 continue; 790 791 if (SearchCount >= SearchWindow) { 792 break; 793 } 794 795 tryRecordOrSaveexecXorSequence(MI); 796 tryRecordVCmpxAndSaveexecSequence(MI); 797 798 if (MI.modifiesRegister(Exec, TRI)) { 799 break; 800 } 801 802 ++SearchCount; 803 } 804 } 805 806 Changed |= optimizeOrSaveexecXorSequences(); 807 for (const auto &Entry : SaveExecVCmpMapping) { 808 MachineInstr *SaveExecInstr = Entry.getFirst(); 809 MachineInstr *VCmpInstr = Entry.getSecond(); 810 811 Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec); 812 } 813 814 return Changed; 815 } 816