//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

class SIOptimizeExecMasking : public MachineFunctionPass {
  MachineFunction *MF = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  MCRegister Exec;

  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
  SmallVector<MachineOperand *, 1> KillFlagCandidates;

  Register isCopyFromExec(const MachineInstr &MI) const;
  Register isCopyToExec(const MachineInstr &MI) const;
  bool removeTerminatorBit(MachineInstr &MI) const;
  MachineBasicBlock::reverse_iterator
  fixTerminators(MachineBasicBlock &MBB) const;
  MachineBasicBlock::reverse_iterator
  findExecCopy(MachineBasicBlock &MBB,
               MachineBasicBlock::reverse_iterator I) const;
  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
  MachineInstr *findInstrBackwards(
      MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
      ArrayRef<MCRegister> NonModifiableRegs,
      MachineInstr *Terminator = nullptr,
      SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
      unsigned MaxInstructions = 20) const;
  bool optimizeExecSequence();
  void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                    MachineInstr &VCmp, MCRegister Exec) const;

  void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
  bool optimizeOrSaveexecXorSequences();

public:
  static char ID;

  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMasking::ID = 0;

char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;

/// If \p MI is a copy from exec, return the register copied to.
Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == Exec)
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}

/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}

static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
    return false;
  }
}
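
// For illustration (register names here are arbitrary), a pseudo-terminator
// such as
//
//     %sgpr0_sgpr1 = S_MOV_B64_term %exec
//
// is rewritten in place to
//
//     %sgpr0_sgpr1 = COPY %exec
//
// (or to a plain S_MOV_B64 when the source is an immediate), so the rest of
// this pass can treat it like an ordinary copy.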

// Turn all pseudoterminators in the block into their equivalent non-terminator
// instructions. Returns the reverse iterator to the first non-terminator
// instruction in the block.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}

MachineBasicBlock::reverse_iterator SIOptimizeExecMasking::findExecCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I) const {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

// XXX - Seems LiveRegUnits doesn't work correctly since it will incorrectly
// report the register as unavailable because a super-register with a lane mask
// is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}

// Backwards-iterate from Origin (for n=MaxInstructions iterations) until
// either the beginning of the BB is reached or Pred evaluates to true - which
// can be an arbitrary condition based on the current MachineInstr, for
// instance a target instruction. Breaks prematurely by returning nullptr if
// one of the registers given in NonModifiableRegs is modified by the current
// instruction.
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
    ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
    SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
    unsigned MaxInstructions) const {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;

    if (Pred(&*A))
      return &*A;

    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;

      // Check for kills that appear after the terminator instruction, that
      // would not be detected by clearKillFlags, since they will cause the
      // register to be dead at a later place, causing the verifier to fail.
      // We use the candidates to clear the kill flags later.
      if (Terminator && KillFlagCandidates && A != Terminator &&
          A->killsRegister(Reg, TRI)) {
        for (MachineOperand &MO : A->operands()) {
          if (MO.isReg() && MO.isKill()) {
            Register Candidate = MO.getReg();
            if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg))
              KillFlagCandidates->push_back(&MO);
          }
        }
      }
    }

    ++CurrentIteration;
  }

  return nullptr;
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..Start].
// It does so by calculating liveness backwards from the end of the BB until
// either Stop or the beginning of the BB is reached.
// After liveness is calculated, we can determine if Reg is still in use and
// not defined in between the instructions.
bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
                                                   MachineInstr &Start,
                                                   MCRegister Reg,
                                                   bool UseLiveOuts,
                                                   bool IgnoreStart) const {
  LiveRegUnits LR(*TRI);
  if (UseLiveOuts)
    LR.addLiveOuts(*Stop.getParent());

  MachineBasicBlock::reverse_iterator A(Start);

  if (IgnoreStart)
    ++A;

  for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
    LR.stepBackward(*A);
  }

  return !LR.available(Reg) || MRI->isReserved(Reg);
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..BB.end].
bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
                                                 MCRegister Reg) const {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}

// Optimize sequences emitted for control flow lowering. They are originally
// emitted as separate operations because spill code may need to be inserted
// for the saved copy of exec.
//
//     x = copy exec
//     z = s_<op>_b64 x, y
//     exec = copy z
// =>
//     x = s_<op>_saveexec_b64 y
//
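// For example (illustrative registers; implicit operands omitted), a typical
// instance after control flow lowering is
//
//     %sgpr2_sgpr3 = COPY %exec
//     %sgpr0_sgpr1 = S_AND_B64 %sgpr2_sgpr3, %vcc
//     %exec = COPY %sgpr0_sgpr1
//
// which is folded into
//
//     %sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 %vcc
//
// with any remaining uses of %sgpr0_sgpr1 rewritten to use %exec.
//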
bool SIOptimizeExecMasking::optimizeExecSequence() {
  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy. This
    // can happen if control flow pseudos had their outputs used by phis.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(Exec);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
        Changed = true;
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        }
        LLVM_DEBUG(dbgs() << "Instruction does not read exec copy: " << *J
                          << '\n');
        break;
      }
      if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy, before it
        // will be rewritten by the saveexec, i.e. hasOneUse. There may have
        // been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
        .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
                                    *TRI);
    }

    Changed = true;
  }

  return Changed;
}

// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());

  if (NewOpcode == -1)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();

  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit dst as V_CMPX implicitly writes to EXEC.
  // Add dummy src and clamp modifiers, if needed.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](unsigned OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*Src0);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*Src1);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);

  // The kill flags may no longer be correct.
  if (Src0->isReg())
    MRI->clearKillFlags(Src0->getReg());
  if (Src1->isReg())
    MRI->clearKillFlags(Src1->getReg());

  for (MachineOperand *MO : KillFlagCandidates)
    MO->setIsKill(false);

  SaveExecInstr.eraseFromParent();
  VCmp.eraseFromParent();

  return true;
}

// Record (on GFX10.3 and later) occurrences of
// v_cmp_* SGPR, IMM, VGPR
// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
// to be replaced with
// s_mov_b32 EXEC_SGPR_DEST, exec_lo
// v_cmpx_* IMM, VGPR
// to reduce pipeline stalls.
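// For example (illustrative registers, wave32):
// v_cmp_lt_u32 s4, 8, v0
// s_and_saveexec_b32 s5, s4
// becomes
// s_mov_b32 s5, exec_lo
// v_cmpx_lt_u32 8, v0
// so the compare writes exec directly rather than first materializing the
// mask in an SGPR and then applying it with a separate saveexec.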
void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
    MachineInstr &MI) {
  if (!ST->hasGFX10_3Insts())
    return;

  const unsigned AndSaveExecOpcode =
      ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

  if (MI.getOpcode() != AndSaveExecOpcode)
    return;

  Register SaveExecDest = MI.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return;

  MachineOperand *SaveExecSrc0 =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return;

  // Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec
  // sequence by looking at an instance of an s_and_saveexec instruction.
  // Returns a pointer to the v_cmp instruction if it is safe to replace the
  // sequence (see the conditions in the function body). This is after register
  // allocation, so some checks on operand dependencies need to be considered.
  MachineInstr *VCmp = nullptr;

  // Try to find the last v_cmp instruction that defs the saveexec input
  // operand without any write to Exec or the saveexec input operand in
  // between.
  VCmp = findInstrBackwards(
      MI,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()});

  if (!VCmp)
    return;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // Check if any of the v_cmp source operands is written by the saveexec.
  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
      MI.modifiesRegister(Src0->getReg(), TRI))
    return;

  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
      MI.modifiesRegister(Src1->getReg(), TRI))
    return;

  // Don't do the transformation if the destination operand is included in
  // its MBB live-outs, meaning it's used in any of its successors, leading
  // to incorrect code if the v_cmp and therefore the def of
  // the dest operand is removed.
  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
    return;

  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after
  // the s_and_saveexec, skip the optimization.
  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
      isRegisterInUseAfter(MI, VCmpDest->getReg()))
    return;

  // Try to determine if there is a write to any of the VCmp
  // operands between the saveexec and the vcmp.
  // If yes, additional VGPR spilling might need to be inserted. In this case,
  // it's not worth replacing the instruction sequence.
  SmallVector<MCRegister, 2> NonDefRegs;
  if (Src0->isReg())
    NonDefRegs.push_back(Src0->getReg());

  if (Src1->isReg())
    NonDefRegs.push_back(Src1->getReg());

  if (!findInstrBackwards(
          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
          VCmp, &KillFlagCandidates))
    return;

  if (VCmp)
    SaveExecVCmpMapping[&MI] = VCmp;
}

// Record occurrences of
// s_or_saveexec s_o, s_i
// s_xor exec, exec, s_o
// to be replaced with
// s_andn2_saveexec s_o, s_i.
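// The two forms are equivalent: s_or_saveexec saves the old exec in s_o and
// sets exec = s_o | s_i, and xoring that result with the saved s_o leaves
// exec = s_i & ~s_o, which is what s_andn2_saveexec computes in a single
// instruction. For example (wave32, illustrative registers):
// s_or_saveexec_b32 s0, s1
// s_xor_b32 exec_lo, exec_lo, s0
// becomes
// s_andn2_saveexec_b32 s0, s1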
void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
  const unsigned XorOpcode =
      ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;

  if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
    const MachineOperand &XorDst = MI.getOperand(0);
    const MachineOperand &XorSrc0 = MI.getOperand(1);
    const MachineOperand &XorSrc1 = MI.getOperand(2);

    if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
        XorSrc1.isReg() &&
        (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
      const unsigned OrSaveexecOpcode = ST->isWave32()
                                            ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64;

      // Peek at the previous instruction and check if this is a relevant
      // s_or_saveexec instruction.
      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
      if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
        return;

      const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
      const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
      if (OrDst.isReg() && OrSrc0.isReg()) {
        if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
            (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
          OrXors.emplace_back(&PossibleOrSaveexec, &MI);
        }
      }
    }
  }
}

bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
  if (OrXors.empty()) {
    return false;
  }

  bool Changed = false;
  const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
                                              : AMDGPU::S_ANDN2_SAVEEXEC_B64;

  for (const auto &Pair : OrXors) {
    MachineInstr *Or = nullptr;
    MachineInstr *Xor = nullptr;
    std::tie(Or, Xor) = Pair;
    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
            TII->get(Andn2Opcode), Or->getOperand(0).getReg())
        .addReg(Or->getOperand(1).getReg());

    Or->eraseFromParent();
    Xor->eraseFromParent();

    Changed = true;
  }

  return Changed;
}

bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  ST = &MF.getSubtarget<GCNSubtarget>();
  TRI = ST->getRegisterInfo();
  TII = ST->getInstrInfo();
  MRI = &MF.getRegInfo();
  Exec = TRI->getExec();

  bool Changed = optimizeExecSequence();

  OrXors.clear();
  SaveExecVCmpMapping.clear();
  KillFlagCandidates.clear();
  static unsigned SearchWindow = 10;
  for (MachineBasicBlock &MBB : MF) {
    unsigned SearchCount = 0;

    for (auto &MI : llvm::reverse(MBB)) {
      if (MI.isDebugInstr())
        continue;

      if (SearchCount >= SearchWindow) {
        break;
      }

      tryRecordOrSaveexecXorSequence(MI);
      tryRecordVCmpxAndSaveexecSequence(MI);

      if (MI.modifiesRegister(Exec, TRI)) {
        break;
      }

      ++SearchCount;
    }
  }

  Changed |= optimizeOrSaveexecXorSequences();
  for (const auto &Entry : SaveExecVCmpMapping) {
    MachineInstr *SaveExecInstr = Entry.getFirst();
    MachineInstr *VCmpInstr = Entry.getSecond();

    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
  }

  return Changed;
}