//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

class SIOptimizeExecMasking : public MachineFunctionPass {
public:
  static char ID;

public:
  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMasking::ID = 0;

char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;

/// If \p MI is a copy from exec, return the register copied to.
static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() &&
        Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC))
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
static Register isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() &&
        Dst.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) &&
        MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}

/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}

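// Map an exec-mask logical opcode to its *_SAVEEXEC_* form, or
// INSTRUCTION_LIST_END if no such form exists.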
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
    return false;
  }
}

// Turn all pseudoterminators in the block into their equivalent non-terminator
// instructions. Returns the reverse iterator to the first non-terminator
// instruction in the block.
static MachineBasicBlock::reverse_iterator fixTerminators(
  const SIInstrInfo &TII,
  MachineBasicBlock &MBB) {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(TII, *I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}

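// Scan backwards from \p I, at most InstLimit instructions, looking for the
// copy from exec that feeds this exec mask sequence. Returns MBB.rend() if no
// such copy is found within the limit.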
static MachineBasicBlock::reverse_iterator findExecCopy(
  const SIInstrInfo &TII,
  const GCNSubtarget &ST,
  MachineBasicBlock &MBB,
  MachineBasicBlock::reverse_iterator I,
  unsigned CopyToExec) {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I, ST);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

// XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
// report the register as unavailable because a super-register with a lane mask
// is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}

// Iterate backwards from Origin (for at most MaxInstructions iterations) until
// either the beginning of the BB is reached or Pred evaluates to true - which
// can be an arbitrary condition based on the current MachineInstr, for
// instance a specific target instruction. Returns nullptr prematurely if one
// of the registers given in NonModifiableRegs is modified by the current
// instruction.
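// For example, the v_cmp/s_and_saveexec matching below uses this to walk back
// from the saveexec to a v_cmp that has a v_cmpx form, while making sure that
// neither exec nor the saveexec source operand is clobbered on the way.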
static MachineInstr *
findInstrBackwards(MachineInstr &Origin,
                   std::function<bool(MachineInstr *)> Pred,
                   ArrayRef<MCRegister> NonModifiableRegs,
                   const SIRegisterInfo *TRI, unsigned MaxInstructions = 20) {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;

    if (Pred(&*A))
      return &*A;

    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;
    }

    ++CurrentIteration;
  }

  return nullptr;
}


// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..Start].
// It does so by calculating liveness backwards from Start until either Stop
// or the beginning of the BB is reached.
// After liveness is calculated, we can determine if Reg is still in use and
// not defined in between the instructions.
static bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                                   MCRegister Reg, const SIRegisterInfo *TRI,
                                   MachineRegisterInfo &MRI,
                                   bool useLiveOuts = false,
                                   bool ignoreStart = false) {
  LivePhysRegs LR(*TRI);
  if (useLiveOuts)
    LR.addLiveOuts(*Stop.getParent());

  MachineBasicBlock::reverse_iterator A(Start);
  MachineBasicBlock::reverse_iterator E(Stop);

  if (ignoreStart)
    ++A;

  for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
    LR.stepBackward(*A);
  }

  return !LR.available(MRI, Reg);
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..BB.end].
static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
                                 const SIRegisterInfo *TRI,
                                 MachineRegisterInfo &MRI) {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, TRI,
                                MRI, true);
}

// Tries to find an opportunity to optimize a v_cmp ..., s_and_saveexec
// sequence by looking at an instance of an s_and_saveexec instruction. Returns
// a pointer to the v_cmp instruction if it is safe to replace the sequence
// (see the conditions in the function body). This runs after register
// allocation, so some checks on operand dependencies need to be considered.
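// The candidate sequence looks roughly like (wave32 shown):
//   v_cmp_* SGPR, ..., VGPR
//   s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR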
static MachineInstr *findPossibleVCMPVCMPXOptimization(
    MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
    const SIInstrInfo *TII, MachineRegisterInfo &MRI) {

  MachineInstr *VCmp = nullptr;

  Register SaveExecDest = SaveExec.getOperand(0).getReg();
  if (!TRI->isSGPRReg(MRI, SaveExecDest))
    return nullptr;

  MachineOperand *SaveExecSrc0 =
      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return nullptr;

  // Try to find the last v_cmp instruction that defs the saveexec input
  // operand without any write to Exec or the saveexec input operand in
  // between.
  VCmp = findInstrBackwards(
      SaveExec,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()}, TRI);

  if (!VCmp)
    return nullptr;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // Check if any of the v_cmp source operands is written by the saveexec.
  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
  if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
      SaveExec.modifiesRegister(Src0->getReg(), TRI))
    return nullptr;

  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
  if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
      SaveExec.modifiesRegister(Src1->getReg(), TRI))
    return nullptr;

  // Don't do the transformation if the destination operand is included in
  // its MBB live-outs, meaning it's used in any of its successors, leading
  // to incorrect code if the v_cmp and therefore the def of
  // the dest operand is removed.
  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
    return nullptr;

  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
  // s_and_saveexec, skip the optimization.
  if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), TRI, MRI,
                             false, true) ||
      isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI))
    return nullptr;

  // Try to determine if there is a write to any of the VCmp
  // operands between the saveexec and the vcmp.
  // If yes, additional VGPR spilling might need to be inserted. In this case,
  // it's not worth replacing the instruction sequence.
  SmallVector<MCRegister, 2> NonDefRegs;
  if (Src0->isReg())
    NonDefRegs.push_back(Src0->getReg());

  if (Src1->isReg())
    NonDefRegs.push_back(Src1->getReg());

  if (!findInstrBackwards(
          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
          NonDefRegs, TRI))
    return nullptr;

  return VCmp;
}

// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
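// Roughly, the emitted replacement is (wave32 shown):
//   s_mov_b32 EXEC_SGPR_DEST, exec_lo   (only if the saveexec result is used)
//   v_cmpx_* ...                        (writes exec implicitly)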
static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                         MachineInstr &VCmp, MCRegister Exec,
                                         const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         MachineRegisterInfo &MRI) {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());

  if (NewOpcode == -1)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();

  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
  if (!SaveExecInstr.uses().empty()) {
    bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
    unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit dst as V_CMPX implicitly writes to EXEC.
  // Add dummy src and clamp modifiers, if needed.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](unsigned OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*Src0);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*Src1);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);

  // The kill flags may no longer be correct.
  if (Src0->isReg())
    MRI.clearKillFlags(Src0->getReg());
  if (Src1->isReg())
    MRI.clearKillFlags(Src1->getReg());

  return true;
}

bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo *MRI = &MF.getRegInfo();
  MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  // Optimize sequences emitted for control flow lowering. They are originally
  // emitted as separate operations because spill code may need to be
  // inserted for the saved copy of exec.
  //
  //     x = copy exec
  //     z = s_<op>_b64 x, y
  //     exec = copy z
  // =>
  //     x = s_<op>_saveexec_b64 y
  //

  bool Changed = false;
  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy. This
    // can happen if control flow pseudos had their outputs used by phis.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I, ST);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(Exec);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
        Changed = true;
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator J
           = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        } else {
          LLVM_DEBUG(dbgs()
                     << "Instruction does not read exec copy: " << *J << '\n');
          break;
        }
      } else if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy before it
        // is rewritten by the saveexec, i.e. hasOneUse. There may have
        // been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
      .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec,
                                    AMDGPU::NoSubRegister, *TRI);
    }

    Changed = true;
  }

  // After all s_op_saveexec instructions are inserted,
  // replace (on GFX10.3 and later)
  // v_cmp_* SGPR, IMM, VGPR
  // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
  // with
  // s_mov_b32 EXEC_SGPR_DEST, exec_lo
  // v_cmpx_* IMM, VGPR
  // to reduce pipeline stalls.
  if (ST.hasGFX10_3Insts()) {
    DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
    const unsigned AndSaveExecOpcode =
        ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        // Record relevant v_cmp / s_and_saveexec instruction pairs for
        // replacement.
        if (MI.getOpcode() != AndSaveExecOpcode)
          continue;

        if (MachineInstr *VCmp =
                findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
          SaveExecVCmpMapping[&MI] = VCmp;
      }
    }

    for (const auto &Entry : SaveExecVCmpMapping) {
      MachineInstr *SaveExecInstr = Entry.getFirst();
      MachineInstr *VCmpInstr = Entry.getSecond();

      if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
                                       TRI, *MRI)) {
        SaveExecInstr->eraseFromParent();
        VCmpInstr->eraseFromParent();

        Changed = true;
      }
    }
  }

  return Changed;
}