xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp (revision 770cf0a5f02dc8983a89c6568d741fbc25baa999)
1 //===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass performs exec mask handling peephole optimizations which needs
11 /// to be done before register allocation to reduce register pressure.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "SIOptimizeExecMaskingPreRA.h"
16 #include "AMDGPU.h"
17 #include "GCNSubtarget.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "llvm/CodeGen/LiveIntervals.h"
20 #include "llvm/CodeGen/MachineFunctionPass.h"
21 #include "llvm/InitializePasses.h"
22 
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "si-optimize-exec-masking-pre-ra"
26 
27 namespace {
28 
29 class SIOptimizeExecMaskingPreRA {
30 private:
31   const SIRegisterInfo *TRI;
32   const SIInstrInfo *TII;
33   MachineRegisterInfo *MRI;
34   LiveIntervals *LIS;
35 
36   unsigned AndOpc;
37   unsigned Andn2Opc;
38   unsigned OrSaveExecOpc;
39   unsigned XorTermrOpc;
40   MCRegister CondReg;
41   MCRegister ExecReg;
42 
43   bool optimizeVcndVcmpPair(MachineBasicBlock &MBB);
44   bool optimizeElseBranch(MachineBasicBlock &MBB);
45 
46 public:
47   SIOptimizeExecMaskingPreRA(LiveIntervals *LIS) : LIS(LIS) {}
48   bool run(MachineFunction &MF);
49 };
50 
51 class SIOptimizeExecMaskingPreRALegacy : public MachineFunctionPass {
52 public:
53   static char ID;
54 
55   SIOptimizeExecMaskingPreRALegacy() : MachineFunctionPass(ID) {
56     initializeSIOptimizeExecMaskingPreRALegacyPass(
57         *PassRegistry::getPassRegistry());
58   }
59 
60   bool runOnMachineFunction(MachineFunction &MF) override;
61 
62   StringRef getPassName() const override {
63     return "SI optimize exec mask operations pre-RA";
64   }
65 
66   void getAnalysisUsage(AnalysisUsage &AU) const override {
67     AU.addRequired<LiveIntervalsWrapperPass>();
68     AU.setPreservesAll();
69     MachineFunctionPass::getAnalysisUsage(AU);
70   }
71 };
72 
73 } // End anonymous namespace.
74 
75 INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRALegacy, DEBUG_TYPE,
76                       "SI optimize exec mask operations pre-RA", false, false)
77 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
78 INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRALegacy, DEBUG_TYPE,
79                     "SI optimize exec mask operations pre-RA", false, false)
80 
81 char SIOptimizeExecMaskingPreRALegacy::ID = 0;
82 
83 char &llvm::SIOptimizeExecMaskingPreRAID = SIOptimizeExecMaskingPreRALegacy::ID;
84 
85 FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
86   return new SIOptimizeExecMaskingPreRALegacy();
87 }
88 
89 // See if there is a def between \p AndIdx and \p SelIdx that needs to live
90 // beyond \p AndIdx.
91 static bool isDefBetween(const LiveRange &LR, SlotIndex AndIdx,
92                          SlotIndex SelIdx) {
93   LiveQueryResult AndLRQ = LR.Query(AndIdx);
94   return (!AndLRQ.isKill() && AndLRQ.valueIn() != LR.Query(SelIdx).valueOut());
95 }
96 
97 // FIXME: Why do we bother trying to handle physical registers here?
98 static bool isDefBetween(const SIRegisterInfo &TRI,
99                          LiveIntervals *LIS, Register Reg,
100                          const MachineInstr &Sel, const MachineInstr &And) {
101   SlotIndex AndIdx = LIS->getInstructionIndex(And).getRegSlot();
102   SlotIndex SelIdx = LIS->getInstructionIndex(Sel).getRegSlot();
103 
104   if (Reg.isVirtual())
105     return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx);
106 
107   for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg())) {
108     if (isDefBetween(LIS->getRegUnit(Unit), AndIdx, SelIdx))
109       return true;
110   }
111 
112   return false;
113 }
114 
115 // Optimize sequence
116 //    %sel = V_CNDMASK_B32_e64 0, 1, %cc
117 //    %cmp = V_CMP_NE_U32 1, %sel
118 //    $vcc = S_AND_B64 $exec, %cmp
119 //    S_CBRANCH_VCC[N]Z
120 // =>
121 //    $vcc = S_ANDN2_B64 $exec, %cc
122 //    S_CBRANCH_VCC[N]Z
123 //
124 // It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the
125 // rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but
126 // only 3 first instructions are really needed. S_AND_B64 with exec is a
127 // required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
128 // lanes.
129 //
130 // Returns true on success.
131 bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
132   auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
133                            unsigned Opc = MI.getOpcode();
134                            return Opc == AMDGPU::S_CBRANCH_VCCZ ||
135                                   Opc == AMDGPU::S_CBRANCH_VCCNZ; });
136   if (I == MBB.terminators().end())
137     return false;
138 
139   auto *And =
140       TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, *I, *MRI, LIS);
141   if (!And || And->getOpcode() != AndOpc ||
142       !And->getOperand(1).isReg() || !And->getOperand(2).isReg())
143     return false;
144 
145   MachineOperand *AndCC = &And->getOperand(1);
146   Register CmpReg = AndCC->getReg();
147   unsigned CmpSubReg = AndCC->getSubReg();
148   if (CmpReg == Register(ExecReg)) {
149     AndCC = &And->getOperand(2);
150     CmpReg = AndCC->getReg();
151     CmpSubReg = AndCC->getSubReg();
152   } else if (And->getOperand(2).getReg() != Register(ExecReg)) {
153     return false;
154   }
155 
156   auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, *MRI, LIS);
157   if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 ||
158                 Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) ||
159       Cmp->getParent() != And->getParent())
160     return false;
161 
162   MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0);
163   MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1);
164   if (Op1->isImm() && Op2->isReg())
165     std::swap(Op1, Op2);
166   if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
167     return false;
168 
169   Register SelReg = Op1->getReg();
170   if (SelReg.isPhysical())
171     return false;
172 
173   auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, *MRI, LIS);
174   if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
175     return false;
176 
177   if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) ||
178       TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers))
179     return false;
180 
181   Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
182   Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
183   MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
184   if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
185       Op1->getImm() != 0 || Op2->getImm() != 1)
186     return false;
187 
188   Register CCReg = CC->getReg();
189 
190   // If there was a def between the select and the and, we would need to move it
191   // to fold this.
192   if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And))
193     return false;
194 
195   // Cannot safely mirror live intervals with PHI nodes, so check for these
196   // before optimization.
197   SlotIndex SelIdx = LIS->getInstructionIndex(*Sel);
198   LiveInterval *SelLI = &LIS->getInterval(SelReg);
199   if (llvm::any_of(SelLI->vnis(),
200                     [](const VNInfo *VNI) {
201                       return VNI->isPHIDef();
202                     }))
203     return false;
204 
205   // TODO: Guard against implicit def operands?
206   LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t'
207                     << *And);
208 
209   MachineInstr *Andn2 =
210       BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc),
211               And->getOperand(0).getReg())
212           .addReg(ExecReg)
213           .addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg());
214   MachineOperand &AndSCC = And->getOperand(3);
215   assert(AndSCC.getReg() == AMDGPU::SCC);
216   MachineOperand &Andn2SCC = Andn2->getOperand(3);
217   assert(Andn2SCC.getReg() == AMDGPU::SCC);
218   Andn2SCC.setIsDead(AndSCC.isDead());
219 
220   SlotIndex AndIdx = LIS->ReplaceMachineInstrInMaps(*And, *Andn2);
221   And->eraseFromParent();
222 
223   LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');
224 
225   // Update live intervals for CCReg before potentially removing CmpReg/SelReg,
226   // and their associated liveness information.
227   SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp);
228   if (CCReg.isVirtual()) {
229     LiveInterval &CCLI = LIS->getInterval(CCReg);
230     auto CCQ = CCLI.Query(SelIdx.getRegSlot());
231     if (CCQ.valueIn()) {
232       LIS->removeInterval(CCReg);
233       LIS->createAndComputeVirtRegInterval(CCReg);
234     }
235   } else
236     LIS->removeAllRegUnitsForPhysReg(CCReg);
237 
238   // Try to remove compare. Cmp value should not used in between of cmp
239   // and s_and_b64 if VCC or just unused if any other register.
240   LiveInterval *CmpLI = CmpReg.isVirtual() ? &LIS->getInterval(CmpReg) : nullptr;
241   if ((CmpLI && CmpLI->Query(AndIdx.getRegSlot()).isKill()) ||
242       (CmpReg == Register(CondReg) &&
243        std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
244                     [&](const MachineInstr &MI) {
245                       return MI.readsRegister(CondReg, TRI);
246                     }))) {
247     LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
248     if (CmpLI)
249       LIS->removeVRegDefAt(*CmpLI, CmpIdx.getRegSlot());
250     LIS->RemoveMachineInstrFromMaps(*Cmp);
251     Cmp->eraseFromParent();
252 
253     // Try to remove v_cndmask_b32.
254     // Kill status must be checked before shrinking the live range.
255     bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill();
256     LIS->shrinkToUses(SelLI);
257     bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
258     if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) {
259       LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
260 
261       LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
262       LIS->RemoveMachineInstrFromMaps(*Sel);
263       bool ShrinkSel = Sel->getOperand(0).readsReg();
264       Sel->eraseFromParent();
265       if (ShrinkSel) {
266         // The result of the V_CNDMASK was a subreg def which counted as a read
267         // from the other parts of the reg. Shrink their live ranges.
268         LIS->shrinkToUses(SelLI);
269       }
270     }
271   }
272 
273   return true;
274 }
275 
276 // Optimize sequence
277 //    %dst = S_OR_SAVEEXEC %src
278 //    ... instructions not modifying exec ...
279 //    %tmp = S_AND $exec, %dst
280 //    $exec = S_XOR_term $exec, %tmp
281 // =>
282 //    %dst = S_OR_SAVEEXEC %src
283 //    ... instructions not modifying exec ...
284 //    $exec = S_XOR_term $exec, %dst
285 //
286 // Clean up potentially unnecessary code added for safety during
287 // control flow lowering.
288 //
289 // Return whether any changes were made to MBB.
290 bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) {
291   if (MBB.empty())
292     return false;
293 
294   // Check this is an else block.
295   auto First = MBB.begin();
296   MachineInstr &SaveExecMI = *First;
297   if (SaveExecMI.getOpcode() != OrSaveExecOpc)
298     return false;
299 
300   auto I = llvm::find_if(MBB.terminators(), [this](const MachineInstr &MI) {
301     return MI.getOpcode() == XorTermrOpc;
302   });
303   if (I == MBB.terminators().end())
304     return false;
305 
306   MachineInstr &XorTermMI = *I;
307   if (XorTermMI.getOperand(1).getReg() != Register(ExecReg))
308     return false;
309 
310   Register SavedExecReg = SaveExecMI.getOperand(0).getReg();
311   Register DstReg = XorTermMI.getOperand(2).getReg();
312 
313   // Find potentially unnecessary S_AND
314   MachineInstr *AndExecMI = nullptr;
315   I--;
316   while (I != First && !AndExecMI) {
317     if (I->getOpcode() == AndOpc && I->getOperand(0).getReg() == DstReg &&
318         I->getOperand(1).getReg() == Register(ExecReg))
319       AndExecMI = &*I;
320     I--;
321   }
322   if (!AndExecMI)
323     return false;
324 
325   // Check for exec modifying instructions.
326   // Note: exec defs do not create live ranges beyond the
327   // instruction so isDefBetween cannot be used.
328   // Instead just check that the def segments are adjacent.
329   SlotIndex StartIdx = LIS->getInstructionIndex(SaveExecMI);
330   SlotIndex EndIdx = LIS->getInstructionIndex(*AndExecMI);
331   for (MCRegUnit Unit : TRI->regunits(ExecReg)) {
332     LiveRange &RegUnit = LIS->getRegUnit(Unit);
333     if (RegUnit.find(StartIdx) != std::prev(RegUnit.find(EndIdx)))
334       return false;
335   }
336 
337   // Remove unnecessary S_AND
338   LIS->removeInterval(SavedExecReg);
339   LIS->removeInterval(DstReg);
340 
341   SaveExecMI.getOperand(0).setReg(DstReg);
342 
343   LIS->RemoveMachineInstrFromMaps(*AndExecMI);
344   AndExecMI->eraseFromParent();
345 
346   LIS->createAndComputeVirtRegInterval(DstReg);
347 
348   return true;
349 }
350 
351 PreservedAnalyses
352 SIOptimizeExecMaskingPreRAPass::run(MachineFunction &MF,
353                                     MachineFunctionAnalysisManager &MFAM) {
354   auto &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);
355   SIOptimizeExecMaskingPreRA(&LIS).run(MF);
356   return PreservedAnalyses::all();
357 }
358 
359 bool SIOptimizeExecMaskingPreRALegacy::runOnMachineFunction(
360     MachineFunction &MF) {
361   if (skipFunction(MF.getFunction()))
362     return false;
363 
364   auto *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
365   return SIOptimizeExecMaskingPreRA(LIS).run(MF);
366 }
367 
368 bool SIOptimizeExecMaskingPreRA::run(MachineFunction &MF) {
369   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
370   TRI = ST.getRegisterInfo();
371   TII = ST.getInstrInfo();
372   MRI = &MF.getRegInfo();
373 
374   const bool Wave32 = ST.isWave32();
375   AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
376   Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
377   OrSaveExecOpc =
378       Wave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
379   XorTermrOpc = Wave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
380   CondReg = MCRegister::from(Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC);
381   ExecReg = MCRegister::from(Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
382 
383   DenseSet<Register> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
384   bool Changed = false;
385 
386   for (MachineBasicBlock &MBB : MF) {
387 
388     if (optimizeElseBranch(MBB)) {
389       RecalcRegs.insert(AMDGPU::SCC);
390       Changed = true;
391     }
392 
393     if (optimizeVcndVcmpPair(MBB)) {
394       RecalcRegs.insert(AMDGPU::VCC_LO);
395       RecalcRegs.insert(AMDGPU::VCC_HI);
396       RecalcRegs.insert(AMDGPU::SCC);
397       Changed = true;
398     }
399 
400     // Try to remove unneeded instructions before s_endpgm.
401     if (MBB.succ_empty()) {
402       if (MBB.empty())
403         continue;
404 
405       // Skip this if the endpgm has any implicit uses, otherwise we would need
406       // to be careful to update / remove them.
407       // S_ENDPGM always has a single imm operand that is not used other than to
408       // end up in the encoding
409       MachineInstr &Term = MBB.back();
410       if (Term.getOpcode() != AMDGPU::S_ENDPGM || Term.getNumOperands() != 1)
411         continue;
412 
413       SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});
414 
415       while (!Blocks.empty()) {
416         auto *CurBB = Blocks.pop_back_val();
417         auto I = CurBB->rbegin(), E = CurBB->rend();
418         if (I != E) {
419           if (I->isUnconditionalBranch() || I->getOpcode() == AMDGPU::S_ENDPGM)
420             ++I;
421           else if (I->isBranch())
422             continue;
423         }
424 
425         while (I != E) {
426           if (I->isDebugInstr()) {
427             I = std::next(I);
428             continue;
429           }
430 
431           if (I->mayStore() || I->isBarrier() || I->isCall() ||
432               I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef())
433             break;
434 
435           LLVM_DEBUG(dbgs()
436                      << "Removing no effect instruction: " << *I << '\n');
437 
438           for (auto &Op : I->operands()) {
439             if (Op.isReg())
440               RecalcRegs.insert(Op.getReg());
441           }
442 
443           auto Next = std::next(I);
444           LIS->RemoveMachineInstrFromMaps(*I);
445           I->eraseFromParent();
446           I = Next;
447 
448           Changed = true;
449         }
450 
451         if (I != E)
452           continue;
453 
454         // Try to ascend predecessors.
455         for (auto *Pred : CurBB->predecessors()) {
456           if (Pred->succ_size() == 1)
457             Blocks.push_back(Pred);
458         }
459       }
460       continue;
461     }
462 
463     // If the only user of a logical operation is move to exec, fold it now
464     // to prevent forming of saveexec. I.e.:
465     //
466     //    %0:sreg_64 = COPY $exec
467     //    %1:sreg_64 = S_AND_B64 %0:sreg_64, %2:sreg_64
468     // =>
469     //    %1 = S_AND_B64 $exec, %2:sreg_64
470     unsigned ScanThreshold = 10;
471     for (auto I = MBB.rbegin(), E = MBB.rend(); I != E
472          && ScanThreshold--; ++I) {
473       // Continue scanning if this is not a full exec copy
474       if (!(I->isFullCopy() && I->getOperand(1).getReg() == Register(ExecReg)))
475         continue;
476 
477       Register SavedExec = I->getOperand(0).getReg();
478       if (SavedExec.isVirtual() && MRI->hasOneNonDBGUse(SavedExec)) {
479         MachineInstr *SingleExecUser = &*MRI->use_instr_nodbg_begin(SavedExec);
480         int Idx = SingleExecUser->findRegisterUseOperandIdx(SavedExec,
481                                                             /*TRI=*/nullptr);
482         assert(Idx != -1);
483         if (SingleExecUser->getParent() == I->getParent() &&
484             !SingleExecUser->getOperand(Idx).isImplicit() &&
485             TII->isOperandLegal(*SingleExecUser, Idx, &I->getOperand(1))) {
486           LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n');
487           LIS->RemoveMachineInstrFromMaps(*I);
488           I->eraseFromParent();
489           MRI->replaceRegWith(SavedExec, ExecReg);
490           LIS->removeInterval(SavedExec);
491           Changed = true;
492         }
493       }
494       break;
495     }
496   }
497 
498   if (Changed) {
499     for (auto Reg : RecalcRegs) {
500       if (Reg.isVirtual()) {
501         LIS->removeInterval(Reg);
502         if (!MRI->reg_empty(Reg))
503           LIS->createAndComputeVirtRegInterval(Reg);
504       } else {
505         LIS->removeAllRegUnitsForPhysReg(Reg);
506       }
507     }
508   }
509 
510   return Changed;
511 }
512