xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (revision 2c2ec6bbc9cc7762a250ffe903bda6c2e44d25ff)
1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
13 /// The "strict" prefix indicates that inactive lanes do not take part in
14 /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
15 /// always be enabled irrespective of control flow decisions. Conversely in
/// non-strict WQM inactive lanes may influence control flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
19 /// with shader side effects (stores and atomics). It ensures that WQM is
20 /// enabled when necessary, but disabled around stores and atomics.
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 ///   S_MOV_B64 LiveMask, EXEC
25 ///   S_WQM_B64 EXEC, EXEC
26 ///
27 /// to enter WQM at the top of the function and surrounds blocks of Exact
28 /// instructions by
29 ///
30 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 ///   ...
32 ///   S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 ///   S_OR_SAVEEXEC_B64 Tmp, -1
38 ///   ...
39 ///   S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 ///  S_MOV_B64 Tmp, EXEC
46 ///  S_WQM_B64 EXEC, EXEC
47 ///  ...
48 ///  S_MOV_B64 EXEC, Tmp
49 ///
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (aka which instructions produce values that lead to derivative
53 /// computations).
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 ///  (1) at the top level (outside of control flow statements, and as long as
60 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
61 ///      the LiveMask (this is implemented for the entry block).
62 ///
63 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
64 ///      consist of exact and don't-care instructions, the switch only has to
65 ///      be done at the entry and exit points rather than potentially in each
66 ///      block of the region.
67 ///
68 //===----------------------------------------------------------------------===//
69 
70 #include "SIWholeQuadMode.h"
71 #include "AMDGPU.h"
72 #include "GCNSubtarget.h"
73 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
74 #include "llvm/ADT/MapVector.h"
75 #include "llvm/ADT/PostOrderIterator.h"
76 #include "llvm/CodeGen/LiveIntervals.h"
77 #include "llvm/CodeGen/MachineBasicBlock.h"
78 #include "llvm/CodeGen/MachineDominators.h"
79 #include "llvm/CodeGen/MachineFunctionPass.h"
80 #include "llvm/CodeGen/MachineInstr.h"
81 #include "llvm/CodeGen/MachinePostDominators.h"
82 #include "llvm/IR/CallingConv.h"
83 #include "llvm/InitializePasses.h"
84 #include "llvm/Support/raw_ostream.h"
85 
86 using namespace llvm;
87 
88 #define DEBUG_TYPE "si-wqm"
89 
90 namespace {
91 
/// Execution-mask states tracked by the analysis. Every state occupies its
/// own bit so that sets of states can be stored in a single char and combined
/// or tested with bitwise operators.
enum {
  StateWQM = 1 << 0,
  StateStrictWWM = 1 << 1,
  StateStrictWQM = 1 << 2,
  StateExact = 1 << 3,
  /// Either of the two strict states.
  StateStrict = StateStrictWWM | StateStrictWQM,
};
99 
/// Tag type wrapping a state bitmask so that it can be streamed in a readable
/// form in debug output.
struct PrintState {
  /// Bitwise combination of State* flags to print.
  int State;

  explicit PrintState(int State) : State{State} {}
};
106 
107 #ifndef NDEBUG
108 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
109 
110   static const std::pair<char, const char *> Mapping[] = {
111       std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
112       std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
113   char State = PS.State;
114   for (auto M : Mapping) {
115     if (State & M.first) {
116       OS << M.second;
117       State &= ~M.first;
118 
119       if (State)
120         OS << '|';
121     }
122   }
123   assert(State == 0);
124   return OS;
125 }
126 #endif
127 
/// Per-instruction analysis record. Every field is a bitmask of State* flags.
struct InstrInfo {
  /// States this instruction has to execute in.
  char Needs{0};
  /// States that are stripped from any request made for this instruction.
  char Disabled{0};
  /// States needed after this instruction, propagated backwards from later
  /// instructions and from the end of the block.
  char OutNeeds{0};
  /// Every state ever requested for this instruction, including disabled ones.
  char MarkedStates{0};
};
134 
/// Per-basic-block analysis record. The char fields are bitmasks of State*
/// flags.
struct BlockInfo {
  /// States needed by instructions inside the block.
  char Needs{0};
  /// States needed on entry to the block.
  char InNeeds{0};
  /// States needed on exit from the block.
  char OutNeeds{0};
  /// Not written by the analysis code in this part of the file.
  // NOTE(review): presumably the state the block starts in during lowering;
  // confirm against processBlock().
  char InitialState{0};
  /// Set when the block contains instructions that need explicit lowering
  /// (V_SET_INACTIVE_B32 or kill/demote pseudos).
  bool NeedsLowering{false};
};
142 
143 struct WorkItem {
144   MachineBasicBlock *MBB = nullptr;
145   MachineInstr *MI = nullptr;
146 
147   WorkItem() = default;
148   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
149   WorkItem(MachineInstr *MI) : MI(MI) {}
150 };
151 
152 class SIWholeQuadMode {
153 public:
154   SIWholeQuadMode(MachineFunction &MF, LiveIntervals *LIS,
155                   MachineDominatorTree *MDT, MachinePostDominatorTree *PDT)
156       : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
157         TRI(&TII->getRegisterInfo()), MRI(&MF.getRegInfo()), LIS(LIS), MDT(MDT),
158         PDT(PDT) {}
159   bool run(MachineFunction &MF);
160 
161 private:
162   const GCNSubtarget *ST;
163   const SIInstrInfo *TII;
164   const SIRegisterInfo *TRI;
165   MachineRegisterInfo *MRI;
166   LiveIntervals *LIS;
167   MachineDominatorTree *MDT;
168   MachinePostDominatorTree *PDT;
169 
170   unsigned AndOpc;
171   unsigned AndTermOpc;
172   unsigned AndN2Opc;
173   unsigned XorOpc;
174   unsigned AndSaveExecOpc;
175   unsigned AndSaveExecTermOpc;
176   unsigned WQMOpc;
177   Register Exec;
178   Register LiveMaskReg;
179 
180   DenseMap<const MachineInstr *, InstrInfo> Instructions;
181   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
182 
183   // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
184   DenseMap<const MachineInstr *, char> StateTransition;
185 
186   SmallVector<MachineInstr *, 2> LiveMaskQueries;
187   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
188   SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs;
189   SmallVector<MachineInstr *, 4> KillInstrs;
190   SmallVector<MachineInstr *, 4> InitExecInstrs;
191   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
192 
193   void printInfo();
194 
195   void markInstruction(MachineInstr &MI, char Flag,
196                        std::vector<WorkItem> &Worklist);
197   void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
198                 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
199   void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
200                    std::vector<WorkItem> &Worklist);
201   void markInstructionUses(const MachineInstr &MI, char Flag,
202                            std::vector<WorkItem> &Worklist);
203   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
204   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
205   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
206   char analyzeFunction(MachineFunction &MF);
207 
208   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
209                                       MachineBasicBlock::iterator Before);
210   MachineBasicBlock::iterator
211   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
212                    MachineBasicBlock::iterator Last, bool PreferLast,
213                    bool SaveSCC);
214   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
215                Register SaveWQM);
216   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
217              Register SavedWQM);
218   void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
219                     Register SaveOrig, char StrictStateNeeded);
220   void fromStrictMode(MachineBasicBlock &MBB,
221                       MachineBasicBlock::iterator Before, Register SavedOrig,
222                       char NonStrictState, char CurrentStrictState);
223 
224   void splitBlock(MachineInstr *TermMI);
225   MachineInstr *lowerKillI1(MachineInstr &MI, bool IsWQM);
226   MachineInstr *lowerKillF32(MachineInstr &MI);
227 
228   void lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI);
229   void processBlock(MachineBasicBlock &MBB, BlockInfo &BI, bool IsEntry);
230 
231   bool lowerLiveMaskQueries();
232   bool lowerCopyInstrs();
233   bool lowerKillInstrs(bool IsWQM);
234   void lowerInitExec(MachineInstr &MI);
235   MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
236                                                   bool &Changed);
237 };
238 
239 class SIWholeQuadModeLegacy : public MachineFunctionPass {
240 public:
241   static char ID;
242 
243   SIWholeQuadModeLegacy() : MachineFunctionPass(ID) {}
244 
245   bool runOnMachineFunction(MachineFunction &MF) override;
246 
247   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
248 
249   void getAnalysisUsage(AnalysisUsage &AU) const override {
250     AU.addRequired<LiveIntervalsWrapperPass>();
251     AU.addPreserved<SlotIndexesWrapperPass>();
252     AU.addPreserved<LiveIntervalsWrapperPass>();
253     AU.addPreserved<MachineDominatorTreeWrapperPass>();
254     AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
255     MachineFunctionPass::getAnalysisUsage(AU);
256   }
257 
258   MachineFunctionProperties getClearedProperties() const override {
259     return MachineFunctionProperties().setIsSSA();
260   }
261 };
262 } // end anonymous namespace
263 
264 char SIWholeQuadModeLegacy::ID = 0;
265 
266 INITIALIZE_PASS_BEGIN(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode",
267                       false, false)
268 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
269 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
270 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
271 INITIALIZE_PASS_END(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode",
272                     false, false)
273 
274 char &llvm::SIWholeQuadModeID = SIWholeQuadModeLegacy::ID;
275 
276 FunctionPass *llvm::createSIWholeQuadModeLegacyPass() {
277   return new SIWholeQuadModeLegacy;
278 }
279 
280 #ifndef NDEBUG
281 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
282   for (const auto &BII : Blocks) {
283     dbgs() << "\n"
284            << printMBBReference(*BII.first) << ":\n"
285            << "  InNeeds = " << PrintState(BII.second.InNeeds)
286            << ", Needs = " << PrintState(BII.second.Needs)
287            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
288 
289     for (const MachineInstr &MI : *BII.first) {
290       auto III = Instructions.find(&MI);
291       if (III != Instructions.end()) {
292         dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
293                << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
294       }
295     }
296   }
297 }
298 #endif
299 
300 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
301                                       std::vector<WorkItem> &Worklist) {
302   InstrInfo &II = Instructions[&MI];
303 
304   assert(!(Flag & StateExact) && Flag != 0);
305 
306   // Capture all states requested in marking including disabled ones.
307   II.MarkedStates |= Flag;
308 
309   // Remove any disabled states from the flag. The user that required it gets
310   // an undefined value in the helper lanes. For example, this can happen if
311   // the result of an atomic is used by instruction that requires WQM, where
312   // ignoring the request for WQM is correct as per the relevant specs.
313   Flag &= ~II.Disabled;
314 
315   // Ignore if the flag is already encompassed by the existing needs, or we
316   // just disabled everything.
317   if ((II.Needs & Flag) == Flag)
318     return;
319 
320   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
321   II.Needs |= Flag;
322   Worklist.emplace_back(&MI);
323 }
324 
325 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
326 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
327                                Register Reg, unsigned SubReg, char Flag,
328                                std::vector<WorkItem> &Worklist) {
329   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
330 
331   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
332   const VNInfo *Value = UseLRQ.valueIn();
333   if (!Value)
334     return;
335 
336   // Note: this code assumes that lane masks on AMDGPU completely
337   // cover registers.
338   const LaneBitmask UseLanes =
339       SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
340              : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
341                                 : LaneBitmask::getNone());
342 
343   // Perform a depth-first iteration of the LiveRange graph marking defs.
344   // Stop processing of a given branch when all use lanes have been defined.
345   // The first definition stops processing for a physical register.
346   struct PhiEntry {
347     const VNInfo *Phi;
348     unsigned PredIdx;
349     LaneBitmask DefinedLanes;
350 
351     PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
352         : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
353   };
354   using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
355   SmallVector<PhiEntry, 2> PhiStack;
356   SmallSet<VisitKey, 4> Visited;
357   LaneBitmask DefinedLanes;
358   unsigned NextPredIdx = 0; // Only used for processing phi nodes
359   do {
360     const VNInfo *NextValue = nullptr;
361     const VisitKey Key(Value, DefinedLanes);
362 
363     if (Visited.insert(Key).second) {
364       // On first visit to a phi then start processing first predecessor
365       NextPredIdx = 0;
366     }
367 
368     if (Value->isPHIDef()) {
369       // Each predecessor node in the phi must be processed as a subgraph
370       const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
371       assert(MBB && "Phi-def has no defining MBB");
372 
373       // Find next predecessor to process
374       unsigned Idx = NextPredIdx;
375       const auto *PI = MBB->pred_begin() + Idx;
376       const auto *PE = MBB->pred_end();
377       for (; PI != PE && !NextValue; ++PI, ++Idx) {
378         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
379           if (!Visited.count(VisitKey(VN, DefinedLanes)))
380             NextValue = VN;
381         }
382       }
383 
384       // If there are more predecessors to process; add phi to stack
385       if (PI != PE)
386         PhiStack.emplace_back(Value, Idx, DefinedLanes);
387     } else {
388       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
389       assert(MI && "Def has no defining instruction");
390 
391       if (Reg.isVirtual()) {
392         // Iterate over all operands to find relevant definitions
393         bool HasDef = false;
394         for (const MachineOperand &Op : MI->all_defs()) {
395           if (Op.getReg() != Reg)
396             continue;
397 
398           // Compute lanes defined and overlap with use
399           LaneBitmask OpLanes =
400               Op.isUndef() ? LaneBitmask::getAll()
401                            : TRI->getSubRegIndexLaneMask(Op.getSubReg());
402           LaneBitmask Overlap = (UseLanes & OpLanes);
403 
404           // Record if this instruction defined any of use
405           HasDef |= Overlap.any();
406 
407           // Mark any lanes defined
408           DefinedLanes |= OpLanes;
409         }
410 
411         // Check if all lanes of use have been defined
412         if ((DefinedLanes & UseLanes) != UseLanes) {
413           // Definition not complete; need to process input value
414           LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
415           if (const VNInfo *VN = LRQ.valueIn()) {
416             if (!Visited.count(VisitKey(VN, DefinedLanes)))
417               NextValue = VN;
418           }
419         }
420 
421         // Only mark the instruction if it defines some part of the use
422         if (HasDef)
423           markInstruction(*MI, Flag, Worklist);
424       } else {
425         // For physical registers simply mark the defining instruction
426         markInstruction(*MI, Flag, Worklist);
427       }
428     }
429 
430     if (!NextValue && !PhiStack.empty()) {
431       // Reach end of chain; revert to processing last phi
432       PhiEntry &Entry = PhiStack.back();
433       NextValue = Entry.Phi;
434       NextPredIdx = Entry.PredIdx;
435       DefinedLanes = Entry.DefinedLanes;
436       PhiStack.pop_back();
437     }
438 
439     Value = NextValue;
440   } while (Value);
441 }
442 
443 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
444                                   const MachineOperand &Op, char Flag,
445                                   std::vector<WorkItem> &Worklist) {
446   assert(Op.isReg());
447   Register Reg = Op.getReg();
448 
449   // Ignore some hardware registers
450   switch (Reg) {
451   case AMDGPU::EXEC:
452   case AMDGPU::EXEC_LO:
453     return;
454   default:
455     break;
456   }
457 
458   LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
459                     << " for " << MI);
460   if (Reg.isVirtual()) {
461     LiveRange &LR = LIS->getInterval(Reg);
462     markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
463   } else {
464     // Handle physical registers that we need to track; this is mostly relevant
465     // for VCC, which can appear as the (implicit) input of a uniform branch,
466     // e.g. when a loop counter is stored in a VGPR.
467     for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
468       LiveRange &LR = LIS->getRegUnit(Unit);
469       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
470       if (Value)
471         markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
472     }
473   }
474 }
475 
476 /// Mark all instructions defining the uses in \p MI with \p Flag.
477 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
478                                           std::vector<WorkItem> &Worklist) {
479   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
480                     << MI);
481 
482   for (const MachineOperand &Use : MI.all_uses())
483     markOperand(MI, Use, Flag, Worklist);
484 }
485 
486 // Scan instructions to determine which ones require an Exact execmask and
487 // which ones seed WQM requirements.
488 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
489                                        std::vector<WorkItem> &Worklist) {
490   char GlobalFlags = 0;
491   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
492   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
493   bool HasImplicitDerivatives =
494       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
495 
496   // We need to visit the basic blocks in reverse post-order so that we visit
497   // defs before uses, in particular so that we don't accidentally mark an
498   // instruction as needing e.g. WQM before visiting it and realizing it needs
499   // WQM disabled.
500   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
501   for (MachineBasicBlock *MBB : RPOT) {
502     BlockInfo &BBI = Blocks[MBB];
503 
504     for (MachineInstr &MI : *MBB) {
505       InstrInfo &III = Instructions[&MI];
506       unsigned Opcode = MI.getOpcode();
507       char Flags = 0;
508 
509       if (TII->isWQM(Opcode)) {
510         // If LOD is not supported WQM is not needed.
511         // Only generate implicit WQM if implicit derivatives are required.
512         // This avoids inserting unintended WQM if a shader type without
513         // implicit derivatives uses an image sampling instruction.
514         if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
515           // Sampling instructions don't need to produce results for all pixels
516           // in a quad, they just require all inputs of a quad to have been
517           // computed for derivatives.
518           markInstructionUses(MI, StateWQM, Worklist);
519           GlobalFlags |= StateWQM;
520         }
521       } else if (Opcode == AMDGPU::WQM) {
522         // The WQM intrinsic requires its output to have all the helper lanes
523         // correct, so we need it to be in WQM.
524         Flags = StateWQM;
525         LowerToCopyInstrs.insert(&MI);
526       } else if (Opcode == AMDGPU::SOFT_WQM) {
527         LowerToCopyInstrs.insert(&MI);
528         SoftWQMInstrs.push_back(&MI);
529       } else if (Opcode == AMDGPU::STRICT_WWM) {
530         // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
531         // it needs to be executed in WQM or Exact so that its copy doesn't
532         // clobber inactive lanes.
533         markInstructionUses(MI, StateStrictWWM, Worklist);
534         GlobalFlags |= StateStrictWWM;
535         LowerToMovInstrs.push_back(&MI);
536       } else if (Opcode == AMDGPU::STRICT_WQM ||
537                  TII->isDualSourceBlendEXP(MI)) {
538         // STRICT_WQM is similar to STRICTWWM, but instead of enabling all
539         // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in
540         // quads that have at least one active thread.
541         markInstructionUses(MI, StateStrictWQM, Worklist);
542         GlobalFlags |= StateStrictWQM;
543 
544         if (Opcode == AMDGPU::STRICT_WQM) {
545           LowerToMovInstrs.push_back(&MI);
546         } else {
547           // Dual source blend export acts as implicit strict-wqm, its sources
548           // need to be shuffled in strict wqm, but the export itself needs to
549           // run in exact mode.
550           BBI.Needs |= StateExact;
551           if (!(BBI.InNeeds & StateExact)) {
552             BBI.InNeeds |= StateExact;
553             Worklist.emplace_back(MBB);
554           }
555           GlobalFlags |= StateExact;
556           III.Disabled = StateWQM | StateStrict;
557         }
558       } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
559                  Opcode == AMDGPU::DS_PARAM_LOAD ||
560                  Opcode == AMDGPU::LDS_DIRECT_LOAD ||
561                  Opcode == AMDGPU::DS_DIRECT_LOAD) {
562         // Mark these STRICTWQM, but only for the instruction, not its operands.
563         // This avoid unnecessarily marking M0 as requiring WQM.
564         III.Needs |= StateStrictWQM;
565         GlobalFlags |= StateStrictWQM;
566       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
567         // Disable strict states; StrictWQM will be added as required later.
568         III.Disabled = StateStrict;
569         MachineOperand &Inactive = MI.getOperand(4);
570         if (Inactive.isReg()) {
571           if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
572             LowerToCopyInstrs.insert(&MI);
573           else
574             markOperand(MI, Inactive, StateStrictWWM, Worklist);
575         }
576         SetInactiveInstrs.push_back(&MI);
577         BBI.NeedsLowering = true;
578       } else if (TII->isDisableWQM(MI)) {
579         BBI.Needs |= StateExact;
580         if (!(BBI.InNeeds & StateExact)) {
581           BBI.InNeeds |= StateExact;
582           Worklist.emplace_back(MBB);
583         }
584         GlobalFlags |= StateExact;
585         III.Disabled = StateWQM | StateStrict;
586       } else if (Opcode == AMDGPU::SI_PS_LIVE ||
587                  Opcode == AMDGPU::SI_LIVE_MASK) {
588         LiveMaskQueries.push_back(&MI);
589       } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
590                  Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
591                  Opcode == AMDGPU::SI_DEMOTE_I1) {
592         KillInstrs.push_back(&MI);
593         BBI.NeedsLowering = true;
594       } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
595                  Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
596                  Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
597         InitExecInstrs.push_back(&MI);
598       } else if (WQMOutputs) {
599         // The function is in machine SSA form, which means that physical
600         // VGPRs correspond to shader inputs and outputs. Inputs are
601         // only used, outputs are only defined.
602         // FIXME: is this still valid?
603         for (const MachineOperand &MO : MI.defs()) {
604           Register Reg = MO.getReg();
605           if (Reg.isPhysical() &&
606               TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
607             Flags = StateWQM;
608             break;
609           }
610         }
611       }
612 
613       if (Flags) {
614         markInstruction(MI, Flags, Worklist);
615         GlobalFlags |= Flags;
616       }
617     }
618   }
619 
620   // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is
621   // ever used anywhere in the function. This implements the corresponding
622   // semantics of @llvm.amdgcn.set.inactive.
623   // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
624   if (GlobalFlags & StateWQM) {
625     for (MachineInstr *MI : SetInactiveInstrs)
626       markInstruction(*MI, StateWQM, Worklist);
627     for (MachineInstr *MI : SoftWQMInstrs)
628       markInstruction(*MI, StateWQM, Worklist);
629   }
630 
631   return GlobalFlags;
632 }
633 
634 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
635                                            std::vector<WorkItem>& Worklist) {
636   MachineBasicBlock *MBB = MI.getParent();
637   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
638   BlockInfo &BI = Blocks[MBB];
639 
640   // Control flow-type instructions and stores to temporary memory that are
641   // followed by WQM computations must themselves be in WQM.
642   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
643       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
644     Instructions[&MI].Needs = StateWQM;
645     II.Needs = StateWQM;
646   }
647 
648   // Propagate to block level
649   if (II.Needs & StateWQM) {
650     BI.Needs |= StateWQM;
651     if (!(BI.InNeeds & StateWQM)) {
652       BI.InNeeds |= StateWQM;
653       Worklist.emplace_back(MBB);
654     }
655   }
656 
657   // Propagate backwards within block
658   if (MachineInstr *PrevMI = MI.getPrevNode()) {
659     char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
660     if (!PrevMI->isPHI()) {
661       InstrInfo &PrevII = Instructions[PrevMI];
662       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
663         PrevII.OutNeeds |= InNeeds;
664         Worklist.emplace_back(PrevMI);
665       }
666     }
667   }
668 
669   // Propagate WQM flag to instruction inputs
670   assert(!(II.Needs & StateExact));
671 
672   if (II.Needs != 0)
673     markInstructionUses(MI, II.Needs, Worklist);
674 
675   // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
676   // not require any WQM transitions.
677   if (II.Needs & StateStrictWWM)
678     BI.Needs |= StateStrictWWM;
679   if (II.Needs & StateStrictWQM)
680     BI.Needs |= StateStrictWQM;
681 }
682 
683 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
684                                      std::vector<WorkItem>& Worklist) {
685   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
686 
687   // Propagate through instructions
688   if (!MBB.empty()) {
689     MachineInstr *LastMI = &*MBB.rbegin();
690     InstrInfo &LastII = Instructions[LastMI];
691     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
692       LastII.OutNeeds |= BI.OutNeeds;
693       Worklist.emplace_back(LastMI);
694     }
695   }
696 
697   // Predecessor blocks must provide for our WQM/Exact needs.
698   for (MachineBasicBlock *Pred : MBB.predecessors()) {
699     BlockInfo &PredBI = Blocks[Pred];
700     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
701       continue;
702 
703     PredBI.OutNeeds |= BI.InNeeds;
704     PredBI.InNeeds |= BI.InNeeds;
705     Worklist.emplace_back(Pred);
706   }
707 
708   // All successors must be prepared to accept the same set of WQM/Exact data.
709   for (MachineBasicBlock *Succ : MBB.successors()) {
710     BlockInfo &SuccBI = Blocks[Succ];
711     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
712       continue;
713 
714     SuccBI.InNeeds |= BI.OutNeeds;
715     Worklist.emplace_back(Succ);
716   }
717 }
718 
719 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
720   std::vector<WorkItem> Worklist;
721   char GlobalFlags = scanInstructions(MF, Worklist);
722 
723   while (!Worklist.empty()) {
724     WorkItem WI = Worklist.back();
725     Worklist.pop_back();
726 
727     if (WI.MI)
728       propagateInstruction(*WI.MI, Worklist);
729     else
730       propagateBlock(*WI.MBB, Worklist);
731   }
732 
733   return GlobalFlags;
734 }
735 
736 MachineBasicBlock::iterator
737 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
738                          MachineBasicBlock::iterator Before) {
739   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
740 
741   MachineInstr *Save =
742       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
743           .addReg(AMDGPU::SCC);
744   MachineInstr *Restore =
745       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
746           .addReg(SaveReg);
747 
748   LIS->InsertMachineInstrInMaps(*Save);
749   LIS->InsertMachineInstrInMaps(*Restore);
750   LIS->createAndComputeVirtRegInterval(SaveReg);
751 
752   return Restore;
753 }
754 
755 void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
756   MachineBasicBlock *BB = TermMI->getParent();
757   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
758                     << *TermMI << "\n");
759 
760   MachineBasicBlock *SplitBB =
761       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
762 
763   // Convert last instruction in block to a terminator.
764   // Note: this only covers the expected patterns
765   unsigned NewOpcode = 0;
766   switch (TermMI->getOpcode()) {
767   case AMDGPU::S_AND_B32:
768     NewOpcode = AMDGPU::S_AND_B32_term;
769     break;
770   case AMDGPU::S_AND_B64:
771     NewOpcode = AMDGPU::S_AND_B64_term;
772     break;
773   case AMDGPU::S_MOV_B32:
774     NewOpcode = AMDGPU::S_MOV_B32_term;
775     break;
776   case AMDGPU::S_MOV_B64:
777     NewOpcode = AMDGPU::S_MOV_B64_term;
778     break;
779   case AMDGPU::S_ANDN2_B32:
780     NewOpcode = AMDGPU::S_ANDN2_B32_term;
781     break;
782   case AMDGPU::S_ANDN2_B64:
783     NewOpcode = AMDGPU::S_ANDN2_B64_term;
784     break;
785   default:
786     llvm_unreachable("Unexpected instruction");
787   }
788 
789   // These terminators fallthrough to the next block, no need to add an
790   // unconditional branch to the next block (SplitBB).
791   TermMI->setDesc(TII->get(NewOpcode));
792 
793   if (SplitBB != BB) {
794     // Update dominator trees
795     using DomTreeT = DomTreeBase<MachineBasicBlock>;
796     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
797     for (MachineBasicBlock *Succ : SplitBB->successors()) {
798       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
799       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
800     }
801     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
802     if (MDT)
803       MDT->applyUpdates(DTUpdates);
804     if (PDT)
805       PDT->applyUpdates(DTUpdates);
806   }
807 }
808 
809 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
810   assert(LiveMaskReg.isVirtual());
811 
812   const DebugLoc &DL = MI.getDebugLoc();
813   unsigned Opcode = 0;
814 
815   assert(MI.getOperand(0).isReg());
816 
817   // Comparison is for live lanes; however here we compute the inverse
818   // (killed lanes).  This is because VCMP will always generate 0 bits
819   // for inactive lanes so a mask of live lanes would not be correct
820   // inside control flow.
821   // Invert the comparison by swapping the operands and adjusting
822   // the comparison codes.
823 
824   switch (MI.getOperand(2).getImm()) {
825   case ISD::SETUEQ:
826     Opcode = AMDGPU::V_CMP_LG_F32_e64;
827     break;
828   case ISD::SETUGT:
829     Opcode = AMDGPU::V_CMP_GE_F32_e64;
830     break;
831   case ISD::SETUGE:
832     Opcode = AMDGPU::V_CMP_GT_F32_e64;
833     break;
834   case ISD::SETULT:
835     Opcode = AMDGPU::V_CMP_LE_F32_e64;
836     break;
837   case ISD::SETULE:
838     Opcode = AMDGPU::V_CMP_LT_F32_e64;
839     break;
840   case ISD::SETUNE:
841     Opcode = AMDGPU::V_CMP_EQ_F32_e64;
842     break;
843   case ISD::SETO:
844     Opcode = AMDGPU::V_CMP_O_F32_e64;
845     break;
846   case ISD::SETUO:
847     Opcode = AMDGPU::V_CMP_U_F32_e64;
848     break;
849   case ISD::SETOEQ:
850   case ISD::SETEQ:
851     Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
852     break;
853   case ISD::SETOGT:
854   case ISD::SETGT:
855     Opcode = AMDGPU::V_CMP_NLT_F32_e64;
856     break;
857   case ISD::SETOGE:
858   case ISD::SETGE:
859     Opcode = AMDGPU::V_CMP_NLE_F32_e64;
860     break;
861   case ISD::SETOLT:
862   case ISD::SETLT:
863     Opcode = AMDGPU::V_CMP_NGT_F32_e64;
864     break;
865   case ISD::SETOLE:
866   case ISD::SETLE:
867     Opcode = AMDGPU::V_CMP_NGE_F32_e64;
868     break;
869   case ISD::SETONE:
870   case ISD::SETNE:
871     Opcode = AMDGPU::V_CMP_NLG_F32_e64;
872     break;
873   default:
874     llvm_unreachable("invalid ISD:SET cond code");
875   }
876 
877   MachineBasicBlock &MBB = *MI.getParent();
878 
879   // Pick opcode based on comparison type.
880   MachineInstr *VcmpMI;
881   const MachineOperand &Op0 = MI.getOperand(0);
882   const MachineOperand &Op1 = MI.getOperand(1);
883 
884   // VCC represents lanes killed.
885   Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
886 
887   if (TRI->isVGPR(*MRI, Op0.getReg())) {
888     Opcode = AMDGPU::getVOPe32(Opcode);
889     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
890   } else {
891     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
892                  .addReg(VCC, RegState::Define)
893                  .addImm(0) // src0 modifiers
894                  .add(Op1)
895                  .addImm(0) // src1 modifiers
896                  .add(Op0)
897                  .addImm(0); // omod
898   }
899 
900   MachineInstr *MaskUpdateMI =
901       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
902           .addReg(LiveMaskReg)
903           .addReg(VCC);
904 
905   // State of SCC represents whether any lanes are live in mask,
906   // if SCC is 0 then no lanes will be alive anymore.
907   MachineInstr *EarlyTermMI =
908       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
909 
910   MachineInstr *ExecMaskMI =
911       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
912 
913   assert(MBB.succ_size() == 1);
914 
915   // Update live intervals
916   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
917   MBB.remove(&MI);
918 
919   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
920   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
921   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
922 
923   return ExecMaskMI;
924 }
925 
926 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
927   assert(LiveMaskReg.isVirtual());
928 
929   MachineBasicBlock &MBB = *MI.getParent();
930 
931   const DebugLoc &DL = MI.getDebugLoc();
932   MachineInstr *MaskUpdateMI = nullptr;
933 
934   const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
935   const MachineOperand &Op = MI.getOperand(0);
936   int64_t KillVal = MI.getOperand(1).getImm();
937   MachineInstr *ComputeKilledMaskMI = nullptr;
938   Register CndReg = !Op.isImm() ? Op.getReg() : Register();
939   Register TmpReg;
940 
941   // Is this a static or dynamic kill?
942   if (Op.isImm()) {
943     if (Op.getImm() == KillVal) {
944       // Static: all active lanes are killed
945       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
946                          .addReg(LiveMaskReg)
947                          .addReg(Exec);
948     } else {
949       // Static: kill does nothing
950       bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end();
951       if (!IsLastTerminator) {
952         LIS->RemoveMachineInstrFromMaps(MI);
953       } else {
954         assert(MBB.succ_size() == 1 && MI.getOpcode() != AMDGPU::SI_DEMOTE_I1);
955         MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
956                                     .addMBB(*MBB.succ_begin());
957         LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
958       }
959       MBB.remove(&MI);
960       return nullptr;
961     }
962   } else {
963     if (!KillVal) {
964       // Op represents live lanes after kill,
965       // so exec mask needs to be factored in.
966       TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
967       ComputeKilledMaskMI =
968           BuildMI(MBB, MI, DL, TII->get(AndN2Opc), TmpReg).addReg(Exec).add(Op);
969       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
970                          .addReg(LiveMaskReg)
971                          .addReg(TmpReg);
972     } else {
973       // Op represents lanes to kill
974       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
975                          .addReg(LiveMaskReg)
976                          .add(Op);
977     }
978   }
979 
980   // State of SCC represents whether any lanes are live in mask,
981   // if SCC is 0 then no lanes will be alive anymore.
982   MachineInstr *EarlyTermMI =
983       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
984 
985   // In the case we got this far some lanes are still live,
986   // update EXEC to deactivate lanes as appropriate.
987   MachineInstr *NewTerm;
988   MachineInstr *WQMMaskMI = nullptr;
989   Register LiveMaskWQM;
990   if (IsDemote) {
991     // Demote - deactivate quads with only helper lanes
992     LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
993     WQMMaskMI =
994         BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
995     NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
996                   .addReg(Exec)
997                   .addReg(LiveMaskWQM);
998   } else {
999     // Kill - deactivate lanes no longer in live mask
1000     if (Op.isImm()) {
1001       unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1002       NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
1003     } else if (!IsWQM) {
1004       NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
1005                     .addReg(Exec)
1006                     .addReg(LiveMaskReg);
1007     } else {
1008       unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
1009       NewTerm =
1010           BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
1011     }
1012   }
1013 
1014   // Update live intervals
1015   LIS->RemoveMachineInstrFromMaps(MI);
1016   MBB.remove(&MI);
1017   assert(EarlyTermMI);
1018   assert(MaskUpdateMI);
1019   assert(NewTerm);
1020   if (ComputeKilledMaskMI)
1021     LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1022   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1023   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1024   if (WQMMaskMI)
1025     LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1026   LIS->InsertMachineInstrInMaps(*NewTerm);
1027 
1028   if (CndReg) {
1029     LIS->removeInterval(CndReg);
1030     LIS->createAndComputeVirtRegInterval(CndReg);
1031   }
1032   if (TmpReg)
1033     LIS->createAndComputeVirtRegInterval(TmpReg);
1034   if (LiveMaskWQM)
1035     LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1036 
1037   return NewTerm;
1038 }
1039 
1040 // Replace (or supplement) instructions accessing live mask.
1041 // This can only happen once all the live mask registers have been created
1042 // and the execute state (WQM/StrictWWM/Exact) of instructions is known.
1043 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) {
1044   if (!BI.NeedsLowering)
1045     return;
1046 
1047   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1048 
1049   SmallVector<MachineInstr *, 4> SplitPoints;
1050   Register ActiveLanesReg = 0;
1051   char State = BI.InitialState;
1052 
1053   for (MachineInstr &MI : llvm::make_early_inc_range(
1054            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1055     auto MIState = StateTransition.find(&MI);
1056     if (MIState != StateTransition.end())
1057       State = MIState->second;
1058 
1059     MachineInstr *SplitPoint = nullptr;
1060     switch (MI.getOpcode()) {
1061     case AMDGPU::SI_DEMOTE_I1:
1062     case AMDGPU::SI_KILL_I1_TERMINATOR:
1063       SplitPoint = lowerKillI1(MI, State == StateWQM);
1064       break;
1065     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1066       SplitPoint = lowerKillF32(MI);
1067       break;
1068     case AMDGPU::ENTER_STRICT_WWM:
1069       ActiveLanesReg = MI.getOperand(0).getReg();
1070       break;
1071     case AMDGPU::EXIT_STRICT_WWM:
1072       ActiveLanesReg = 0;
1073       break;
1074     case AMDGPU::V_SET_INACTIVE_B32:
1075       if (ActiveLanesReg) {
1076         LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
1077         MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
1078         MI.getOperand(5).setReg(ActiveLanesReg);
1079         LIS->shrinkToUses(&LI);
1080       } else {
1081         assert(State == StateExact || State == StateWQM);
1082       }
1083       break;
1084     default:
1085       break;
1086     }
1087     if (SplitPoint)
1088       SplitPoints.push_back(SplitPoint);
1089   }
1090 
1091   // Perform splitting after instruction scan to simplify iteration.
1092   for (MachineInstr *MI : SplitPoints)
1093     splitBlock(MI);
1094 }
1095 
1096 // Return an iterator in the (inclusive) range [First, Last] at which
1097 // instructions can be safely inserted, keeping in mind that some of the
1098 // instructions we want to add necessarily clobber SCC.
1099 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1100     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1101     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1102   if (!SaveSCC)
1103     return PreferLast ? Last : First;
1104 
1105   LiveRange &LR =
1106       LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
1107   auto MBBE = MBB.end();
1108   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1109                                      : LIS->getMBBEndIdx(&MBB);
1110   SlotIndex LastIdx =
1111       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1112   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1113   const LiveRange::Segment *S;
1114 
1115   for (;;) {
1116     S = LR.getSegmentContaining(Idx);
1117     if (!S)
1118       break;
1119 
1120     if (PreferLast) {
1121       SlotIndex Next = S->start.getBaseIndex();
1122       if (Next < FirstIdx)
1123         break;
1124       Idx = Next;
1125     } else {
1126       MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1127       assert(EndMI && "Segment does not end on valid instruction");
1128       auto NextI = std::next(EndMI->getIterator());
1129       if (NextI == MBB.end())
1130         break;
1131       SlotIndex Next = LIS->getInstructionIndex(*NextI);
1132       if (Next > LastIdx)
1133         break;
1134       Idx = Next;
1135     }
1136   }
1137 
1138   MachineBasicBlock::iterator MBBI;
1139 
1140   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1141     MBBI = MI;
1142   else {
1143     assert(Idx == LIS->getMBBEndIdx(&MBB));
1144     MBBI = MBB.end();
1145   }
1146 
1147   // Move insertion point past any operations modifying EXEC.
1148   // This assumes that the value of SCC defined by any of these operations
1149   // does not need to be preserved.
1150   while (MBBI != Last) {
1151     bool IsExecDef = false;
1152     for (const MachineOperand &MO : MBBI->all_defs()) {
1153       IsExecDef |=
1154           MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1155     }
1156     if (!IsExecDef)
1157       break;
1158     MBBI++;
1159     S = nullptr;
1160   }
1161 
1162   if (S)
1163     MBBI = saveSCC(MBB, MBBI);
1164 
1165   return MBBI;
1166 }
1167 
1168 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1169                               MachineBasicBlock::iterator Before,
1170                               Register SaveWQM) {
1171   assert(LiveMaskReg.isVirtual());
1172 
1173   bool IsTerminator = Before == MBB.end();
1174   if (!IsTerminator) {
1175     auto FirstTerm = MBB.getFirstTerminator();
1176     if (FirstTerm != MBB.end()) {
1177       SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
1178       SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1179       IsTerminator = BeforeIdx > FirstTermIdx;
1180     }
1181   }
1182 
1183   MachineInstr *MI;
1184 
1185   if (SaveWQM) {
1186     unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1187     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
1188              .addReg(LiveMaskReg);
1189   } else {
1190     unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1191     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
1192              .addReg(Exec)
1193              .addReg(LiveMaskReg);
1194   }
1195 
1196   LIS->InsertMachineInstrInMaps(*MI);
1197   StateTransition[MI] = StateExact;
1198 }
1199 
1200 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1201                             MachineBasicBlock::iterator Before,
1202                             Register SavedWQM) {
1203   MachineInstr *MI;
1204 
1205   if (SavedWQM) {
1206     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1207              .addReg(SavedWQM);
1208   } else {
1209     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1210   }
1211 
1212   LIS->InsertMachineInstrInMaps(*MI);
1213   StateTransition[MI] = StateWQM;
1214 }
1215 
1216 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1217                                    MachineBasicBlock::iterator Before,
1218                                    Register SaveOrig, char StrictStateNeeded) {
1219   MachineInstr *MI;
1220   assert(SaveOrig);
1221   assert(StrictStateNeeded == StateStrictWWM ||
1222          StrictStateNeeded == StateStrictWQM);
1223 
1224   if (StrictStateNeeded == StateStrictWWM) {
1225     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1226                  SaveOrig)
1227              .addImm(-1);
1228   } else {
1229     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1230                  SaveOrig)
1231              .addImm(-1);
1232   }
1233   LIS->InsertMachineInstrInMaps(*MI);
1234   StateTransition[MI] = StrictStateNeeded;
1235 }
1236 
1237 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1238                                      MachineBasicBlock::iterator Before,
1239                                      Register SavedOrig, char NonStrictState,
1240                                      char CurrentStrictState) {
1241   MachineInstr *MI;
1242 
1243   assert(SavedOrig);
1244   assert(CurrentStrictState == StateStrictWWM ||
1245          CurrentStrictState == StateStrictWQM);
1246 
1247   if (CurrentStrictState == StateStrictWWM) {
1248     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1249                  Exec)
1250              .addReg(SavedOrig);
1251   } else {
1252     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1253                  Exec)
1254              .addReg(SavedOrig);
1255   }
1256   LIS->InsertMachineInstrInMaps(*MI);
1257   StateTransition[MI] = NonStrictState;
1258 }
1259 
1260 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI,
1261                                    bool IsEntry) {
1262   // This is a non-entry block that is WQM throughout, so no need to do
1263   // anything.
1264   if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1265     BI.InitialState = StateWQM;
1266     return;
1267   }
1268 
1269   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1270                     << ":\n");
1271 
1272   Register SavedWQMReg;
1273   Register SavedNonStrictReg;
1274   bool WQMFromExec = IsEntry;
1275   char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1276   char NonStrictState = 0;
1277   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1278 
1279   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1280   if (IsEntry) {
1281     // Skip the instruction that saves LiveMask
1282     if (II != IE && II->getOpcode() == AMDGPU::COPY &&
1283         II->getOperand(1).getReg() == TRI->getExec())
1284       ++II;
1285   }
1286 
1287   // This stores the first instruction where it's safe to switch from WQM to
1288   // Exact or vice versa.
1289   MachineBasicBlock::iterator FirstWQM = IE;
1290 
1291   // This stores the first instruction where it's safe to switch from Strict
1292   // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1293   // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1294   // be safe to switch to/from WQM as well.
1295   MachineBasicBlock::iterator FirstStrict = IE;
1296 
1297   // Record initial state is block information.
1298   BI.InitialState = State;
1299 
1300   for (unsigned Idx = 0;; ++Idx) {
1301     MachineBasicBlock::iterator Next = II;
1302     char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1303     char OutNeeds = 0;
1304 
1305     if (FirstWQM == IE)
1306       FirstWQM = II;
1307 
1308     if (FirstStrict == IE)
1309       FirstStrict = II;
1310 
1311     // Adjust needs if this is first instruction of WQM requiring shader.
1312     if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
1313       Needs = StateWQM;
1314 
1315     // First, figure out the allowed states (Needs) based on the propagated
1316     // flags.
1317     if (II != IE) {
1318       MachineInstr &MI = *II;
1319 
1320       if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1321         auto III = Instructions.find(&MI);
1322         if (III != Instructions.end()) {
1323           if (III->second.Needs & StateStrictWWM)
1324             Needs = StateStrictWWM;
1325           else if (III->second.Needs & StateStrictWQM)
1326             Needs = StateStrictWQM;
1327           else if (III->second.Needs & StateWQM)
1328             Needs = StateWQM;
1329           else
1330             Needs &= ~III->second.Disabled;
1331           OutNeeds = III->second.OutNeeds;
1332         }
1333       } else {
1334         // If the instruction doesn't actually need a correct EXEC, then we can
1335         // safely leave Strict mode enabled.
1336         Needs = StateExact | StateWQM | StateStrict;
1337       }
1338 
1339       // Exact mode exit can occur in terminators, but must be before branches.
1340       if (MI.isBranch() && OutNeeds == StateExact)
1341         Needs = StateExact;
1342 
1343       ++Next;
1344     } else {
1345       // End of basic block
1346       if (BI.OutNeeds & StateWQM)
1347         Needs = StateWQM;
1348       else if (BI.OutNeeds == StateExact)
1349         Needs = StateExact;
1350       else
1351         Needs = StateWQM | StateExact;
1352     }
1353 
1354     // Now, transition if necessary.
1355     if (!(Needs & State)) {
1356       MachineBasicBlock::iterator First;
1357       if (State == StateStrictWWM || Needs == StateStrictWWM ||
1358           State == StateStrictWQM || Needs == StateStrictWQM) {
1359         // We must switch to or from Strict mode.
1360         First = FirstStrict;
1361       } else {
1362         // We only need to switch to/from WQM, so we can use FirstWQM.
1363         First = FirstWQM;
1364       }
1365 
1366       // Whether we need to save SCC depends on start and end states.
1367       bool SaveSCC = false;
1368       switch (State) {
1369       case StateExact:
1370       case StateStrictWWM:
1371       case StateStrictWQM:
1372         // Exact/Strict -> Strict: save SCC
1373         // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1374         // Exact/Strict -> Exact: no save
1375         SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1376         break;
1377       case StateWQM:
1378         // WQM -> Exact/Strict: save SCC
1379         SaveSCC = !(Needs & StateWQM);
1380         break;
1381       default:
1382         llvm_unreachable("Unknown state");
1383         break;
1384       }
1385       char StartState = State & StateStrict ? NonStrictState : State;
1386       bool WQMToExact =
1387           StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
1388       bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
1389                         !(Needs & StateExact);
1390       bool PreferLast = Needs == StateWQM;
1391       // Exact regions in divergent control flow may run at EXEC=0, so try to
1392       // exclude instructions with unexpected effects from them.
1393       // FIXME: ideally we would branch over these when EXEC=0,
1394       // but this requires updating implicit values, live intervals and CFG.
1395       if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
1396         for (MachineBasicBlock::iterator I = First; I != II; ++I) {
1397           if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
1398             PreferLast = WQMToExact;
1399             break;
1400           }
1401         }
1402       }
1403       MachineBasicBlock::iterator Before =
1404           prepareInsertion(MBB, First, II, PreferLast, SaveSCC);
1405 
1406       if (State & StateStrict) {
1407         assert(State == StateStrictWWM || State == StateStrictWQM);
1408         assert(SavedNonStrictReg);
1409         fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1410 
1411         LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1412         SavedNonStrictReg = 0;
1413         State = NonStrictState;
1414       }
1415 
1416       if (Needs & StateStrict) {
1417         NonStrictState = State;
1418         assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1419         assert(!SavedNonStrictReg);
1420         SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1421 
1422         toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1423         State = Needs;
1424       } else {
1425         if (WQMToExact) {
1426           if (!WQMFromExec && (OutNeeds & StateWQM)) {
1427             assert(!SavedWQMReg);
1428             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1429           }
1430 
1431           toExact(MBB, Before, SavedWQMReg);
1432           State = StateExact;
1433         } else if (ExactToWQM) {
1434           assert(WQMFromExec == (SavedWQMReg == 0));
1435 
1436           toWQM(MBB, Before, SavedWQMReg);
1437 
1438           if (SavedWQMReg) {
1439             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1440             SavedWQMReg = 0;
1441           }
1442           State = StateWQM;
1443         } else {
1444           // We can get here if we transitioned from StrictWWM to a
1445           // non-StrictWWM state that already matches our needs, but we
1446           // shouldn't need to do anything.
1447           assert(Needs & State);
1448         }
1449       }
1450     }
1451 
1452     if (Needs != (StateExact | StateWQM | StateStrict)) {
1453       if (Needs != (StateExact | StateWQM))
1454         FirstWQM = IE;
1455       FirstStrict = IE;
1456     }
1457 
1458     if (II == IE)
1459       break;
1460 
1461     II = Next;
1462   }
1463   assert(!SavedWQMReg);
1464   assert(!SavedNonStrictReg);
1465 }
1466 
1467 bool SIWholeQuadMode::lowerLiveMaskQueries() {
1468   for (MachineInstr *MI : LiveMaskQueries) {
1469     const DebugLoc &DL = MI->getDebugLoc();
1470     Register Dest = MI->getOperand(0).getReg();
1471 
1472     MachineInstr *Copy =
1473         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1474             .addReg(LiveMaskReg);
1475 
1476     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1477     MI->eraseFromParent();
1478   }
1479   return !LiveMaskQueries.empty();
1480 }
1481 
1482 bool SIWholeQuadMode::lowerCopyInstrs() {
1483   for (MachineInstr *MI : LowerToMovInstrs) {
1484     assert(MI->getNumExplicitOperands() == 2);
1485 
1486     const Register Reg = MI->getOperand(0).getReg();
1487 
1488     const TargetRegisterClass *regClass =
1489         TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1490     if (TRI->isVGPRClass(regClass)) {
1491       const unsigned MovOp = TII->getMovOpcode(regClass);
1492       MI->setDesc(TII->get(MovOp));
1493 
1494       // Check that it already implicitly depends on exec (like all VALU movs
1495       // should do).
1496       assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1497         return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1498       }));
1499     } else {
1500       // Remove early-clobber and exec dependency from simple SGPR copies.
1501       // This allows some to be eliminated during/post RA.
1502       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1503       if (MI->getOperand(0).isEarlyClobber()) {
1504         LIS->removeInterval(Reg);
1505         MI->getOperand(0).setIsEarlyClobber(false);
1506         LIS->createAndComputeVirtRegInterval(Reg);
1507       }
1508       int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1509       while (Index >= 0) {
1510         MI->removeOperand(Index);
1511         Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1512       }
1513       MI->setDesc(TII->get(AMDGPU::COPY));
1514       LLVM_DEBUG(dbgs() << "  -> " << *MI);
1515     }
1516   }
1517   for (MachineInstr *MI : LowerToCopyInstrs) {
1518     LLVM_DEBUG(dbgs() << "simplify: " << *MI);
1519 
1520     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
1521       assert(MI->getNumExplicitOperands() == 6);
1522 
1523       LiveInterval *RecomputeLI = nullptr;
1524       if (MI->getOperand(4).isReg())
1525         RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());
1526 
1527       MI->removeOperand(5);
1528       MI->removeOperand(4);
1529       MI->removeOperand(3);
1530       MI->removeOperand(1);
1531 
1532       if (RecomputeLI)
1533         LIS->shrinkToUses(RecomputeLI);
1534     } else {
1535       assert(MI->getNumExplicitOperands() == 2);
1536     }
1537 
1538     unsigned CopyOp = MI->getOperand(1).isReg()
1539                           ? (unsigned)AMDGPU::COPY
1540                           : TII->getMovOpcode(TRI->getRegClassForOperandReg(
1541                                 *MRI, MI->getOperand(0)));
1542     MI->setDesc(TII->get(CopyOp));
1543     LLVM_DEBUG(dbgs() << " -> " << *MI);
1544   }
1545   return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
1546 }
1547 
1548 bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1549   for (MachineInstr *MI : KillInstrs) {
1550     MachineInstr *SplitPoint = nullptr;
1551     switch (MI->getOpcode()) {
1552     case AMDGPU::SI_DEMOTE_I1:
1553     case AMDGPU::SI_KILL_I1_TERMINATOR:
1554       SplitPoint = lowerKillI1(*MI, IsWQM);
1555       break;
1556     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1557       SplitPoint = lowerKillF32(*MI);
1558       break;
1559     }
1560     if (SplitPoint)
1561       splitBlock(SplitPoint);
1562   }
1563   return !KillInstrs.empty();
1564 }
1565 
1566 void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
1567   MachineBasicBlock *MBB = MI.getParent();
1568   bool IsWave32 = ST->isWave32();
1569 
1570   if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
1571     assert(MBB == &MBB->getParent()->front() &&
1572            "init whole wave not in entry block");
1573     Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
1574     MachineInstr *SaveExec =
1575         BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1576                 TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
1577                                   : AMDGPU::S_OR_SAVEEXEC_B64),
1578                 EntryExec)
1579             .addImm(-1);
1580 
1581     // Replace all uses of MI's destination reg with EntryExec.
1582     MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
1583 
1584     if (LIS) {
1585       LIS->RemoveMachineInstrFromMaps(MI);
1586     }
1587 
1588     MI.eraseFromParent();
1589 
1590     if (LIS) {
1591       LIS->InsertMachineInstrInMaps(*SaveExec);
1592       LIS->createAndComputeVirtRegInterval(EntryExec);
1593     }
1594     return;
1595   }
1596 
1597   if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
1598     // This should be before all vector instructions.
1599     MachineInstr *InitMI =
1600         BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1601                 TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1602                 Exec)
1603             .addImm(MI.getOperand(0).getImm());
1604     if (LIS) {
1605       LIS->RemoveMachineInstrFromMaps(MI);
1606       LIS->InsertMachineInstrInMaps(*InitMI);
1607     }
1608     MI.eraseFromParent();
1609     return;
1610   }
1611 
1612   // Extract the thread count from an SGPR input and set EXEC accordingly.
1613   // Since BFM can't shift by 64, handle that case with CMP + CMOV.
1614   //
1615   // S_BFE_U32 count, input, {shift, 7}
1616   // S_BFM_B64 exec, count, 0
1617   // S_CMP_EQ_U32 count, 64
1618   // S_CMOV_B64 exec, -1
1619   Register InputReg = MI.getOperand(0).getReg();
1620   MachineInstr *FirstMI = &*MBB->begin();
1621   if (InputReg.isVirtual()) {
1622     MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
1623     assert(DefInstr && DefInstr->isCopy());
1624     if (DefInstr->getParent() == MBB) {
1625       if (DefInstr != FirstMI) {
1626         // If the `InputReg` is defined in current block, we also need to
1627         // move that instruction to the beginning of the block.
1628         DefInstr->removeFromParent();
1629         MBB->insert(FirstMI, DefInstr);
1630         if (LIS)
1631           LIS->handleMove(*DefInstr);
1632       } else {
1633         // If first instruction is definition then move pointer after it.
1634         FirstMI = &*std::next(FirstMI->getIterator());
1635       }
1636     }
1637   }
1638 
1639   // Insert instruction sequence at block beginning (before vector operations).
1640   const DebugLoc DL = MI.getDebugLoc();
1641   const unsigned WavefrontSize = ST->getWavefrontSize();
1642   const unsigned Mask = (WavefrontSize << 1) - 1;
1643   Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1644   auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
1645                    .addReg(InputReg)
1646                    .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
1647   auto BfmMI =
1648       BuildMI(*MBB, FirstMI, DL,
1649               TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
1650           .addReg(CountReg)
1651           .addImm(0);
1652   auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
1653                    .addReg(CountReg, RegState::Kill)
1654                    .addImm(WavefrontSize);
1655   auto CmovMI =
1656       BuildMI(*MBB, FirstMI, DL,
1657               TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
1658               Exec)
1659           .addImm(-1);
1660 
1661   if (!LIS) {
1662     MI.eraseFromParent();
1663     return;
1664   }
1665 
1666   LIS->RemoveMachineInstrFromMaps(MI);
1667   MI.eraseFromParent();
1668 
1669   LIS->InsertMachineInstrInMaps(*BfeMI);
1670   LIS->InsertMachineInstrInMaps(*BfmMI);
1671   LIS->InsertMachineInstrInMaps(*CmpMI);
1672   LIS->InsertMachineInstrInMaps(*CmovMI);
1673 
1674   LIS->removeInterval(InputReg);
1675   LIS->createAndComputeVirtRegInterval(InputReg);
1676   LIS->createAndComputeVirtRegInterval(CountReg);
1677 }
1678 
1679 /// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
1680 /// for instructions that depend on EXEC.
1681 MachineBasicBlock::iterator
1682 SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
1683   MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
1684 
1685   for (MachineInstr *MI : InitExecInstrs) {
1686     // Try to handle undefined cases gracefully:
1687     // - multiple INIT_EXEC instructions
1688     // - INIT_EXEC instructions not in the entry block
1689     if (MI->getParent() == &Entry)
1690       InsertPt = std::next(MI->getIterator());
1691 
1692     lowerInitExec(*MI);
1693     Changed = true;
1694   }
1695 
1696   return InsertPt;
1697 }
1698 
1699 bool SIWholeQuadMode::run(MachineFunction &MF) {
1700   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1701                     << " ------------- \n");
1702   LLVM_DEBUG(MF.dump(););
1703 
1704   Instructions.clear();
1705   Blocks.clear();
1706   LiveMaskQueries.clear();
1707   LowerToCopyInstrs.clear();
1708   LowerToMovInstrs.clear();
1709   KillInstrs.clear();
1710   InitExecInstrs.clear();
1711   SetInactiveInstrs.clear();
1712   StateTransition.clear();
1713 
1714   if (ST->isWave32()) {
1715     AndOpc = AMDGPU::S_AND_B32;
1716     AndTermOpc = AMDGPU::S_AND_B32_term;
1717     AndN2Opc = AMDGPU::S_ANDN2_B32;
1718     XorOpc = AMDGPU::S_XOR_B32;
1719     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1720     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
1721     WQMOpc = AMDGPU::S_WQM_B32;
1722     Exec = AMDGPU::EXEC_LO;
1723   } else {
1724     AndOpc = AMDGPU::S_AND_B64;
1725     AndTermOpc = AMDGPU::S_AND_B64_term;
1726     AndN2Opc = AMDGPU::S_ANDN2_B64;
1727     XorOpc = AMDGPU::S_XOR_B64;
1728     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1729     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
1730     WQMOpc = AMDGPU::S_WQM_B64;
1731     Exec = AMDGPU::EXEC;
1732   }
1733 
1734   const char GlobalFlags = analyzeFunction(MF);
1735   bool Changed = false;
1736 
1737   LiveMaskReg = Exec;
1738 
1739   MachineBasicBlock &Entry = MF.front();
1740   MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);
1741 
1742   // Store a copy of the original live mask when required
1743   const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
1744   const bool HasWaveModes = GlobalFlags & ~StateExact;
1745   const bool HasKills = !KillInstrs.empty();
1746   const bool UsesWQM = GlobalFlags & StateWQM;
1747   if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
1748     LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1749     MachineInstr *MI =
1750         BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1751             .addReg(Exec);
1752     LIS->InsertMachineInstrInMaps(*MI);
1753     Changed = true;
1754   }
1755 
1756   // Check if V_SET_INACTIVE was touched by a strict state mode.
1757   // If so, promote to WWM; otherwise lower to COPY.
1758   for (MachineInstr *MI : SetInactiveInstrs) {
1759     if (LowerToCopyInstrs.contains(MI))
1760       continue;
1761     auto &Info = Instructions[MI];
1762     if (Info.MarkedStates & StateStrict) {
1763       Info.Needs |= StateStrictWWM;
1764       Info.Disabled &= ~StateStrictWWM;
1765       Blocks[MI->getParent()].Needs |= StateStrictWWM;
1766     } else {
1767       LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
1768       LowerToCopyInstrs.insert(MI);
1769     }
1770   }
1771 
1772   LLVM_DEBUG(printInfo());
1773 
1774   Changed |= lowerLiveMaskQueries();
1775   Changed |= lowerCopyInstrs();
1776 
1777   if (!HasWaveModes) {
1778     // No wave mode execution
1779     Changed |= lowerKillInstrs(false);
1780   } else if (GlobalFlags == StateWQM) {
1781     // Shader only needs WQM
1782     auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1783                   .addReg(Exec);
1784     LIS->InsertMachineInstrInMaps(*MI);
1785     lowerKillInstrs(true);
1786     Changed = true;
1787   } else {
1788     // Mark entry for WQM if required.
1789     if (GlobalFlags & StateWQM)
1790       Blocks[&Entry].InNeeds |= StateWQM;
1791     // Wave mode switching requires full lowering pass.
1792     for (auto &BII : Blocks)
1793       processBlock(*BII.first, BII.second, BII.first == &Entry);
1794     // Lowering blocks causes block splitting so perform as a second pass.
1795     for (auto &BII : Blocks)
1796       lowerBlock(*BII.first, BII.second);
1797     Changed = true;
1798   }
1799 
1800   // Compute live range for live mask
1801   if (LiveMaskReg != Exec)
1802     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1803 
1804   // Physical registers like SCC aren't tracked by default anyway, so just
1805   // removing the ranges we computed is the simplest option for maintaining
1806   // the analysis results.
1807   LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1808 
1809   // If we performed any kills then recompute EXEC
1810   if (!KillInstrs.empty() || !InitExecInstrs.empty())
1811     LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1812 
1813   return Changed;
1814 }
1815 
1816 bool SIWholeQuadModeLegacy::runOnMachineFunction(MachineFunction &MF) {
1817   LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
1818   auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
1819   MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
1820   auto *PDTWrapper =
1821       getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
1822   MachinePostDominatorTree *PDT =
1823       PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
1824   SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
1825   return Impl.run(MF);
1826 }
1827 
1828 PreservedAnalyses
1829 SIWholeQuadModePass::run(MachineFunction &MF,
1830                          MachineFunctionAnalysisManager &MFAM) {
1831   MFPropsModifier _(*this, MF);
1832 
1833   LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
1834   MachineDominatorTree *MDT =
1835       MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
1836   MachinePostDominatorTree *PDT =
1837       MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);
1838   SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
1839   bool Changed = Impl.run(MF);
1840   if (!Changed)
1841     return PreservedAnalyses::all();
1842 
1843   PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
1844   PA.preserve<SlotIndexesAnalysis>();
1845   PA.preserve<LiveIntervalsAnalysis>();
1846   PA.preserve<MachineDominatorTreeAnalysis>();
1847   PA.preserve<MachinePostDominatorTreeAnalysis>();
1848   return PA;
1849 }
1850