xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (revision 4fbb9c43aa44d9145151bb5f77d302ba01fb7551)
1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
13 /// The "strict" prefix indicates that inactive lanes do not take part in
14 /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
15 /// always be enabled irrespective of control flow decisions. Conversely in
16 /// non-strict WQM inactive lanes may take part in control flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
19 /// with shader side effects (stores and atomics). It ensures that WQM is
20 /// enabled when necessary, but disabled around stores and atomics.
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 ///   S_MOV_B64 LiveMask, EXEC
25 ///   S_WQM_B64 EXEC, EXEC
26 ///
27 /// to enter WQM at the top of the function and surrounds blocks of Exact
28 /// instructions by
29 ///
30 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 ///   ...
32 ///   S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 ///   S_OR_SAVEEXEC_B64 Tmp, -1
38 ///   ...
39 ///   S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 ///  S_MOV_B64 Tmp, EXEC
46 ///  S_WQM_B64 EXEC, EXEC
47 ///  ...
48 ///  S_MOV_B64 EXEC, Tmp
49 ///
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (aka which instructions produce values that lead to derivative
53 /// computations).
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 ///  (1) at the top level (outside of control flow statements, and as long as
60 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
61 ///      the LiveMask (this is implemented for the entry block).
62 ///
63 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
64 ///      consist of exact and don't-care instructions, the switch only has to
65 ///      be done at the entry and exit points rather than potentially in each
66 ///      block of the region.
67 ///
68 //===----------------------------------------------------------------------===//
69 
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-wqm"
88 
89 namespace {
90 
91 enum {
92   StateWQM = 0x1,
93   StateStrictWWM = 0x2,
94   StateStrictWQM = 0x4,
95   StateExact = 0x8,
96   StateStrict = StateStrictWWM | StateStrictWQM,
97 };
98 
99 struct PrintState {
100 public:
101   int State;
102 
103   explicit PrintState(int State) : State(State) {}
104 };
105 
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108 
109   static const std::pair<char, const char *> Mapping[] = {
110       std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111       std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112   char State = PS.State;
113   for (auto M : Mapping) {
114     if (State & M.first) {
115       OS << M.second;
116       State &= ~M.first;
117 
118       if (State)
119         OS << '|';
120     }
121   }
122   assert(State == 0);
123   return OS;
124 }
125 #endif
126 
127 struct InstrInfo {
128   char Needs = 0;
129   char Disabled = 0;
130   char OutNeeds = 0;
131 };
132 
133 struct BlockInfo {
134   char Needs = 0;
135   char InNeeds = 0;
136   char OutNeeds = 0;
137   char InitialState = 0;
138   bool NeedsLowering = false;
139 };
140 
141 struct WorkItem {
142   MachineBasicBlock *MBB = nullptr;
143   MachineInstr *MI = nullptr;
144 
145   WorkItem() = default;
146   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
147   WorkItem(MachineInstr *MI) : MI(MI) {}
148 };
149 
150 class SIWholeQuadMode : public MachineFunctionPass {
151 private:
152   const SIInstrInfo *TII;
153   const SIRegisterInfo *TRI;
154   const GCNSubtarget *ST;
155   MachineRegisterInfo *MRI;
156   LiveIntervals *LIS;
157   MachineDominatorTree *MDT;
158   MachinePostDominatorTree *PDT;
159 
160   unsigned AndOpc;
161   unsigned AndN2Opc;
162   unsigned XorOpc;
163   unsigned AndSaveExecOpc;
164   unsigned OrSaveExecOpc;
165   unsigned WQMOpc;
166   Register Exec;
167   Register LiveMaskReg;
168 
169   DenseMap<const MachineInstr *, InstrInfo> Instructions;
170   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
171 
172   // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
173   DenseMap<const MachineInstr *, char> StateTransition;
174 
175   SmallVector<MachineInstr *, 2> LiveMaskQueries;
176   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
177   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
178   SmallVector<MachineInstr *, 4> KillInstrs;
179 
180   void printInfo();
181 
182   void markInstruction(MachineInstr &MI, char Flag,
183                        std::vector<WorkItem> &Worklist);
184   void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
185                 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
186   void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
187                    std::vector<WorkItem> &Worklist);
188   void markInstructionUses(const MachineInstr &MI, char Flag,
189                            std::vector<WorkItem> &Worklist);
190   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
191   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
192   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
193   char analyzeFunction(MachineFunction &MF);
194 
195   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
196                                       MachineBasicBlock::iterator Before);
197   MachineBasicBlock::iterator
198   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
199                    MachineBasicBlock::iterator Last, bool PreferLast,
200                    bool SaveSCC);
201   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
202                Register SaveWQM);
203   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
204              Register SavedWQM);
205   void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206                     Register SaveOrig, char StrictStateNeeded);
207   void fromStrictMode(MachineBasicBlock &MBB,
208                       MachineBasicBlock::iterator Before, Register SavedOrig,
209                       char NonStrictState, char CurrentStrictState);
210 
211   MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
212 
213   MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
214                             bool IsWQM);
215   MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
216   void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
217                              MachineInstr *Exit);
218 
219   void lowerBlock(MachineBasicBlock &MBB);
220   void processBlock(MachineBasicBlock &MBB, bool IsEntry);
221 
222   void lowerLiveMaskQueries();
223   void lowerCopyInstrs();
224   void lowerKillInstrs(bool IsWQM);
225 
226 public:
227   static char ID;
228 
229   SIWholeQuadMode() :
230     MachineFunctionPass(ID) { }
231 
232   bool runOnMachineFunction(MachineFunction &MF) override;
233 
234   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
235 
236   void getAnalysisUsage(AnalysisUsage &AU) const override {
237     AU.addRequired<LiveIntervals>();
238     AU.addPreserved<SlotIndexes>();
239     AU.addPreserved<LiveIntervals>();
240     AU.addRequired<MachineDominatorTree>();
241     AU.addPreserved<MachineDominatorTree>();
242     AU.addRequired<MachinePostDominatorTree>();
243     AU.addPreserved<MachinePostDominatorTree>();
244     MachineFunctionPass::getAnalysisUsage(AU);
245   }
246 
247   MachineFunctionProperties getClearedProperties() const override {
248     return MachineFunctionProperties().set(
249         MachineFunctionProperties::Property::IsSSA);
250   }
251 };
252 
253 } // end anonymous namespace
254 
255 char SIWholeQuadMode::ID = 0;
256 
257 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
258                       false)
259 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
260 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
261 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
262 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
263                     false)
264 
265 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
266 
267 FunctionPass *llvm::createSIWholeQuadModePass() {
268   return new SIWholeQuadMode;
269 }
270 
271 #ifndef NDEBUG
272 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
273   for (const auto &BII : Blocks) {
274     dbgs() << "\n"
275            << printMBBReference(*BII.first) << ":\n"
276            << "  InNeeds = " << PrintState(BII.second.InNeeds)
277            << ", Needs = " << PrintState(BII.second.Needs)
278            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
279 
280     for (const MachineInstr &MI : *BII.first) {
281       auto III = Instructions.find(&MI);
282       if (III == Instructions.end())
283         continue;
284 
285       dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
286              << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
287     }
288   }
289 }
290 #endif
291 
292 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
293                                       std::vector<WorkItem> &Worklist) {
294   InstrInfo &II = Instructions[&MI];
295 
296   assert(!(Flag & StateExact) && Flag != 0);
297 
298   // Remove any disabled states from the flag. The user that required it gets
299   // an undefined value in the helper lanes. For example, this can happen if
300   // the result of an atomic is used by instruction that requires WQM, where
301   // ignoring the request for WQM is correct as per the relevant specs.
302   Flag &= ~II.Disabled;
303 
304   // Ignore if the flag is already encompassed by the existing needs, or we
305   // just disabled everything.
306   if ((II.Needs & Flag) == Flag)
307     return;
308 
309   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
310   II.Needs |= Flag;
311   Worklist.push_back(&MI);
312 }
313 
314 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
315 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
316                                Register Reg, unsigned SubReg, char Flag,
317                                std::vector<WorkItem> &Worklist) {
318   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
319 
320   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
321   const VNInfo *Value = UseLRQ.valueIn();
322   if (!Value)
323     return;
324 
325   // Note: this code assumes that lane masks on AMDGPU completely
326   // cover registers.
327   const LaneBitmask UseLanes =
328       SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
329              : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
330                                 : LaneBitmask::getNone());
331 
332   // Perform a depth-first iteration of the LiveRange graph marking defs.
333   // Stop processing of a given branch when all use lanes have been defined.
334   // The first definition stops processing for a physical register.
335   struct PhiEntry {
336     const VNInfo *Phi;
337     unsigned PredIdx;
338     LaneBitmask DefinedLanes;
339 
340     PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
341         : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
342   };
343   using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
344   SmallVector<PhiEntry, 2> PhiStack;
345   SmallSet<VisitKey, 4> Visited;
346   LaneBitmask DefinedLanes;
347   unsigned NextPredIdx = 0; // Only used for processing phi nodes
348   do {
349     const VNInfo *NextValue = nullptr;
350     const VisitKey Key(Value, DefinedLanes);
351 
352     if (Visited.insert(Key).second) {
353       // On first visit to a phi then start processing first predecessor
354       NextPredIdx = 0;
355     }
356 
357     if (Value->isPHIDef()) {
358       // Each predecessor node in the phi must be processed as a subgraph
359       const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
360       assert(MBB && "Phi-def has no defining MBB");
361 
362       // Find next predecessor to process
363       unsigned Idx = NextPredIdx;
364       auto PI = MBB->pred_begin() + Idx;
365       auto PE = MBB->pred_end();
366       for (; PI != PE && !NextValue; ++PI, ++Idx) {
367         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
368           if (!Visited.count(VisitKey(VN, DefinedLanes)))
369             NextValue = VN;
370         }
371       }
372 
373       // If there are more predecessors to process; add phi to stack
374       if (PI != PE)
375         PhiStack.emplace_back(Value, Idx, DefinedLanes);
376     } else {
377       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
378       assert(MI && "Def has no defining instruction");
379 
380       if (Reg.isVirtual()) {
381         // Iterate over all operands to find relevant definitions
382         bool HasDef = false;
383         for (const MachineOperand &Op : MI->operands()) {
384           if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
385             continue;
386 
387           // Compute lanes defined and overlap with use
388           LaneBitmask OpLanes =
389               Op.isUndef() ? LaneBitmask::getAll()
390                            : TRI->getSubRegIndexLaneMask(Op.getSubReg());
391           LaneBitmask Overlap = (UseLanes & OpLanes);
392 
393           // Record if this instruction defined any of use
394           HasDef |= Overlap.any();
395 
396           // Mark any lanes defined
397           DefinedLanes |= OpLanes;
398         }
399 
400         // Check if all lanes of use have been defined
401         if ((DefinedLanes & UseLanes) != UseLanes) {
402           // Definition not complete; need to process input value
403           LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
404           if (const VNInfo *VN = LRQ.valueIn()) {
405             if (!Visited.count(VisitKey(VN, DefinedLanes)))
406               NextValue = VN;
407           }
408         }
409 
410         // Only mark the instruction if it defines some part of the use
411         if (HasDef)
412           markInstruction(*MI, Flag, Worklist);
413       } else {
414         // For physical registers simply mark the defining instruction
415         markInstruction(*MI, Flag, Worklist);
416       }
417     }
418 
419     if (!NextValue && !PhiStack.empty()) {
420       // Reach end of chain; revert to processing last phi
421       PhiEntry &Entry = PhiStack.back();
422       NextValue = Entry.Phi;
423       NextPredIdx = Entry.PredIdx;
424       DefinedLanes = Entry.DefinedLanes;
425       PhiStack.pop_back();
426     }
427 
428     Value = NextValue;
429   } while (Value);
430 }
431 
432 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
433                                   const MachineOperand &Op, char Flag,
434                                   std::vector<WorkItem> &Worklist) {
435   assert(Op.isReg());
436   Register Reg = Op.getReg();
437 
438   // Ignore some hardware registers
439   switch (Reg) {
440   case AMDGPU::EXEC:
441   case AMDGPU::EXEC_LO:
442     return;
443   default:
444     break;
445   }
446 
447   LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
448                     << " for " << MI);
449   if (Reg.isVirtual()) {
450     LiveRange &LR = LIS->getInterval(Reg);
451     markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
452   } else {
453     // Handle physical registers that we need to track; this is mostly relevant
454     // for VCC, which can appear as the (implicit) input of a uniform branch,
455     // e.g. when a loop counter is stored in a VGPR.
456     for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
457          ++RegUnit) {
458       LiveRange &LR = LIS->getRegUnit(*RegUnit);
459       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
460       if (!Value)
461         continue;
462 
463       markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
464     }
465   }
466 }
467 
468 /// Mark all instructions defining the uses in \p MI with \p Flag.
469 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
470                                           std::vector<WorkItem> &Worklist) {
471   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
472                     << MI);
473 
474   for (const MachineOperand &Use : MI.uses()) {
475     if (!Use.isReg() || !Use.isUse())
476       continue;
477     markOperand(MI, Use, Flag, Worklist);
478   }
479 }
480 
481 // Scan instructions to determine which ones require an Exact execmask and
482 // which ones seed WQM requirements.
483 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
484                                        std::vector<WorkItem> &Worklist) {
485   char GlobalFlags = 0;
486   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
487   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
488   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
489   bool HasImplicitDerivatives =
490       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
491 
492   // We need to visit the basic blocks in reverse post-order so that we visit
493   // defs before uses, in particular so that we don't accidentally mark an
494   // instruction as needing e.g. WQM before visiting it and realizing it needs
495   // WQM disabled.
496   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
497   for (MachineBasicBlock *MBB : RPOT) {
498     BlockInfo &BBI = Blocks[MBB];
499 
500     for (MachineInstr &MI : *MBB) {
501       InstrInfo &III = Instructions[&MI];
502       unsigned Opcode = MI.getOpcode();
503       char Flags = 0;
504 
505       if (TII->isWQM(Opcode)) {
506         // If LOD is not supported WQM is not needed.
507         if (!ST->hasExtendedImageInsts())
508           continue;
509         // Only generate implicit WQM if implicit derivatives are required.
510         // This avoids inserting unintended WQM if a shader type without
511         // implicit derivatives uses an image sampling instruction.
512         if (!HasImplicitDerivatives)
513           continue;
514         // Sampling instructions don't need to produce results for all pixels
515         // in a quad, they just require all inputs of a quad to have been
516         // computed for derivatives.
517         markInstructionUses(MI, StateWQM, Worklist);
518         GlobalFlags |= StateWQM;
519         continue;
520       } else if (Opcode == AMDGPU::WQM) {
521         // The WQM intrinsic requires its output to have all the helper lanes
522         // correct, so we need it to be in WQM.
523         Flags = StateWQM;
524         LowerToCopyInstrs.push_back(&MI);
525       } else if (Opcode == AMDGPU::SOFT_WQM) {
526         LowerToCopyInstrs.push_back(&MI);
527         SoftWQMInstrs.push_back(&MI);
528         continue;
529       } else if (Opcode == AMDGPU::STRICT_WWM) {
530         // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
531         // it needs to be executed in WQM or Exact so that its copy doesn't
532         // clobber inactive lanes.
533         markInstructionUses(MI, StateStrictWWM, Worklist);
534         GlobalFlags |= StateStrictWWM;
535         LowerToMovInstrs.push_back(&MI);
536         continue;
537       } else if (Opcode == AMDGPU::STRICT_WQM ||
538                  TII->isDualSourceBlendEXP(MI)) {
539         // STRICT_WQM is similar to STRICTWWM, but instead of enabling all
540         // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in
541         // quads that have at least one active thread.
542         markInstructionUses(MI, StateStrictWQM, Worklist);
543         GlobalFlags |= StateStrictWQM;
544 
545         if (Opcode == AMDGPU::STRICT_WQM) {
546           LowerToMovInstrs.push_back(&MI);
547         } else {
548           // Dual source blend export acts as implicit strict-wqm, its sources
549           // need to be shuffled in strict wqm, but the export itself needs to
550           // run in exact mode.
551           BBI.Needs |= StateExact;
552           if (!(BBI.InNeeds & StateExact)) {
553             BBI.InNeeds |= StateExact;
554             Worklist.push_back(MBB);
555           }
556           GlobalFlags |= StateExact;
557           III.Disabled = StateWQM | StateStrict;
558         }
559         continue;
560       } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
561                  Opcode == AMDGPU::LDS_DIRECT_LOAD) {
562         // Mark these STRICTWQM, but only for the instruction, not its operands.
563         // This avoid unnecessarily marking M0 as requiring WQM.
564         InstrInfo &II = Instructions[&MI];
565         II.Needs |= StateStrictWQM;
566         GlobalFlags |= StateStrictWQM;
567         continue;
568       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
569                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
570         III.Disabled = StateStrict;
571         MachineOperand &Inactive = MI.getOperand(2);
572         if (Inactive.isReg()) {
573           if (Inactive.isUndef()) {
574             LowerToCopyInstrs.push_back(&MI);
575           } else {
576             markOperand(MI, Inactive, StateStrictWWM, Worklist);
577           }
578         }
579         SetInactiveInstrs.push_back(&MI);
580         continue;
581       } else if (TII->isDisableWQM(MI)) {
582         BBI.Needs |= StateExact;
583         if (!(BBI.InNeeds & StateExact)) {
584           BBI.InNeeds |= StateExact;
585           Worklist.push_back(MBB);
586         }
587         GlobalFlags |= StateExact;
588         III.Disabled = StateWQM | StateStrict;
589         continue;
590       } else {
591         if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
592           LiveMaskQueries.push_back(&MI);
593         } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
594                    Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
595                    Opcode == AMDGPU::SI_DEMOTE_I1) {
596           KillInstrs.push_back(&MI);
597           BBI.NeedsLowering = true;
598         } else if (WQMOutputs) {
599           // The function is in machine SSA form, which means that physical
600           // VGPRs correspond to shader inputs and outputs. Inputs are
601           // only used, outputs are only defined.
602           // FIXME: is this still valid?
603           for (const MachineOperand &MO : MI.defs()) {
604             if (!MO.isReg())
605               continue;
606 
607             Register Reg = MO.getReg();
608 
609             if (!Reg.isVirtual() &&
610                 TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
611               Flags = StateWQM;
612               break;
613             }
614           }
615         }
616 
617         if (!Flags)
618           continue;
619       }
620 
621       markInstruction(MI, Flags, Worklist);
622       GlobalFlags |= Flags;
623     }
624   }
625 
626   // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is
627   // ever used anywhere in the function. This implements the corresponding
628   // semantics of @llvm.amdgcn.set.inactive.
629   // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
630   if (GlobalFlags & StateWQM) {
631     for (MachineInstr *MI : SetInactiveInstrs)
632       markInstruction(*MI, StateWQM, Worklist);
633     for (MachineInstr *MI : SoftWQMInstrs)
634       markInstruction(*MI, StateWQM, Worklist);
635   }
636 
637   return GlobalFlags;
638 }
639 
640 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
641                                            std::vector<WorkItem>& Worklist) {
642   MachineBasicBlock *MBB = MI.getParent();
643   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
644   BlockInfo &BI = Blocks[MBB];
645 
646   // Control flow-type instructions and stores to temporary memory that are
647   // followed by WQM computations must themselves be in WQM.
648   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
649       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
650     Instructions[&MI].Needs = StateWQM;
651     II.Needs = StateWQM;
652   }
653 
654   // Propagate to block level
655   if (II.Needs & StateWQM) {
656     BI.Needs |= StateWQM;
657     if (!(BI.InNeeds & StateWQM)) {
658       BI.InNeeds |= StateWQM;
659       Worklist.push_back(MBB);
660     }
661   }
662 
663   // Propagate backwards within block
664   if (MachineInstr *PrevMI = MI.getPrevNode()) {
665     char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
666     if (!PrevMI->isPHI()) {
667       InstrInfo &PrevII = Instructions[PrevMI];
668       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
669         PrevII.OutNeeds |= InNeeds;
670         Worklist.push_back(PrevMI);
671       }
672     }
673   }
674 
675   // Propagate WQM flag to instruction inputs
676   assert(!(II.Needs & StateExact));
677 
678   if (II.Needs != 0)
679     markInstructionUses(MI, II.Needs, Worklist);
680 
681   // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
682   // not require any WQM transitions.
683   if (II.Needs & StateStrictWWM)
684     BI.Needs |= StateStrictWWM;
685   if (II.Needs & StateStrictWQM)
686     BI.Needs |= StateStrictWQM;
687 }
688 
689 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
690                                      std::vector<WorkItem>& Worklist) {
691   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
692 
693   // Propagate through instructions
694   if (!MBB.empty()) {
695     MachineInstr *LastMI = &*MBB.rbegin();
696     InstrInfo &LastII = Instructions[LastMI];
697     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
698       LastII.OutNeeds |= BI.OutNeeds;
699       Worklist.push_back(LastMI);
700     }
701   }
702 
703   // Predecessor blocks must provide for our WQM/Exact needs.
704   for (MachineBasicBlock *Pred : MBB.predecessors()) {
705     BlockInfo &PredBI = Blocks[Pred];
706     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
707       continue;
708 
709     PredBI.OutNeeds |= BI.InNeeds;
710     PredBI.InNeeds |= BI.InNeeds;
711     Worklist.push_back(Pred);
712   }
713 
714   // All successors must be prepared to accept the same set of WQM/Exact data.
715   for (MachineBasicBlock *Succ : MBB.successors()) {
716     BlockInfo &SuccBI = Blocks[Succ];
717     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
718       continue;
719 
720     SuccBI.InNeeds |= BI.OutNeeds;
721     Worklist.push_back(Succ);
722   }
723 }
724 
725 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
726   std::vector<WorkItem> Worklist;
727   char GlobalFlags = scanInstructions(MF, Worklist);
728 
729   while (!Worklist.empty()) {
730     WorkItem WI = Worklist.back();
731     Worklist.pop_back();
732 
733     if (WI.MI)
734       propagateInstruction(*WI.MI, Worklist);
735     else
736       propagateBlock(*WI.MBB, Worklist);
737   }
738 
739   return GlobalFlags;
740 }
741 
742 MachineBasicBlock::iterator
743 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
744                          MachineBasicBlock::iterator Before) {
745   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
746 
747   MachineInstr *Save =
748       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
749           .addReg(AMDGPU::SCC);
750   MachineInstr *Restore =
751       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
752           .addReg(SaveReg);
753 
754   LIS->InsertMachineInstrInMaps(*Save);
755   LIS->InsertMachineInstrInMaps(*Restore);
756   LIS->createAndComputeVirtRegInterval(SaveReg);
757 
758   return Restore;
759 }
760 
761 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
762                                                MachineInstr *TermMI) {
763   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
764                     << *TermMI << "\n");
765 
766   MachineBasicBlock *SplitBB =
767       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
768 
769   // Convert last instruction in block to a terminator.
770   // Note: this only covers the expected patterns
771   unsigned NewOpcode = 0;
772   switch (TermMI->getOpcode()) {
773   case AMDGPU::S_AND_B32:
774     NewOpcode = AMDGPU::S_AND_B32_term;
775     break;
776   case AMDGPU::S_AND_B64:
777     NewOpcode = AMDGPU::S_AND_B64_term;
778     break;
779   case AMDGPU::S_MOV_B32:
780     NewOpcode = AMDGPU::S_MOV_B32_term;
781     break;
782   case AMDGPU::S_MOV_B64:
783     NewOpcode = AMDGPU::S_MOV_B64_term;
784     break;
785   default:
786     break;
787   }
788   if (NewOpcode)
789     TermMI->setDesc(TII->get(NewOpcode));
790 
791   if (SplitBB != BB) {
792     // Update dominator trees
793     using DomTreeT = DomTreeBase<MachineBasicBlock>;
794     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
795     for (MachineBasicBlock *Succ : SplitBB->successors()) {
796       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
797       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
798     }
799     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
800     if (MDT)
801       MDT->getBase().applyUpdates(DTUpdates);
802     if (PDT)
803       PDT->getBase().applyUpdates(DTUpdates);
804 
805     // Link blocks
806     MachineInstr *MI =
807         BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
808             .addMBB(SplitBB);
809     LIS->InsertMachineInstrInMaps(*MI);
810   }
811 
812   return SplitBB;
813 }
814 
815 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
816                                             MachineInstr &MI) {
817   const DebugLoc &DL = MI.getDebugLoc();
818   unsigned Opcode = 0;
819 
820   assert(MI.getOperand(0).isReg());
821 
822   // Comparison is for live lanes; however here we compute the inverse
823   // (killed lanes).  This is because VCMP will always generate 0 bits
824   // for inactive lanes so a mask of live lanes would not be correct
825   // inside control flow.
826   // Invert the comparison by swapping the operands and adjusting
827   // the comparison codes.
828 
829   switch (MI.getOperand(2).getImm()) {
830   case ISD::SETUEQ:
831     Opcode = AMDGPU::V_CMP_LG_F32_e64;
832     break;
833   case ISD::SETUGT:
834     Opcode = AMDGPU::V_CMP_GE_F32_e64;
835     break;
836   case ISD::SETUGE:
837     Opcode = AMDGPU::V_CMP_GT_F32_e64;
838     break;
839   case ISD::SETULT:
840     Opcode = AMDGPU::V_CMP_LE_F32_e64;
841     break;
842   case ISD::SETULE:
843     Opcode = AMDGPU::V_CMP_LT_F32_e64;
844     break;
845   case ISD::SETUNE:
846     Opcode = AMDGPU::V_CMP_EQ_F32_e64;
847     break;
848   case ISD::SETO:
849     Opcode = AMDGPU::V_CMP_O_F32_e64;
850     break;
851   case ISD::SETUO:
852     Opcode = AMDGPU::V_CMP_U_F32_e64;
853     break;
854   case ISD::SETOEQ:
855   case ISD::SETEQ:
856     Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
857     break;
858   case ISD::SETOGT:
859   case ISD::SETGT:
860     Opcode = AMDGPU::V_CMP_NLT_F32_e64;
861     break;
862   case ISD::SETOGE:
863   case ISD::SETGE:
864     Opcode = AMDGPU::V_CMP_NLE_F32_e64;
865     break;
866   case ISD::SETOLT:
867   case ISD::SETLT:
868     Opcode = AMDGPU::V_CMP_NGT_F32_e64;
869     break;
870   case ISD::SETOLE:
871   case ISD::SETLE:
872     Opcode = AMDGPU::V_CMP_NGE_F32_e64;
873     break;
874   case ISD::SETONE:
875   case ISD::SETNE:
876     Opcode = AMDGPU::V_CMP_NLG_F32_e64;
877     break;
878   default:
879     llvm_unreachable("invalid ISD:SET cond code");
880   }
881 
882   // Pick opcode based on comparison type.
883   MachineInstr *VcmpMI;
884   const MachineOperand &Op0 = MI.getOperand(0);
885   const MachineOperand &Op1 = MI.getOperand(1);
886 
887   // VCC represents lanes killed.
888   Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
889 
890   if (TRI->isVGPR(*MRI, Op0.getReg())) {
891     Opcode = AMDGPU::getVOPe32(Opcode);
892     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
893   } else {
894     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
895                  .addReg(VCC, RegState::Define)
896                  .addImm(0) // src0 modifiers
897                  .add(Op1)
898                  .addImm(0) // src1 modifiers
899                  .add(Op0)
900                  .addImm(0); // omod
901   }
902 
903   MachineInstr *MaskUpdateMI =
904       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
905           .addReg(LiveMaskReg)
906           .addReg(VCC);
907 
908   // State of SCC represents whether any lanes are live in mask,
909   // if SCC is 0 then no lanes will be alive anymore.
910   MachineInstr *EarlyTermMI =
911       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
912 
913   MachineInstr *ExecMaskMI =
914       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
915 
916   assert(MBB.succ_size() == 1);
917   MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
918                               .addMBB(*MBB.succ_begin());
919 
920   // Update live intervals
921   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
922   MBB.remove(&MI);
923 
924   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
925   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
926   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
927   LIS->InsertMachineInstrInMaps(*NewTerm);
928 
929   return NewTerm;
930 }
931 
932 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
933                                            MachineInstr &MI, bool IsWQM) {
934   const DebugLoc &DL = MI.getDebugLoc();
935   MachineInstr *MaskUpdateMI = nullptr;
936 
937   const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
938   const MachineOperand &Op = MI.getOperand(0);
939   int64_t KillVal = MI.getOperand(1).getImm();
940   MachineInstr *ComputeKilledMaskMI = nullptr;
941   Register CndReg = !Op.isImm() ? Op.getReg() : Register();
942   Register TmpReg;
943 
944   // Is this a static or dynamic kill?
945   if (Op.isImm()) {
946     if (Op.getImm() == KillVal) {
947       // Static: all active lanes are killed
948       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
949                          .addReg(LiveMaskReg)
950                          .addReg(Exec);
951     } else {
952       // Static: kill does nothing
953       MachineInstr *NewTerm = nullptr;
954       if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
955         LIS->RemoveMachineInstrFromMaps(MI);
956       } else {
957         assert(MBB.succ_size() == 1);
958         NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
959                       .addMBB(*MBB.succ_begin());
960         LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
961       }
962       MBB.remove(&MI);
963       return NewTerm;
964     }
965   } else {
966     if (!KillVal) {
967       // Op represents live lanes after kill,
968       // so exec mask needs to be factored in.
969       TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
970       ComputeKilledMaskMI =
971           BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
972       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
973                          .addReg(LiveMaskReg)
974                          .addReg(TmpReg);
975     } else {
976       // Op represents lanes to kill
977       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
978                          .addReg(LiveMaskReg)
979                          .add(Op);
980     }
981   }
982 
983   // State of SCC represents whether any lanes are live in mask,
984   // if SCC is 0 then no lanes will be alive anymore.
985   MachineInstr *EarlyTermMI =
986       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
987 
988   // In the case we got this far some lanes are still live,
989   // update EXEC to deactivate lanes as appropriate.
990   MachineInstr *NewTerm;
991   MachineInstr *WQMMaskMI = nullptr;
992   Register LiveMaskWQM;
993   if (IsDemote) {
994     // Demote - deactivate quads with only helper lanes
995     LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
996     WQMMaskMI =
997         BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
998     NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
999                   .addReg(Exec)
1000                   .addReg(LiveMaskWQM);
1001   } else {
1002     // Kill - deactivate lanes no longer in live mask
1003     if (Op.isImm()) {
1004       unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1005       NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
1006     } else if (!IsWQM) {
1007       NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
1008                     .addReg(Exec)
1009                     .addReg(LiveMaskReg);
1010     } else {
1011       unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
1012       NewTerm =
1013           BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
1014     }
1015   }
1016 
1017   // Update live intervals
1018   LIS->RemoveMachineInstrFromMaps(MI);
1019   MBB.remove(&MI);
1020   assert(EarlyTermMI);
1021   assert(MaskUpdateMI);
1022   assert(NewTerm);
1023   if (ComputeKilledMaskMI)
1024     LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1025   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1026   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1027   if (WQMMaskMI)
1028     LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1029   LIS->InsertMachineInstrInMaps(*NewTerm);
1030 
1031   if (CndReg) {
1032     LIS->removeInterval(CndReg);
1033     LIS->createAndComputeVirtRegInterval(CndReg);
1034   }
1035   if (TmpReg)
1036     LIS->createAndComputeVirtRegInterval(TmpReg);
1037   if (LiveMaskWQM)
1038     LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1039 
1040   return NewTerm;
1041 }
1042 
1043 // Convert a strict mode transition to a pseudo transition.
1044 // This still pre-allocates registers to prevent clobbering,
1045 // but avoids any EXEC mask changes.
1046 void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
1047                                             MachineInstr *Entry,
1048                                             MachineInstr *Exit) {
1049   assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
1050   assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
1051 
1052   Register SaveOrig = Entry->getOperand(0).getReg();
1053 
1054   MachineInstr *NewEntry =
1055     BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
1056   MachineInstr *NewExit =
1057     BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
1058 
1059   LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
1060   Exit->eraseFromParent();
1061 
1062   LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
1063   Entry->eraseFromParent();
1064 
1065   LIS->removeInterval(SaveOrig);
1066 }
1067 
1068 // Replace (or supplement) instructions accessing live mask.
1069 // This can only happen once all the live mask registers have been created
1070 // and the execute state (WQM/StrictWWM/Exact) of instructions is known.
1071 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1072   auto BII = Blocks.find(&MBB);
1073   if (BII == Blocks.end())
1074     return;
1075 
1076   const BlockInfo &BI = BII->second;
1077   if (!BI.NeedsLowering)
1078     return;
1079 
1080   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1081 
1082   SmallVector<MachineInstr *, 4> SplitPoints;
1083   char State = BI.InitialState;
1084   MachineInstr *StrictEntry = nullptr;
1085 
1086   for (MachineInstr &MI : llvm::make_early_inc_range(
1087            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1088     char PreviousState = State;
1089 
1090     if (StateTransition.count(&MI))
1091       State = StateTransition[&MI];
1092 
1093     MachineInstr *SplitPoint = nullptr;
1094     switch (MI.getOpcode()) {
1095     case AMDGPU::SI_DEMOTE_I1:
1096     case AMDGPU::SI_KILL_I1_TERMINATOR:
1097       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1098       break;
1099     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1100       SplitPoint = lowerKillF32(MBB, MI);
1101       break;
1102     case AMDGPU::ENTER_STRICT_WQM:
1103       StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
1104       break;
1105     case AMDGPU::EXIT_STRICT_WQM:
1106       if (State == StateWQM && StrictEntry) {
1107         // Transition WQM -> StrictWQM -> WQM detected.
1108         lowerPseudoStrictMode(MBB, StrictEntry, &MI);
1109       }
1110       StrictEntry = nullptr;
1111       break;
1112     case AMDGPU::ENTER_STRICT_WWM:
1113     case AMDGPU::EXIT_STRICT_WWM:
1114       StrictEntry = nullptr;
1115       break;
1116     default:
1117       break;
1118     }
1119     if (SplitPoint)
1120       SplitPoints.push_back(SplitPoint);
1121   }
1122 
1123   // Perform splitting after instruction scan to simplify iteration.
1124   if (!SplitPoints.empty()) {
1125     MachineBasicBlock *BB = &MBB;
1126     for (MachineInstr *MI : SplitPoints) {
1127       BB = splitBlock(BB, MI);
1128     }
1129   }
1130 }
1131 
1132 // Return an iterator in the (inclusive) range [First, Last] at which
1133 // instructions can be safely inserted, keeping in mind that some of the
1134 // instructions we want to add necessarily clobber SCC.
1135 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1136     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1137     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1138   if (!SaveSCC)
1139     return PreferLast ? Last : First;
1140 
1141   LiveRange &LR =
1142       LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
1143   auto MBBE = MBB.end();
1144   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1145                                      : LIS->getMBBEndIdx(&MBB);
1146   SlotIndex LastIdx =
1147       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1148   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1149   const LiveRange::Segment *S;
1150 
1151   for (;;) {
1152     S = LR.getSegmentContaining(Idx);
1153     if (!S)
1154       break;
1155 
1156     if (PreferLast) {
1157       SlotIndex Next = S->start.getBaseIndex();
1158       if (Next < FirstIdx)
1159         break;
1160       Idx = Next;
1161     } else {
1162       MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1163       assert(EndMI && "Segment does not end on valid instruction");
1164       auto NextI = std::next(EndMI->getIterator());
1165       if (NextI == MBB.end())
1166         break;
1167       SlotIndex Next = LIS->getInstructionIndex(*NextI);
1168       if (Next > LastIdx)
1169         break;
1170       Idx = Next;
1171     }
1172   }
1173 
1174   MachineBasicBlock::iterator MBBI;
1175 
1176   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1177     MBBI = MI;
1178   else {
1179     assert(Idx == LIS->getMBBEndIdx(&MBB));
1180     MBBI = MBB.end();
1181   }
1182 
1183   // Move insertion point past any operations modifying EXEC.
1184   // This assumes that the value of SCC defined by any of these operations
1185   // does not need to be preserved.
1186   while (MBBI != Last) {
1187     bool IsExecDef = false;
1188     for (const MachineOperand &MO : MBBI->operands()) {
1189       if (MO.isReg() && MO.isDef()) {
1190         IsExecDef |=
1191             MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1192       }
1193     }
1194     if (!IsExecDef)
1195       break;
1196     MBBI++;
1197     S = nullptr;
1198   }
1199 
1200   if (S)
1201     MBBI = saveSCC(MBB, MBBI);
1202 
1203   return MBBI;
1204 }
1205 
1206 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1207                               MachineBasicBlock::iterator Before,
1208                               Register SaveWQM) {
1209   MachineInstr *MI;
1210 
1211   if (SaveWQM) {
1212     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
1213              .addReg(LiveMaskReg);
1214   } else {
1215     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
1216              .addReg(Exec)
1217              .addReg(LiveMaskReg);
1218   }
1219 
1220   LIS->InsertMachineInstrInMaps(*MI);
1221   StateTransition[MI] = StateExact;
1222 }
1223 
1224 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1225                             MachineBasicBlock::iterator Before,
1226                             Register SavedWQM) {
1227   MachineInstr *MI;
1228 
1229   if (SavedWQM) {
1230     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1231              .addReg(SavedWQM);
1232   } else {
1233     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1234   }
1235 
1236   LIS->InsertMachineInstrInMaps(*MI);
1237   StateTransition[MI] = StateWQM;
1238 }
1239 
1240 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1241                                    MachineBasicBlock::iterator Before,
1242                                    Register SaveOrig, char StrictStateNeeded) {
1243   MachineInstr *MI;
1244   assert(SaveOrig);
1245   assert(StrictStateNeeded == StateStrictWWM ||
1246          StrictStateNeeded == StateStrictWQM);
1247 
1248   if (StrictStateNeeded == StateStrictWWM) {
1249     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1250                  SaveOrig)
1251              .addImm(-1);
1252   } else {
1253     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1254                  SaveOrig)
1255              .addImm(-1);
1256   }
1257   LIS->InsertMachineInstrInMaps(*MI);
1258   StateTransition[MI] = StrictStateNeeded;
1259 
1260   // Mark block as needing lower so it will be checked for unnecessary transitions.
1261   auto BII = Blocks.find(&MBB);
1262   if (BII != Blocks.end())
1263     BII->second.NeedsLowering = true;
1264 }
1265 
1266 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1267                                      MachineBasicBlock::iterator Before,
1268                                      Register SavedOrig, char NonStrictState,
1269                                      char CurrentStrictState) {
1270   MachineInstr *MI;
1271 
1272   assert(SavedOrig);
1273   assert(CurrentStrictState == StateStrictWWM ||
1274          CurrentStrictState == StateStrictWQM);
1275 
1276   if (CurrentStrictState == StateStrictWWM) {
1277     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1278                  Exec)
1279              .addReg(SavedOrig);
1280   } else {
1281     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1282                  Exec)
1283              .addReg(SavedOrig);
1284   }
1285   LIS->InsertMachineInstrInMaps(*MI);
1286   StateTransition[MI] = NonStrictState;
1287 }
1288 
1289 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1290   auto BII = Blocks.find(&MBB);
1291   if (BII == Blocks.end())
1292     return;
1293 
1294   BlockInfo &BI = BII->second;
1295 
1296   // This is a non-entry block that is WQM throughout, so no need to do
1297   // anything.
1298   if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1299     BI.InitialState = StateWQM;
1300     return;
1301   }
1302 
1303   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1304                     << ":\n");
1305 
1306   Register SavedWQMReg;
1307   Register SavedNonStrictReg;
1308   bool WQMFromExec = IsEntry;
1309   char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1310   char NonStrictState = 0;
1311   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1312 
1313   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1314   if (IsEntry) {
1315     // Skip the instruction that saves LiveMask
1316     if (II != IE && II->getOpcode() == AMDGPU::COPY)
1317       ++II;
1318   }
1319 
1320   // This stores the first instruction where it's safe to switch from WQM to
1321   // Exact or vice versa.
1322   MachineBasicBlock::iterator FirstWQM = IE;
1323 
1324   // This stores the first instruction where it's safe to switch from Strict
1325   // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1326   // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1327   // be safe to switch to/from WQM as well.
1328   MachineBasicBlock::iterator FirstStrict = IE;
1329 
1330   // Record initial state is block information.
1331   BI.InitialState = State;
1332 
1333   for (;;) {
1334     MachineBasicBlock::iterator Next = II;
1335     char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1336     char OutNeeds = 0;
1337 
1338     if (FirstWQM == IE)
1339       FirstWQM = II;
1340 
1341     if (FirstStrict == IE)
1342       FirstStrict = II;
1343 
1344     // First, figure out the allowed states (Needs) based on the propagated
1345     // flags.
1346     if (II != IE) {
1347       MachineInstr &MI = *II;
1348 
1349       if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1350         auto III = Instructions.find(&MI);
1351         if (III != Instructions.end()) {
1352           if (III->second.Needs & StateStrictWWM)
1353             Needs = StateStrictWWM;
1354           else if (III->second.Needs & StateStrictWQM)
1355             Needs = StateStrictWQM;
1356           else if (III->second.Needs & StateWQM)
1357             Needs = StateWQM;
1358           else
1359             Needs &= ~III->second.Disabled;
1360           OutNeeds = III->second.OutNeeds;
1361         }
1362       } else {
1363         // If the instruction doesn't actually need a correct EXEC, then we can
1364         // safely leave Strict mode enabled.
1365         Needs = StateExact | StateWQM | StateStrict;
1366       }
1367 
1368       if (MI.isTerminator() && OutNeeds == StateExact)
1369         Needs = StateExact;
1370 
1371       ++Next;
1372     } else {
1373       // End of basic block
1374       if (BI.OutNeeds & StateWQM)
1375         Needs = StateWQM;
1376       else if (BI.OutNeeds == StateExact)
1377         Needs = StateExact;
1378       else
1379         Needs = StateWQM | StateExact;
1380     }
1381 
1382     // Now, transition if necessary.
1383     if (!(Needs & State)) {
1384       MachineBasicBlock::iterator First;
1385       if (State == StateStrictWWM || Needs == StateStrictWWM ||
1386           State == StateStrictWQM || Needs == StateStrictWQM) {
1387         // We must switch to or from Strict mode.
1388         First = FirstStrict;
1389       } else {
1390         // We only need to switch to/from WQM, so we can use FirstWQM.
1391         First = FirstWQM;
1392       }
1393 
1394       // Whether we need to save SCC depends on start and end states.
1395       bool SaveSCC = false;
1396       switch (State) {
1397       case StateExact:
1398       case StateStrictWWM:
1399       case StateStrictWQM:
1400         // Exact/Strict -> Strict: save SCC
1401         // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1402         // Exact/Strict -> Exact: no save
1403         SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1404         break;
1405       case StateWQM:
1406         // WQM -> Exact/Strict: save SCC
1407         SaveSCC = !(Needs & StateWQM);
1408         break;
1409       default:
1410         llvm_unreachable("Unknown state");
1411         break;
1412       }
1413       MachineBasicBlock::iterator Before =
1414           prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1415 
1416       if (State & StateStrict) {
1417         assert(State == StateStrictWWM || State == StateStrictWQM);
1418         assert(SavedNonStrictReg);
1419         fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1420 
1421         LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1422         SavedNonStrictReg = 0;
1423         State = NonStrictState;
1424       }
1425 
1426       if (Needs & StateStrict) {
1427         NonStrictState = State;
1428         assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1429         assert(!SavedNonStrictReg);
1430         SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1431 
1432         toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1433         State = Needs;
1434 
1435       } else {
1436         if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1437           if (!WQMFromExec && (OutNeeds & StateWQM)) {
1438             assert(!SavedWQMReg);
1439             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1440           }
1441 
1442           toExact(MBB, Before, SavedWQMReg);
1443           State = StateExact;
1444         } else if (State == StateExact && (Needs & StateWQM) &&
1445                    !(Needs & StateExact)) {
1446           assert(WQMFromExec == (SavedWQMReg == 0));
1447 
1448           toWQM(MBB, Before, SavedWQMReg);
1449 
1450           if (SavedWQMReg) {
1451             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1452             SavedWQMReg = 0;
1453           }
1454           State = StateWQM;
1455         } else {
1456           // We can get here if we transitioned from StrictWWM to a
1457           // non-StrictWWM state that already matches our needs, but we
1458           // shouldn't need to do anything.
1459           assert(Needs & State);
1460         }
1461       }
1462     }
1463 
1464     if (Needs != (StateExact | StateWQM | StateStrict)) {
1465       if (Needs != (StateExact | StateWQM))
1466         FirstWQM = IE;
1467       FirstStrict = IE;
1468     }
1469 
1470     if (II == IE)
1471       break;
1472 
1473     II = Next;
1474   }
1475   assert(!SavedWQMReg);
1476   assert(!SavedNonStrictReg);
1477 }
1478 
1479 void SIWholeQuadMode::lowerLiveMaskQueries() {
1480   for (MachineInstr *MI : LiveMaskQueries) {
1481     const DebugLoc &DL = MI->getDebugLoc();
1482     Register Dest = MI->getOperand(0).getReg();
1483 
1484     MachineInstr *Copy =
1485         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1486             .addReg(LiveMaskReg);
1487 
1488     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1489     MI->eraseFromParent();
1490   }
1491 }
1492 
1493 void SIWholeQuadMode::lowerCopyInstrs() {
1494   for (MachineInstr *MI : LowerToMovInstrs) {
1495     assert(MI->getNumExplicitOperands() == 2);
1496 
1497     const Register Reg = MI->getOperand(0).getReg();
1498 
1499     const TargetRegisterClass *regClass =
1500         TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1501     if (TRI->isVGPRClass(regClass)) {
1502       const unsigned MovOp = TII->getMovOpcode(regClass);
1503       MI->setDesc(TII->get(MovOp));
1504 
1505       // Check that it already implicitly depends on exec (like all VALU movs
1506       // should do).
1507       assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1508         return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1509       }));
1510     } else {
1511       // Remove early-clobber and exec dependency from simple SGPR copies.
1512       // This allows some to be eliminated during/post RA.
1513       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1514       if (MI->getOperand(0).isEarlyClobber()) {
1515         LIS->removeInterval(Reg);
1516         MI->getOperand(0).setIsEarlyClobber(false);
1517         LIS->createAndComputeVirtRegInterval(Reg);
1518       }
1519       int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1520       while (Index >= 0) {
1521         MI->removeOperand(Index);
1522         Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1523       }
1524       MI->setDesc(TII->get(AMDGPU::COPY));
1525       LLVM_DEBUG(dbgs() << "  -> " << *MI);
1526     }
1527   }
1528   for (MachineInstr *MI : LowerToCopyInstrs) {
1529     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1530         MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1531       assert(MI->getNumExplicitOperands() == 3);
1532       // the only reason we should be here is V_SET_INACTIVE has
1533       // an undef input so it is being replaced by a simple copy.
1534       // There should be a second undef source that we should remove.
1535       assert(MI->getOperand(2).isUndef());
1536       MI->removeOperand(2);
1537       MI->untieRegOperand(1);
1538     } else {
1539       assert(MI->getNumExplicitOperands() == 2);
1540     }
1541 
1542     MI->setDesc(TII->get(AMDGPU::COPY));
1543   }
1544 }
1545 
1546 void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1547   for (MachineInstr *MI : KillInstrs) {
1548     MachineBasicBlock *MBB = MI->getParent();
1549     MachineInstr *SplitPoint = nullptr;
1550     switch (MI->getOpcode()) {
1551     case AMDGPU::SI_DEMOTE_I1:
1552     case AMDGPU::SI_KILL_I1_TERMINATOR:
1553       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1554       break;
1555     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1556       SplitPoint = lowerKillF32(*MBB, *MI);
1557       break;
1558     default:
1559       continue;
1560     }
1561     if (SplitPoint)
1562       splitBlock(MBB, SplitPoint);
1563   }
1564 }
1565 
1566 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1567   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1568                     << " ------------- \n");
1569   LLVM_DEBUG(MF.dump(););
1570 
1571   Instructions.clear();
1572   Blocks.clear();
1573   LiveMaskQueries.clear();
1574   LowerToCopyInstrs.clear();
1575   LowerToMovInstrs.clear();
1576   KillInstrs.clear();
1577   StateTransition.clear();
1578 
1579   ST = &MF.getSubtarget<GCNSubtarget>();
1580 
1581   TII = ST->getInstrInfo();
1582   TRI = &TII->getRegisterInfo();
1583   MRI = &MF.getRegInfo();
1584   LIS = &getAnalysis<LiveIntervals>();
1585   MDT = &getAnalysis<MachineDominatorTree>();
1586   PDT = &getAnalysis<MachinePostDominatorTree>();
1587 
1588   if (ST->isWave32()) {
1589     AndOpc = AMDGPU::S_AND_B32;
1590     AndN2Opc = AMDGPU::S_ANDN2_B32;
1591     XorOpc = AMDGPU::S_XOR_B32;
1592     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1593     OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
1594     WQMOpc = AMDGPU::S_WQM_B32;
1595     Exec = AMDGPU::EXEC_LO;
1596   } else {
1597     AndOpc = AMDGPU::S_AND_B64;
1598     AndN2Opc = AMDGPU::S_ANDN2_B64;
1599     XorOpc = AMDGPU::S_XOR_B64;
1600     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1601     OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
1602     WQMOpc = AMDGPU::S_WQM_B64;
1603     Exec = AMDGPU::EXEC;
1604   }
1605 
1606   const char GlobalFlags = analyzeFunction(MF);
1607   const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
1608 
1609   LiveMaskReg = Exec;
1610 
1611   // Shader is simple does not need any state changes or any complex lowering
1612   if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
1613       LowerToMovInstrs.empty() && KillInstrs.empty()) {
1614     lowerLiveMaskQueries();
1615     return !LiveMaskQueries.empty();
1616   }
1617 
1618   MachineBasicBlock &Entry = MF.front();
1619   MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
1620 
1621   // Store a copy of the original live mask when required
1622   if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
1623     LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1624     MachineInstr *MI =
1625         BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1626             .addReg(Exec);
1627     LIS->InsertMachineInstrInMaps(*MI);
1628   }
1629 
1630   LLVM_DEBUG(printInfo());
1631 
1632   lowerLiveMaskQueries();
1633   lowerCopyInstrs();
1634 
1635   // Shader only needs WQM
1636   if (GlobalFlags == StateWQM) {
1637     auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1638                   .addReg(Exec);
1639     LIS->InsertMachineInstrInMaps(*MI);
1640     lowerKillInstrs(true);
1641   } else {
1642     for (auto BII : Blocks)
1643       processBlock(*BII.first, BII.first == &Entry);
1644     // Lowering blocks causes block splitting so perform as a second pass.
1645     for (auto BII : Blocks)
1646       lowerBlock(*BII.first);
1647   }
1648 
1649   // Compute live range for live mask
1650   if (LiveMaskReg != Exec)
1651     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1652 
1653   // Physical registers like SCC aren't tracked by default anyway, so just
1654   // removing the ranges we computed is the simplest option for maintaining
1655   // the analysis results.
1656   LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1657 
1658   // If we performed any kills then recompute EXEC
1659   if (!KillInstrs.empty())
1660     LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1661 
1662   return true;
1663 }
1664