1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
13 /// The "strict" prefix indicates that inactive lanes do not take part in
14 /// control flow; specifically, an inactive lane enabled by strict WQM/WWM will
15 /// always be enabled irrespective of control flow decisions. Conversely, in
16 /// non-strict WQM, inactive lanes may take part in control flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
19 /// with shader side effects (stores and atomics). This pass ensures that WQM
20 /// is enabled when necessary and disabled around stores and atomics.
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 ///   S_MOV_B64 LiveMask, EXEC
25 ///   S_WQM_B64 EXEC, EXEC
26 ///
27 /// to enter WQM at the top of the function and surrounds blocks of Exact
28 /// instructions by
29 ///
30 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 ///   ...
32 ///   S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 ///   S_OR_SAVEEXEC_B64 Tmp, -1
38 ///   ...
39 ///   S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 ///  S_MOV_B64 Tmp, EXEC
46 ///  S_WQM_B64 EXEC, EXEC
47 ///  ...
48 ///  S_MOV_B64 EXEC, Tmp
49 ///
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (i.e. which instructions produce values that lead to derivative
53 /// computations).
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
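///
/// As a simplified illustration (not taken from any particular shader), a
/// block that samples an image and then stores the result could be rewritten
/// along these lines:
///
///   S_MOV_B64 LiveMask, EXEC           ; prolog: remember the live lanes
///   S_WQM_B64 EXEC, EXEC               ; enter WQM for the derivative work
///   IMAGE_SAMPLE ...                   ; needs WQM (implicit derivatives)
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask   ; drop helper lanes for the store
///   BUFFER_STORE_DWORD ...             ; Exact: must not run in helper lanes
///   S_MOV_B64 EXEC, Tmp                ; restore the WQM exec mask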
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 ///  (1) at the top level (outside of control flow statements, and as long as
60 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
61 ///      the LiveMask (this is implemented for the entry block).
62 ///
63 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
64 ///      consist of exact and don't-care instructions, the switch only has to
65 ///      be done at the entry and exit points rather than potentially in each
66 ///      block of the region.
67 ///
68 //===----------------------------------------------------------------------===//
69 
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-wqm"
88 
89 namespace {
90 
91 enum {
92   StateWQM = 0x1,
93   StateStrictWWM = 0x2,
94   StateStrictWQM = 0x4,
95   StateExact = 0x8,
96   StateStrict = StateStrictWWM | StateStrictWQM,
97 };
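// Note that these flags combine as a bitmask: a Needs value of
// (StateWQM | StateExact) means either non-strict mode is acceptable, and
// StateStrict groups both strict modes.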
98 
99 struct PrintState {
100 public:
101   int State;
102 
103   explicit PrintState(int State) : State(State) {}
104 };
105 
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108 
109   static const std::pair<char, const char *> Mapping[] = {
110       std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111       std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112   char State = PS.State;
113   for (auto M : Mapping) {
114     if (State & M.first) {
115       OS << M.second;
116       State &= ~M.first;
117 
118       if (State)
119         OS << '|';
120     }
121   }
122   assert(State == 0);
123   return OS;
124 }
125 #endif
126 
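// Per-instruction analysis results. Needs holds the states this instruction
// must execute in, Disabled holds states that must never be applied to it,
// and OutNeeds holds the states required after it (propagated backwards
// through the block).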
127 struct InstrInfo {
128   char Needs = 0;
129   char Disabled = 0;
130   char OutNeeds = 0;
131 };
132 
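// Per-block analysis results, mirroring the instruction flags at block
// granularity. InNeeds/OutNeeds are propagated across the CFG; InitialState
// records the mode the block is entered in once rewriting is done, and
// NeedsLowering marks blocks containing kill/demote pseudos.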
133 struct BlockInfo {
134   char Needs = 0;
135   char InNeeds = 0;
136   char OutNeeds = 0;
137   char InitialState = 0;
138   bool NeedsLowering = false;
139 };
140 
141 struct WorkItem {
142   MachineBasicBlock *MBB = nullptr;
143   MachineInstr *MI = nullptr;
144 
145   WorkItem() = default;
146   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
147   WorkItem(MachineInstr *MI) : MI(MI) {}
148 };
149 
150 class SIWholeQuadMode : public MachineFunctionPass {
151 private:
152   const SIInstrInfo *TII;
153   const SIRegisterInfo *TRI;
154   const GCNSubtarget *ST;
155   MachineRegisterInfo *MRI;
156   LiveIntervals *LIS;
157   MachineDominatorTree *MDT;
158   MachinePostDominatorTree *PDT;
159 
160   unsigned AndOpc;
161   unsigned AndTermOpc;
162   unsigned AndN2Opc;
163   unsigned XorOpc;
164   unsigned AndSaveExecOpc;
165   unsigned AndSaveExecTermOpc;
166   unsigned WQMOpc;
167   Register Exec;
168   Register LiveMaskReg;
169 
170   DenseMap<const MachineInstr *, InstrInfo> Instructions;
171   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
172 
173   // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
174   DenseMap<const MachineInstr *, char> StateTransition;
175 
176   SmallVector<MachineInstr *, 2> LiveMaskQueries;
177   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
178   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
179   SmallVector<MachineInstr *, 4> KillInstrs;
180   SmallVector<MachineInstr *, 4> InitExecInstrs;
181 
182   void printInfo();
183 
184   void markInstruction(MachineInstr &MI, char Flag,
185                        std::vector<WorkItem> &Worklist);
186   void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
187                 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
188   void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
189                    std::vector<WorkItem> &Worklist);
190   void markInstructionUses(const MachineInstr &MI, char Flag,
191                            std::vector<WorkItem> &Worklist);
192   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
193   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
194   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
195   char analyzeFunction(MachineFunction &MF);
196 
197   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
198                                       MachineBasicBlock::iterator Before);
199   MachineBasicBlock::iterator
200   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
201                    MachineBasicBlock::iterator Last, bool PreferLast,
202                    bool SaveSCC);
203   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
204                Register SaveWQM);
205   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206              Register SavedWQM);
207   void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
208                     Register SaveOrig, char StrictStateNeeded);
209   void fromStrictMode(MachineBasicBlock &MBB,
210                       MachineBasicBlock::iterator Before, Register SavedOrig,
211                       char NonStrictState, char CurrentStrictState);
212 
213   MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
214 
215   MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
216                             bool IsWQM);
217   MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
218 
219   void lowerBlock(MachineBasicBlock &MBB);
220   void processBlock(MachineBasicBlock &MBB, bool IsEntry);
221 
222   bool lowerLiveMaskQueries();
223   bool lowerCopyInstrs();
224   bool lowerKillInstrs(bool IsWQM);
225   void lowerInitExec(MachineInstr &MI);
226   MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
227                                                   bool &Changed);
228 
229 public:
230   static char ID;
231 
232   SIWholeQuadMode() :
233     MachineFunctionPass(ID) { }
234 
235   bool runOnMachineFunction(MachineFunction &MF) override;
236 
237   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
238 
239   void getAnalysisUsage(AnalysisUsage &AU) const override {
240     AU.addRequired<LiveIntervalsWrapperPass>();
241     AU.addPreserved<SlotIndexesWrapperPass>();
242     AU.addPreserved<LiveIntervalsWrapperPass>();
243     AU.addPreserved<MachineDominatorTreeWrapperPass>();
244     AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
245     MachineFunctionPass::getAnalysisUsage(AU);
246   }
247 
248   MachineFunctionProperties getClearedProperties() const override {
249     return MachineFunctionProperties().set(
250         MachineFunctionProperties::Property::IsSSA);
251   }
252 };
253 
254 } // end anonymous namespace
255 
256 char SIWholeQuadMode::ID = 0;
257 
258 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
259                       false)
260 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
261 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
262 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
263 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
264                     false)
265 
266 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
267 
268 FunctionPass *llvm::createSIWholeQuadModePass() {
269   return new SIWholeQuadMode;
270 }
271 
272 #ifndef NDEBUG
273 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
274   for (const auto &BII : Blocks) {
275     dbgs() << "\n"
276            << printMBBReference(*BII.first) << ":\n"
277            << "  InNeeds = " << PrintState(BII.second.InNeeds)
278            << ", Needs = " << PrintState(BII.second.Needs)
279            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
280 
281     for (const MachineInstr &MI : *BII.first) {
282       auto III = Instructions.find(&MI);
283       if (III != Instructions.end()) {
284         dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
285                << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
286       }
287     }
288   }
289 }
290 #endif
291 
292 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
293                                       std::vector<WorkItem> &Worklist) {
294   InstrInfo &II = Instructions[&MI];
295 
296   assert(!(Flag & StateExact) && Flag != 0);
297 
298   // Remove any disabled states from the flag. The user that required it gets
299   // an undefined value in the helper lanes. For example, this can happen if
300   // the result of an atomic is used by an instruction that requires WQM, where
301   // ignoring the request for WQM is correct as per the relevant specs.
302   Flag &= ~II.Disabled;
303 
304   // Ignore if the flag is already encompassed by the existing needs, or we
305   // just disabled everything.
306   if ((II.Needs & Flag) == Flag)
307     return;
308 
309   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
310   II.Needs |= Flag;
311   Worklist.emplace_back(&MI);
312 }
313 
314 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
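/// The live range is walked depth-first through its value graph: for each
/// value the defining instruction is marked if it defines lanes of the use,
/// phi-defs fan out to their predecessors via an explicit stack, and a branch
/// of the walk stops once every lane of the use has been defined.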
315 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
316                                Register Reg, unsigned SubReg, char Flag,
317                                std::vector<WorkItem> &Worklist) {
318   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
319 
320   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
321   const VNInfo *Value = UseLRQ.valueIn();
322   if (!Value)
323     return;
324 
325   // Note: this code assumes that lane masks on AMDGPU completely
326   // cover registers.
327   const LaneBitmask UseLanes =
328       SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
329              : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
330                                 : LaneBitmask::getNone());
331 
332   // Perform a depth-first iteration of the LiveRange graph marking defs.
333   // Stop processing of a given branch when all use lanes have been defined.
334   // The first definition stops processing for a physical register.
335   struct PhiEntry {
336     const VNInfo *Phi;
337     unsigned PredIdx;
338     LaneBitmask DefinedLanes;
339 
340     PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
341         : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
342   };
343   using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
344   SmallVector<PhiEntry, 2> PhiStack;
345   SmallSet<VisitKey, 4> Visited;
346   LaneBitmask DefinedLanes;
347   unsigned NextPredIdx = 0; // Only used for processing phi nodes
348   do {
349     const VNInfo *NextValue = nullptr;
350     const VisitKey Key(Value, DefinedLanes);
351 
352     if (Visited.insert(Key).second) {
353       // On the first visit to a phi, start processing at the first predecessor
354       NextPredIdx = 0;
355     }
356 
357     if (Value->isPHIDef()) {
358       // Each predecessor node in the phi must be processed as a subgraph
359       const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
360       assert(MBB && "Phi-def has no defining MBB");
361 
362       // Find next predecessor to process
363       unsigned Idx = NextPredIdx;
364       auto PI = MBB->pred_begin() + Idx;
365       auto PE = MBB->pred_end();
366       for (; PI != PE && !NextValue; ++PI, ++Idx) {
367         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
368           if (!Visited.count(VisitKey(VN, DefinedLanes)))
369             NextValue = VN;
370         }
371       }
372 
373       // If there are more predecessors to process, add the phi to the stack
374       if (PI != PE)
375         PhiStack.emplace_back(Value, Idx, DefinedLanes);
376     } else {
377       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
378       assert(MI && "Def has no defining instruction");
379 
380       if (Reg.isVirtual()) {
381         // Iterate over all operands to find relevant definitions
382         bool HasDef = false;
383         for (const MachineOperand &Op : MI->all_defs()) {
384           if (Op.getReg() != Reg)
385             continue;
386 
387           // Compute lanes defined and overlap with use
388           LaneBitmask OpLanes =
389               Op.isUndef() ? LaneBitmask::getAll()
390                            : TRI->getSubRegIndexLaneMask(Op.getSubReg());
391           LaneBitmask Overlap = (UseLanes & OpLanes);
392 
393           // Record whether this instruction defined any lanes of the use
394           HasDef |= Overlap.any();
395 
396           // Mark any lanes defined
397           DefinedLanes |= OpLanes;
398         }
399 
400         // Check if all lanes of use have been defined
401         if ((DefinedLanes & UseLanes) != UseLanes) {
402           // Definition not complete; need to process input value
403           LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
404           if (const VNInfo *VN = LRQ.valueIn()) {
405             if (!Visited.count(VisitKey(VN, DefinedLanes)))
406               NextValue = VN;
407           }
408         }
409 
410         // Only mark the instruction if it defines some part of the use
411         if (HasDef)
412           markInstruction(*MI, Flag, Worklist);
413       } else {
414         // For physical registers simply mark the defining instruction
415         markInstruction(*MI, Flag, Worklist);
416       }
417     }
418 
419     if (!NextValue && !PhiStack.empty()) {
420       // Reached the end of a chain; revert to processing the last phi
421       PhiEntry &Entry = PhiStack.back();
422       NextValue = Entry.Phi;
423       NextPredIdx = Entry.PredIdx;
424       DefinedLanes = Entry.DefinedLanes;
425       PhiStack.pop_back();
426     }
427 
428     Value = NextValue;
429   } while (Value);
430 }
431 
432 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
433                                   const MachineOperand &Op, char Flag,
434                                   std::vector<WorkItem> &Worklist) {
435   assert(Op.isReg());
436   Register Reg = Op.getReg();
437 
438   // Ignore some hardware registers
439   switch (Reg) {
440   case AMDGPU::EXEC:
441   case AMDGPU::EXEC_LO:
442     return;
443   default:
444     break;
445   }
446 
447   LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
448                     << " for " << MI);
449   if (Reg.isVirtual()) {
450     LiveRange &LR = LIS->getInterval(Reg);
451     markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
452   } else {
453     // Handle physical registers that we need to track; this is mostly relevant
454     // for VCC, which can appear as the (implicit) input of a uniform branch,
455     // e.g. when a loop counter is stored in a VGPR.
456     for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
457       LiveRange &LR = LIS->getRegUnit(Unit);
458       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
459       if (Value)
460         markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
461     }
462   }
463 }
464 
465 /// Mark all instructions defining the uses in \p MI with \p Flag.
466 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
467                                           std::vector<WorkItem> &Worklist) {
468   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
469                     << MI);
470 
471   for (const MachineOperand &Use : MI.all_uses())
472     markOperand(MI, Use, Flag, Worklist);
473 }
474 
475 // Scan instructions to determine which ones require an Exact execmask and
476 // which ones seed WQM requirements.
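// For example, image sampling instructions seed WQM (when implicit
// derivatives are required), the WQM/SOFT_WQM and STRICT_* pseudos seed their
// respective states, and instructions that disable WQM (stores, atomics)
// force Exact.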
477 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
478                                        std::vector<WorkItem> &Worklist) {
479   char GlobalFlags = 0;
480   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
481   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
482   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
483   bool HasImplicitDerivatives =
484       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
485 
486   // We need to visit the basic blocks in reverse post-order so that we visit
487   // defs before uses, in particular so that we don't accidentally mark an
488   // instruction as needing e.g. WQM before visiting it and realizing it needs
489   // WQM disabled.
490   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
491   for (MachineBasicBlock *MBB : RPOT) {
492     BlockInfo &BBI = Blocks[MBB];
493 
494     for (MachineInstr &MI : *MBB) {
495       InstrInfo &III = Instructions[&MI];
496       unsigned Opcode = MI.getOpcode();
497       char Flags = 0;
498 
499       if (TII->isWQM(Opcode)) {
500         // If LOD is not supported WQM is not needed.
501         // Only generate implicit WQM if implicit derivatives are required.
502         // This avoids inserting unintended WQM if a shader type without
503         // implicit derivatives uses an image sampling instruction.
504         if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
505           // Sampling instructions don't need to produce results for all pixels
506           // in a quad, they just require all inputs of a quad to have been
507           // computed for derivatives.
508           markInstructionUses(MI, StateWQM, Worklist);
509           GlobalFlags |= StateWQM;
510         }
511       } else if (Opcode == AMDGPU::WQM) {
512         // The WQM intrinsic requires its output to have all the helper lanes
513         // correct, so we need it to be in WQM.
514         Flags = StateWQM;
515         LowerToCopyInstrs.push_back(&MI);
516       } else if (Opcode == AMDGPU::SOFT_WQM) {
517         LowerToCopyInstrs.push_back(&MI);
518         SoftWQMInstrs.push_back(&MI);
519       } else if (Opcode == AMDGPU::STRICT_WWM) {
520         // The STRICT_WWM intrinsic doesn't make the same guarantee, and in
521         // addition it needs to be executed in WQM or Exact so that its copy
522         // doesn't clobber inactive lanes.
523         markInstructionUses(MI, StateStrictWWM, Worklist);
524         GlobalFlags |= StateStrictWWM;
525         LowerToMovInstrs.push_back(&MI);
526       } else if (Opcode == AMDGPU::STRICT_WQM ||
527                  TII->isDualSourceBlendEXP(MI)) {
528         // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
529         // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
530         // quads that have at least one active thread.
531         markInstructionUses(MI, StateStrictWQM, Worklist);
532         GlobalFlags |= StateStrictWQM;
533 
534         if (Opcode == AMDGPU::STRICT_WQM) {
535           LowerToMovInstrs.push_back(&MI);
536         } else {
537           // Dual source blend export acts as implicit strict-wqm; its sources
538           // need to be shuffled in strict WQM, but the export itself needs to
539           // run in exact mode.
540           BBI.Needs |= StateExact;
541           if (!(BBI.InNeeds & StateExact)) {
542             BBI.InNeeds |= StateExact;
543             Worklist.emplace_back(MBB);
544           }
545           GlobalFlags |= StateExact;
546           III.Disabled = StateWQM | StateStrict;
547         }
548       } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
549                  Opcode == AMDGPU::DS_PARAM_LOAD ||
550                  Opcode == AMDGPU::LDS_DIRECT_LOAD ||
551                  Opcode == AMDGPU::DS_DIRECT_LOAD) {
552         // Mark these StrictWQM, but only for the instruction, not its operands.
553         // This avoids unnecessarily marking M0 as requiring WQM.
554         III.Needs |= StateStrictWQM;
555         GlobalFlags |= StateStrictWQM;
556       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
557                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
558         III.Disabled = StateStrict;
559         MachineOperand &Inactive = MI.getOperand(2);
560         if (Inactive.isReg()) {
561           if (Inactive.isUndef()) {
562             LowerToCopyInstrs.push_back(&MI);
563           } else {
564             markOperand(MI, Inactive, StateStrictWWM, Worklist);
565           }
566         }
567         SetInactiveInstrs.push_back(&MI);
568       } else if (TII->isDisableWQM(MI)) {
569         BBI.Needs |= StateExact;
570         if (!(BBI.InNeeds & StateExact)) {
571           BBI.InNeeds |= StateExact;
572           Worklist.emplace_back(MBB);
573         }
574         GlobalFlags |= StateExact;
575         III.Disabled = StateWQM | StateStrict;
576       } else if (Opcode == AMDGPU::SI_PS_LIVE ||
577                  Opcode == AMDGPU::SI_LIVE_MASK) {
578         LiveMaskQueries.push_back(&MI);
579       } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
580                  Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
581                  Opcode == AMDGPU::SI_DEMOTE_I1) {
582         KillInstrs.push_back(&MI);
583         BBI.NeedsLowering = true;
584       } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
585                  Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
586         InitExecInstrs.push_back(&MI);
587       } else if (WQMOutputs) {
588         // The function is in machine SSA form, which means that physical
589         // VGPRs correspond to shader inputs and outputs. Inputs are
590         // only used, outputs are only defined.
591         // FIXME: is this still valid?
592         for (const MachineOperand &MO : MI.defs()) {
593           Register Reg = MO.getReg();
594           if (Reg.isPhysical() &&
595               TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
596             Flags = StateWQM;
597             break;
598           }
599         }
600       }
601 
602       if (Flags) {
603         markInstruction(MI, Flags, Worklist);
604         GlobalFlags |= Flags;
605       }
606     }
607   }
608 
609   // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
610   // ever used anywhere in the function. This implements the corresponding
611   // semantics of @llvm.amdgcn.set.inactive.
612   // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
613   if (GlobalFlags & StateWQM) {
614     for (MachineInstr *MI : SetInactiveInstrs)
615       markInstruction(*MI, StateWQM, Worklist);
616     for (MachineInstr *MI : SoftWQMInstrs)
617       markInstruction(*MI, StateWQM, Worklist);
618   }
619 
620   return GlobalFlags;
621 }
622 
623 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
624                                            std::vector<WorkItem>& Worklist) {
625   MachineBasicBlock *MBB = MI.getParent();
626   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
627   BlockInfo &BI = Blocks[MBB];
628 
629   // Control flow-type instructions and stores to temporary memory that are
630   // followed by WQM computations must themselves be in WQM.
631   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
632       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
633     Instructions[&MI].Needs = StateWQM;
634     II.Needs = StateWQM;
635   }
636 
637   // Propagate to block level
638   if (II.Needs & StateWQM) {
639     BI.Needs |= StateWQM;
640     if (!(BI.InNeeds & StateWQM)) {
641       BI.InNeeds |= StateWQM;
642       Worklist.emplace_back(MBB);
643     }
644   }
645 
646   // Propagate backwards within block
647   if (MachineInstr *PrevMI = MI.getPrevNode()) {
648     char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
649     if (!PrevMI->isPHI()) {
650       InstrInfo &PrevII = Instructions[PrevMI];
651       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
652         PrevII.OutNeeds |= InNeeds;
653         Worklist.emplace_back(PrevMI);
654       }
655     }
656   }
657 
658   // Propagate WQM flag to instruction inputs
659   assert(!(II.Needs & StateExact));
660 
661   if (II.Needs != 0)
662     markInstructionUses(MI, II.Needs, Worklist);
663 
664   // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
665   // not require any WQM transitions.
666   if (II.Needs & StateStrictWWM)
667     BI.Needs |= StateStrictWWM;
668   if (II.Needs & StateStrictWQM)
669     BI.Needs |= StateStrictWQM;
670 }
671 
672 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
673                                      std::vector<WorkItem>& Worklist) {
674   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
675 
676   // Propagate through instructions
677   if (!MBB.empty()) {
678     MachineInstr *LastMI = &*MBB.rbegin();
679     InstrInfo &LastII = Instructions[LastMI];
680     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
681       LastII.OutNeeds |= BI.OutNeeds;
682       Worklist.emplace_back(LastMI);
683     }
684   }
685 
686   // Predecessor blocks must provide for our WQM/Exact needs.
687   for (MachineBasicBlock *Pred : MBB.predecessors()) {
688     BlockInfo &PredBI = Blocks[Pred];
689     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
690       continue;
691 
692     PredBI.OutNeeds |= BI.InNeeds;
693     PredBI.InNeeds |= BI.InNeeds;
694     Worklist.emplace_back(Pred);
695   }
696 
697   // All successors must be prepared to accept the same set of WQM/Exact data.
698   for (MachineBasicBlock *Succ : MBB.successors()) {
699     BlockInfo &SuccBI = Blocks[Succ];
700     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
701       continue;
702 
703     SuccBI.InNeeds |= BI.OutNeeds;
704     Worklist.emplace_back(Succ);
705   }
706 }
707 
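// Run the analysis to a fixed point: scanInstructions seeds the worklist,
// instruction items then propagate their needs backwards to defining
// instructions, and block items propagate In/OutNeeds across predecessors
// and successors until nothing changes.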
708 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
709   std::vector<WorkItem> Worklist;
710   char GlobalFlags = scanInstructions(MF, Worklist);
711 
712   while (!Worklist.empty()) {
713     WorkItem WI = Worklist.back();
714     Worklist.pop_back();
715 
716     if (WI.MI)
717       propagateInstruction(*WI.MI, Worklist);
718     else
719       propagateBlock(*WI.MBB, Worklist);
720   }
721 
722   return GlobalFlags;
723 }
724 
725 MachineBasicBlock::iterator
726 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
727                          MachineBasicBlock::iterator Before) {
728   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
729 
730   MachineInstr *Save =
731       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
732           .addReg(AMDGPU::SCC);
733   MachineInstr *Restore =
734       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
735           .addReg(SaveReg);
736 
737   LIS->InsertMachineInstrInMaps(*Save);
738   LIS->InsertMachineInstrInMaps(*Restore);
739   LIS->createAndComputeVirtRegInterval(SaveReg);
740 
741   return Restore;
742 }
743 
744 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
745                                                MachineInstr *TermMI) {
746   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
747                     << *TermMI << "\n");
748 
749   MachineBasicBlock *SplitBB =
750       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
751 
752   // Convert last instruction in block to a terminator.
753   // Note: this only covers the expected patterns
754   unsigned NewOpcode = 0;
755   switch (TermMI->getOpcode()) {
756   case AMDGPU::S_AND_B32:
757     NewOpcode = AMDGPU::S_AND_B32_term;
758     break;
759   case AMDGPU::S_AND_B64:
760     NewOpcode = AMDGPU::S_AND_B64_term;
761     break;
762   case AMDGPU::S_MOV_B32:
763     NewOpcode = AMDGPU::S_MOV_B32_term;
764     break;
765   case AMDGPU::S_MOV_B64:
766     NewOpcode = AMDGPU::S_MOV_B64_term;
767     break;
768   default:
769     break;
770   }
771   if (NewOpcode)
772     TermMI->setDesc(TII->get(NewOpcode));
773 
774   if (SplitBB != BB) {
775     // Update dominator trees
776     using DomTreeT = DomTreeBase<MachineBasicBlock>;
777     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
778     for (MachineBasicBlock *Succ : SplitBB->successors()) {
779       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
780       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
781     }
782     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
783     if (MDT)
784       MDT->getBase().applyUpdates(DTUpdates);
785     if (PDT)
786       PDT->applyUpdates(DTUpdates);
787 
788     // Link blocks
789     MachineInstr *MI =
790         BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
791             .addMBB(SplitBB);
792     LIS->InsertMachineInstrInMaps(*MI);
793   }
794 
795   return SplitBB;
796 }
797 
798 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
799                                             MachineInstr &MI) {
800   assert(LiveMaskReg.isVirtual());
801 
802   const DebugLoc &DL = MI.getDebugLoc();
803   unsigned Opcode = 0;
804 
805   assert(MI.getOperand(0).isReg());
806 
807   // Comparison is for live lanes; however, here we compute the inverse
808   // (killed lanes).  This is because VCMP will always generate 0 bits
809   // for inactive lanes so a mask of live lanes would not be correct
810   // inside control flow.
811   // Invert the comparison by swapping the operands and adjusting
812   // the comparison codes.
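  // For example, a SETLT condition keeping lanes where Op0 < Op1 becomes
  // V_CMP_NGT on the swapped operands (Op1, Op0), i.e. VCC is set for lanes
  // where !(Op1 > Op0), which are exactly the lanes to kill.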
813 
814   switch (MI.getOperand(2).getImm()) {
815   case ISD::SETUEQ:
816     Opcode = AMDGPU::V_CMP_LG_F32_e64;
817     break;
818   case ISD::SETUGT:
819     Opcode = AMDGPU::V_CMP_GE_F32_e64;
820     break;
821   case ISD::SETUGE:
822     Opcode = AMDGPU::V_CMP_GT_F32_e64;
823     break;
824   case ISD::SETULT:
825     Opcode = AMDGPU::V_CMP_LE_F32_e64;
826     break;
827   case ISD::SETULE:
828     Opcode = AMDGPU::V_CMP_LT_F32_e64;
829     break;
830   case ISD::SETUNE:
831     Opcode = AMDGPU::V_CMP_EQ_F32_e64;
832     break;
833   case ISD::SETO:
834     Opcode = AMDGPU::V_CMP_O_F32_e64;
835     break;
836   case ISD::SETUO:
837     Opcode = AMDGPU::V_CMP_U_F32_e64;
838     break;
839   case ISD::SETOEQ:
840   case ISD::SETEQ:
841     Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
842     break;
843   case ISD::SETOGT:
844   case ISD::SETGT:
845     Opcode = AMDGPU::V_CMP_NLT_F32_e64;
846     break;
847   case ISD::SETOGE:
848   case ISD::SETGE:
849     Opcode = AMDGPU::V_CMP_NLE_F32_e64;
850     break;
851   case ISD::SETOLT:
852   case ISD::SETLT:
853     Opcode = AMDGPU::V_CMP_NGT_F32_e64;
854     break;
855   case ISD::SETOLE:
856   case ISD::SETLE:
857     Opcode = AMDGPU::V_CMP_NGE_F32_e64;
858     break;
859   case ISD::SETONE:
860   case ISD::SETNE:
861     Opcode = AMDGPU::V_CMP_NLG_F32_e64;
862     break;
863   default:
864     llvm_unreachable("invalid ISD:SET cond code");
865   }
866 
867   // Pick opcode based on comparison type.
868   MachineInstr *VcmpMI;
869   const MachineOperand &Op0 = MI.getOperand(0);
870   const MachineOperand &Op1 = MI.getOperand(1);
871 
872   // VCC represents lanes killed.
873   Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
874 
875   if (TRI->isVGPR(*MRI, Op0.getReg())) {
876     Opcode = AMDGPU::getVOPe32(Opcode);
877     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
878   } else {
879     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
880                  .addReg(VCC, RegState::Define)
881                  .addImm(0) // src0 modifiers
882                  .add(Op1)
883                  .addImm(0) // src1 modifiers
884                  .add(Op0)
885                  .addImm(0); // omod
886   }
887 
888   MachineInstr *MaskUpdateMI =
889       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
890           .addReg(LiveMaskReg)
891           .addReg(VCC);
892 
893   // The state of SCC represents whether any lanes are live in the mask;
894   // if SCC is 0 then no lanes will be alive anymore.
895   MachineInstr *EarlyTermMI =
896       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
897 
898   MachineInstr *ExecMaskMI =
899       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
900 
901   assert(MBB.succ_size() == 1);
902   MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
903                               .addMBB(*MBB.succ_begin());
904 
905   // Update live intervals
906   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
907   MBB.remove(&MI);
908 
909   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
910   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
911   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
912   LIS->InsertMachineInstrInMaps(*NewTerm);
913 
914   return NewTerm;
915 }
916 
917 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
918                                            MachineInstr &MI, bool IsWQM) {
919   assert(LiveMaskReg.isVirtual());
920 
921   const DebugLoc &DL = MI.getDebugLoc();
922   MachineInstr *MaskUpdateMI = nullptr;
923 
924   const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
925   const MachineOperand &Op = MI.getOperand(0);
926   int64_t KillVal = MI.getOperand(1).getImm();
927   MachineInstr *ComputeKilledMaskMI = nullptr;
928   Register CndReg = !Op.isImm() ? Op.getReg() : Register();
929   Register TmpReg;
930 
931   // Is this a static or dynamic kill?
932   if (Op.isImm()) {
933     if (Op.getImm() == KillVal) {
934       // Static: all active lanes are killed
935       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
936                          .addReg(LiveMaskReg)
937                          .addReg(Exec);
938     } else {
939       // Static: kill does nothing
940       MachineInstr *NewTerm = nullptr;
941       if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
942         LIS->RemoveMachineInstrFromMaps(MI);
943       } else {
944         assert(MBB.succ_size() == 1);
945         NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
946                       .addMBB(*MBB.succ_begin());
947         LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
948       }
949       MBB.remove(&MI);
950       return NewTerm;
951     }
952   } else {
953     if (!KillVal) {
954       // Op represents live lanes after kill,
955       // so exec mask needs to be factored in.
956       TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
957       ComputeKilledMaskMI =
958           BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
959       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
960                          .addReg(LiveMaskReg)
961                          .addReg(TmpReg);
962     } else {
963       // Op represents lanes to kill
964       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
965                          .addReg(LiveMaskReg)
966                          .add(Op);
967     }
968   }
969 
970   // The state of SCC represents whether any lanes are live in the mask;
971   // if SCC is 0 then no lanes will be alive anymore.
972   MachineInstr *EarlyTermMI =
973       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
974 
975   // If we got this far, some lanes are still live;
976   // update EXEC to deactivate lanes as appropriate.
977   MachineInstr *NewTerm;
978   MachineInstr *WQMMaskMI = nullptr;
979   Register LiveMaskWQM;
980   if (IsDemote) {
981     // Demote - deactivate quads with only helper lanes
982     LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
983     WQMMaskMI =
984         BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
985     NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
986                   .addReg(Exec)
987                   .addReg(LiveMaskWQM);
988   } else {
989     // Kill - deactivate lanes no longer in live mask
990     if (Op.isImm()) {
991       unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
992       NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
993     } else if (!IsWQM) {
994       NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
995                     .addReg(Exec)
996                     .addReg(LiveMaskReg);
997     } else {
998       unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
999       NewTerm =
1000           BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
1001     }
1002   }
1003 
1004   // Update live intervals
1005   LIS->RemoveMachineInstrFromMaps(MI);
1006   MBB.remove(&MI);
1007   assert(EarlyTermMI);
1008   assert(MaskUpdateMI);
1009   assert(NewTerm);
1010   if (ComputeKilledMaskMI)
1011     LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1012   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1013   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1014   if (WQMMaskMI)
1015     LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1016   LIS->InsertMachineInstrInMaps(*NewTerm);
1017 
1018   if (CndReg) {
1019     LIS->removeInterval(CndReg);
1020     LIS->createAndComputeVirtRegInterval(CndReg);
1021   }
1022   if (TmpReg)
1023     LIS->createAndComputeVirtRegInterval(TmpReg);
1024   if (LiveMaskWQM)
1025     LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1026 
1027   return NewTerm;
1028 }
1029 
1030 // Replace (or supplement) instructions accessing live mask.
1031 // This can only happen once all the live mask registers have been created
1032 // and the execution state (WQM/StrictWWM/Exact) of instructions is known.
1033 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1034   auto BII = Blocks.find(&MBB);
1035   if (BII == Blocks.end())
1036     return;
1037 
1038   const BlockInfo &BI = BII->second;
1039   if (!BI.NeedsLowering)
1040     return;
1041 
1042   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1043 
1044   SmallVector<MachineInstr *, 4> SplitPoints;
1045   char State = BI.InitialState;
1046 
1047   for (MachineInstr &MI : llvm::make_early_inc_range(
1048            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1049     if (StateTransition.count(&MI))
1050       State = StateTransition[&MI];
1051 
1052     MachineInstr *SplitPoint = nullptr;
1053     switch (MI.getOpcode()) {
1054     case AMDGPU::SI_DEMOTE_I1:
1055     case AMDGPU::SI_KILL_I1_TERMINATOR:
1056       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1057       break;
1058     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1059       SplitPoint = lowerKillF32(MBB, MI);
1060       break;
1061     default:
1062       break;
1063     }
1064     if (SplitPoint)
1065       SplitPoints.push_back(SplitPoint);
1066   }
1067 
1068   // Perform splitting after instruction scan to simplify iteration.
1069   if (!SplitPoints.empty()) {
1070     MachineBasicBlock *BB = &MBB;
1071     for (MachineInstr *MI : SplitPoints) {
1072       BB = splitBlock(BB, MI);
1073     }
1074   }
1075 }
1076 
1077 // Return an iterator in the (inclusive) range [First, Last] at which
1078 // instructions can be safely inserted, keeping in mind that some of the
1079 // instructions we want to add necessarily clobber SCC.
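// The search walks the SCC live range within [First, Last] looking for a
// point where SCC is dead; if every candidate point has SCC live, the chosen
// insertion point is wrapped in a copy-out/copy-in of SCC via saveSCC().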
1080 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1081     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1082     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1083   if (!SaveSCC)
1084     return PreferLast ? Last : First;
1085 
1086   LiveRange &LR =
1087       LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
1088   auto MBBE = MBB.end();
1089   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1090                                      : LIS->getMBBEndIdx(&MBB);
1091   SlotIndex LastIdx =
1092       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1093   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1094   const LiveRange::Segment *S;
1095 
1096   for (;;) {
1097     S = LR.getSegmentContaining(Idx);
1098     if (!S)
1099       break;
1100 
1101     if (PreferLast) {
1102       SlotIndex Next = S->start.getBaseIndex();
1103       if (Next < FirstIdx)
1104         break;
1105       Idx = Next;
1106     } else {
1107       MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1108       assert(EndMI && "Segment does not end on valid instruction");
1109       auto NextI = std::next(EndMI->getIterator());
1110       if (NextI == MBB.end())
1111         break;
1112       SlotIndex Next = LIS->getInstructionIndex(*NextI);
1113       if (Next > LastIdx)
1114         break;
1115       Idx = Next;
1116     }
1117   }
1118 
1119   MachineBasicBlock::iterator MBBI;
1120 
1121   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1122     MBBI = MI;
1123   else {
1124     assert(Idx == LIS->getMBBEndIdx(&MBB));
1125     MBBI = MBB.end();
1126   }
1127 
1128   // Move insertion point past any operations modifying EXEC.
1129   // This assumes that the value of SCC defined by any of these operations
1130   // does not need to be preserved.
1131   while (MBBI != Last) {
1132     bool IsExecDef = false;
1133     for (const MachineOperand &MO : MBBI->all_defs()) {
1134       IsExecDef |=
1135           MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1136     }
1137     if (!IsExecDef)
1138       break;
1139     MBBI++;
1140     S = nullptr;
1141   }
1142 
1143   if (S)
1144     MBBI = saveSCC(MBB, MBBI);
1145 
1146   return MBBI;
1147 }
1148 
1149 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1150                               MachineBasicBlock::iterator Before,
1151                               Register SaveWQM) {
1152   assert(LiveMaskReg.isVirtual());
1153 
1154   bool IsTerminator = Before == MBB.end();
1155   if (!IsTerminator) {
1156     auto FirstTerm = MBB.getFirstTerminator();
1157     if (FirstTerm != MBB.end()) {
1158       SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
1159       SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1160       IsTerminator = BeforeIdx > FirstTermIdx;
1161     }
1162   }
1163 
1164   MachineInstr *MI;
1165 
1166   if (SaveWQM) {
1167     unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1168     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
1169              .addReg(LiveMaskReg);
1170   } else {
1171     unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1172     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
1173              .addReg(Exec)
1174              .addReg(LiveMaskReg);
1175   }
1176 
1177   LIS->InsertMachineInstrInMaps(*MI);
1178   StateTransition[MI] = StateExact;
1179 }
1180 
1181 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1182                             MachineBasicBlock::iterator Before,
1183                             Register SavedWQM) {
1184   MachineInstr *MI;
1185 
1186   if (SavedWQM) {
1187     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1188              .addReg(SavedWQM);
1189   } else {
1190     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1191   }
1192 
1193   LIS->InsertMachineInstrInMaps(*MI);
1194   StateTransition[MI] = StateWQM;
1195 }
1196 
1197 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1198                                    MachineBasicBlock::iterator Before,
1199                                    Register SaveOrig, char StrictStateNeeded) {
1200   MachineInstr *MI;
1201   assert(SaveOrig);
1202   assert(StrictStateNeeded == StateStrictWWM ||
1203          StrictStateNeeded == StateStrictWQM);
1204 
1205   if (StrictStateNeeded == StateStrictWWM) {
1206     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1207                  SaveOrig)
1208              .addImm(-1);
1209   } else {
1210     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1211                  SaveOrig)
1212              .addImm(-1);
1213   }
1214   LIS->InsertMachineInstrInMaps(*MI);
1215   StateTransition[MI] = StrictStateNeeded;
1216 }
1217 
1218 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1219                                      MachineBasicBlock::iterator Before,
1220                                      Register SavedOrig, char NonStrictState,
1221                                      char CurrentStrictState) {
1222   MachineInstr *MI;
1223 
1224   assert(SavedOrig);
1225   assert(CurrentStrictState == StateStrictWWM ||
1226          CurrentStrictState == StateStrictWQM);
1227 
1228   if (CurrentStrictState == StateStrictWWM) {
1229     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1230                  Exec)
1231              .addReg(SavedOrig);
1232   } else {
1233     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1234                  Exec)
1235              .addReg(SavedOrig);
1236   }
1237   LIS->InsertMachineInstrInMaps(*MI);
1238   StateTransition[MI] = NonStrictState;
1239 }
1240 
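// Rewrite a single block: walk its instructions while tracking the current
// execution state, and whenever the next instruction cannot run in that state
// insert the appropriate exec-mask transition (toExact/toWQM/toStrictMode/
// fromStrictMode) at a previously recorded safe insertion point.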
1241 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1242   auto BII = Blocks.find(&MBB);
1243   if (BII == Blocks.end())
1244     return;
1245 
1246   BlockInfo &BI = BII->second;
1247 
1248   // This is a non-entry block that is WQM throughout, so no need to do
1249   // anything.
1250   if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1251     BI.InitialState = StateWQM;
1252     return;
1253   }
1254 
1255   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1256                     << ":\n");
1257 
1258   Register SavedWQMReg;
1259   Register SavedNonStrictReg;
1260   bool WQMFromExec = IsEntry;
1261   char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1262   char NonStrictState = 0;
1263   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1264 
1265   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1266   if (IsEntry) {
1267     // Skip the instruction that saves LiveMask
1268     if (II != IE && II->getOpcode() == AMDGPU::COPY &&
1269         II->getOperand(1).getReg() == TRI->getExec())
1270       ++II;
1271   }
1272 
1273   // This stores the first instruction where it's safe to switch from WQM to
1274   // Exact or vice versa.
1275   MachineBasicBlock::iterator FirstWQM = IE;
1276 
1277   // This stores the first instruction where it's safe to switch from Strict
1278   // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1279   // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1280   // be safe to switch to/from WQM as well.
1281   MachineBasicBlock::iterator FirstStrict = IE;
1282 
1283   // Record the initial state in the block information.
1284   BI.InitialState = State;
1285 
1286   for (;;) {
1287     MachineBasicBlock::iterator Next = II;
1288     char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1289     char OutNeeds = 0;
1290 
1291     if (FirstWQM == IE)
1292       FirstWQM = II;
1293 
1294     if (FirstStrict == IE)
1295       FirstStrict = II;
1296 
1297     // First, figure out the allowed states (Needs) based on the propagated
1298     // flags.
1299     if (II != IE) {
1300       MachineInstr &MI = *II;
1301 
1302       if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1303         auto III = Instructions.find(&MI);
1304         if (III != Instructions.end()) {
1305           if (III->second.Needs & StateStrictWWM)
1306             Needs = StateStrictWWM;
1307           else if (III->second.Needs & StateStrictWQM)
1308             Needs = StateStrictWQM;
1309           else if (III->second.Needs & StateWQM)
1310             Needs = StateWQM;
1311           else
1312             Needs &= ~III->second.Disabled;
1313           OutNeeds = III->second.OutNeeds;
1314         }
1315       } else {
1316         // If the instruction doesn't actually need a correct EXEC, then we can
1317         // safely leave Strict mode enabled.
1318         Needs = StateExact | StateWQM | StateStrict;
1319       }
1320 
1321       // Exact mode exit can occur in terminators, but must be before branches.
1322       if (MI.isBranch() && OutNeeds == StateExact)
1323         Needs = StateExact;
1324 
1325       ++Next;
1326     } else {
1327       // End of basic block
1328       if (BI.OutNeeds & StateWQM)
1329         Needs = StateWQM;
1330       else if (BI.OutNeeds == StateExact)
1331         Needs = StateExact;
1332       else
1333         Needs = StateWQM | StateExact;
1334     }
1335 
1336     // Now, transition if necessary.
1337     if (!(Needs & State)) {
1338       MachineBasicBlock::iterator First;
1339       if (State == StateStrictWWM || Needs == StateStrictWWM ||
1340           State == StateStrictWQM || Needs == StateStrictWQM) {
1341         // We must switch to or from Strict mode.
1342         First = FirstStrict;
1343       } else {
1344         // We only need to switch to/from WQM, so we can use FirstWQM.
1345         First = FirstWQM;
1346       }
1347 
1348       // Whether we need to save SCC depends on start and end states.
1349       bool SaveSCC = false;
1350       switch (State) {
1351       case StateExact:
1352       case StateStrictWWM:
1353       case StateStrictWQM:
1354         // Exact/Strict -> Strict: save SCC
1355         // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1356         // Exact/Strict -> Exact: no save
1357         SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1358         break;
1359       case StateWQM:
1360         // WQM -> Exact/Strict: save SCC
1361         SaveSCC = !(Needs & StateWQM);
1362         break;
1363       default:
1364         llvm_unreachable("Unknown state");
1365         break;
1366       }
1367       MachineBasicBlock::iterator Before =
1368           prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1369 
1370       if (State & StateStrict) {
1371         assert(State == StateStrictWWM || State == StateStrictWQM);
1372         assert(SavedNonStrictReg);
1373         fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1374 
1375         LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1376         SavedNonStrictReg = 0;
1377         State = NonStrictState;
1378       }
1379 
1380       if (Needs & StateStrict) {
1381         NonStrictState = State;
1382         assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1383         assert(!SavedNonStrictReg);
1384         SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1385 
1386         toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1387         State = Needs;
1388 
1389       } else {
1390         if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1391           if (!WQMFromExec && (OutNeeds & StateWQM)) {
1392             assert(!SavedWQMReg);
1393             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1394           }
1395 
1396           toExact(MBB, Before, SavedWQMReg);
1397           State = StateExact;
1398         } else if (State == StateExact && (Needs & StateWQM) &&
1399                    !(Needs & StateExact)) {
1400           assert(WQMFromExec == (SavedWQMReg == 0));
1401 
1402           toWQM(MBB, Before, SavedWQMReg);
1403 
1404           if (SavedWQMReg) {
1405             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1406             SavedWQMReg = 0;
1407           }
1408           State = StateWQM;
1409         } else {
1410           // We can get here if we transitioned from StrictWWM to a
1411           // non-StrictWWM state that already matches our needs, in which
1412           // case we don't need to do anything.
1413           assert(Needs & State);
1414         }
1415       }
1416     }
1417 
1418     if (Needs != (StateExact | StateWQM | StateStrict)) {
1419       if (Needs != (StateExact | StateWQM))
1420         FirstWQM = IE;
1421       FirstStrict = IE;
1422     }
1423 
1424     if (II == IE)
1425       break;
1426 
1427     II = Next;
1428   }
1429   assert(!SavedWQMReg);
1430   assert(!SavedNonStrictReg);
1431 }
1432 
1433 bool SIWholeQuadMode::lowerLiveMaskQueries() {
1434   for (MachineInstr *MI : LiveMaskQueries) {
1435     const DebugLoc &DL = MI->getDebugLoc();
1436     Register Dest = MI->getOperand(0).getReg();
1437 
1438     MachineInstr *Copy =
1439         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1440             .addReg(LiveMaskReg);
1441 
1442     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1443     MI->eraseFromParent();
1444   }
1445   return !LiveMaskQueries.empty();
1446 }
1447 
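     /// Rewrite the pseudos recorded during analysis (LowerToMovInstrs and
     /// LowerToCopyInstrs) into plain moves or copies.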
1448 bool SIWholeQuadMode::lowerCopyInstrs() {
1449   for (MachineInstr *MI : LowerToMovInstrs) {
1450     assert(MI->getNumExplicitOperands() == 2);
1451 
1452     const Register Reg = MI->getOperand(0).getReg();
1453 
1454     const TargetRegisterClass *regClass =
1455         TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1456     if (TRI->isVGPRClass(regClass)) {
1457       const unsigned MovOp = TII->getMovOpcode(regClass);
1458       MI->setDesc(TII->get(MovOp));
1459 
1460       // Check that it already implicitly depends on exec (like all VALU movs
1461       // should do).
1462       assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1463         return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1464       }));
1465     } else {
1466       // Remove early-clobber and exec dependency from simple SGPR copies.
1467       // This allows some to be eliminated during/post RA.
1468       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1469       if (MI->getOperand(0).isEarlyClobber()) {
1470         LIS->removeInterval(Reg);
1471         MI->getOperand(0).setIsEarlyClobber(false);
1472         LIS->createAndComputeVirtRegInterval(Reg);
1473       }
1474       int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1475       while (Index >= 0) {
1476         MI->removeOperand(Index);
1477         Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1478       }
1479       MI->setDesc(TII->get(AMDGPU::COPY));
1480       LLVM_DEBUG(dbgs() << "  -> " << *MI);
1481     }
1482   }
1483   for (MachineInstr *MI : LowerToCopyInstrs) {
1484     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1485         MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1486       assert(MI->getNumExplicitOperands() == 3);
1487       // The only reason we should be here is that V_SET_INACTIVE has
1488       // an undef input, so it is being replaced by a simple copy.
1489       // There should be a second undef source that we should remove.
1490       assert(MI->getOperand(2).isUndef());
1491       MI->removeOperand(2);
1492       MI->untieRegOperand(1);
1493     } else {
1494       assert(MI->getNumExplicitOperands() == 2);
1495     }
1496 
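         // A register source can use a plain COPY; an immediate source needs a
         // real move instruction matching the destination register class.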
1497     unsigned CopyOp = MI->getOperand(1).isReg()
1498                           ? (unsigned)AMDGPU::COPY
1499                           : TII->getMovOpcode(TRI->getRegClassForOperandReg(
1500                                 *MRI, MI->getOperand(0)));
1501     MI->setDesc(TII->get(CopyOp));
1502   }
1503   return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
1504 }
1505 
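     /// Lower the collected kill and demote pseudos, splitting blocks where the
     /// lowering introduces new terminators.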
1506 bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1507   for (MachineInstr *MI : KillInstrs) {
1508     MachineBasicBlock *MBB = MI->getParent();
1509     MachineInstr *SplitPoint = nullptr;
1510     switch (MI->getOpcode()) {
1511     case AMDGPU::SI_DEMOTE_I1:
1512     case AMDGPU::SI_KILL_I1_TERMINATOR:
1513       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1514       break;
1515     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1516       SplitPoint = lowerKillF32(*MBB, *MI);
1517       break;
1518     }
1519     if (SplitPoint)
1520       splitBlock(MBB, SplitPoint);
1521   }
1522   return !KillInstrs.empty();
1523 }
1524 
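     /// Lower an EXEC-initialization pseudo (SI_INIT_EXEC, or the variant that
     /// takes a thread count from an SGPR input) by materializing the requested
     /// EXEC mask at the start of the block.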
1525 void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
1526   MachineBasicBlock *MBB = MI.getParent();
1527   bool IsWave32 = ST->isWave32();
1528 
1529   if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
1530     // This should be before all vector instructions.
1531     MachineInstr *InitMI =
1532         BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1533                 TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1534                 Exec)
1535             .addImm(MI.getOperand(0).getImm());
1536     if (LIS) {
1537       LIS->RemoveMachineInstrFromMaps(MI);
1538       LIS->InsertMachineInstrInMaps(*InitMI);
1539     }
1540     MI.eraseFromParent();
1541     return;
1542   }
1543 
1544   // Extract the thread count from an SGPR input and set EXEC accordingly.
1545   // Since BFM can't shift by 64, handle that case with CMP + CMOV.
1546   //
1547   // S_BFE_U32 count, input, {shift, 7}
1548   // S_BFM_B64 exec, count, 0
1549   // S_CMP_EQ_U32 count, 64
1550   // S_CMOV_B64 exec, -1
1551   Register InputReg = MI.getOperand(0).getReg();
1552   MachineInstr *FirstMI = &*MBB->begin();
1553   if (InputReg.isVirtual()) {
1554     MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
1555     assert(DefInstr && DefInstr->isCopy());
1556     if (DefInstr->getParent() == MBB) {
1557       if (DefInstr != FirstMI) {
1558         // If the `InputReg` is defined in the current block, we also need to
1559         // move that instruction to the beginning of the block.
1560         DefInstr->removeFromParent();
1561         MBB->insert(FirstMI, DefInstr);
1562         if (LIS)
1563           LIS->handleMove(*DefInstr);
1564       } else {
1565         // If first instruction is definition then move pointer after it.
1566         // If the first instruction is the definition, move the pointer past it.
1567       }
1568     }
1569   }
1570 
1571   // Insert instruction sequence at block beginning (before vector operations).
1572   const DebugLoc DL = MI.getDebugLoc();
1573   const unsigned WavefrontSize = ST->getWavefrontSize();
1574   const unsigned Mask = (WavefrontSize << 1) - 1;
1575   Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
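       // The S_BFE_U32 source-1 immediate packs the bit offset in its low bits and
       // a 7-bit field width (0x70000 = 7 << 16), wide enough for a count up to
       // the full wavefront size.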
1576   auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
1577                    .addReg(InputReg)
1578                    .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
1579   auto BfmMI =
1580       BuildMI(*MBB, FirstMI, DL,
1581               TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
1582           .addReg(CountReg)
1583           .addImm(0);
1584   auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
1585                    .addReg(CountReg, RegState::Kill)
1586                    .addImm(WavefrontSize);
1587   auto CmovMI =
1588       BuildMI(*MBB, FirstMI, DL,
1589               TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
1590               Exec)
1591           .addImm(-1);
1592 
1593   if (!LIS) {
1594     MI.eraseFromParent();
1595     return;
1596   }
1597 
1598   LIS->RemoveMachineInstrFromMaps(MI);
1599   MI.eraseFromParent();
1600 
1601   LIS->InsertMachineInstrInMaps(*BfeMI);
1602   LIS->InsertMachineInstrInMaps(*BfmMI);
1603   LIS->InsertMachineInstrInMaps(*CmpMI);
1604   LIS->InsertMachineInstrInMaps(*CmovMI);
1605 
1606   LIS->removeInterval(InputReg);
1607   LIS->createAndComputeVirtRegInterval(InputReg);
1608   LIS->createAndComputeVirtRegInterval(CountReg);
1609 }
1610 
1611 /// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
1612 /// for instructions that depend on EXEC.
1613 MachineBasicBlock::iterator
1614 SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
1615   MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
1616 
1617   for (MachineInstr *MI : InitExecInstrs) {
1618     // Try to handle undefined cases gracefully:
1619     // - multiple INIT_EXEC instructions
1620     // - INIT_EXEC instructions not in the entry block
1621     if (MI->getParent() == &Entry)
1622       InsertPt = std::next(MI->getIterator());
1623 
1624     lowerInitExec(*MI);
1625     Changed = true;
1626   }
1627 
1628   return InsertPt;
1629 }
1630 
1631 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1632   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1633                     << " ------------- \n");
1634   LLVM_DEBUG(MF.dump(););
1635 
1636   Instructions.clear();
1637   Blocks.clear();
1638   LiveMaskQueries.clear();
1639   LowerToCopyInstrs.clear();
1640   LowerToMovInstrs.clear();
1641   KillInstrs.clear();
1642   InitExecInstrs.clear();
1643   StateTransition.clear();
1644 
1645   ST = &MF.getSubtarget<GCNSubtarget>();
1646 
1647   TII = ST->getInstrInfo();
1648   TRI = &TII->getRegisterInfo();
1649   MRI = &MF.getRegInfo();
1650   LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
1651   auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
1652   MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
1653   auto *PDTWrapper =
1654       getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
1655   PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
1656 
1657   if (ST->isWave32()) {
1658     AndOpc = AMDGPU::S_AND_B32;
1659     AndTermOpc = AMDGPU::S_AND_B32_term;
1660     AndN2Opc = AMDGPU::S_ANDN2_B32;
1661     XorOpc = AMDGPU::S_XOR_B32;
1662     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1663     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
1664     WQMOpc = AMDGPU::S_WQM_B32;
1665     Exec = AMDGPU::EXEC_LO;
1666   } else {
1667     AndOpc = AMDGPU::S_AND_B64;
1668     AndTermOpc = AMDGPU::S_AND_B64_term;
1669     AndN2Opc = AMDGPU::S_ANDN2_B64;
1670     XorOpc = AMDGPU::S_XOR_B64;
1671     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1672     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
1673     WQMOpc = AMDGPU::S_WQM_B64;
1674     Exec = AMDGPU::EXEC;
1675   }
1676 
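       // GlobalFlags is the union of the execution states (Exact, WQM, strict
       // modes) required anywhere in the function; it selects the lowering path below.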
1677   const char GlobalFlags = analyzeFunction(MF);
1678   bool Changed = false;
1679 
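       // By default the live mask is EXEC itself; a separate copy is created below
       // only when something actually needs the mask from function entry.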
1680   LiveMaskReg = Exec;
1681 
1682   MachineBasicBlock &Entry = MF.front();
1683   MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);
1684 
1685   // Store a copy of the original live mask when required
1686   const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
1687   const bool HasWaveModes = GlobalFlags & ~StateExact;
1688   const bool HasKills = !KillInstrs.empty();
1689   const bool UsesWQM = GlobalFlags & StateWQM;
1690   if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
1691     LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1692     MachineInstr *MI =
1693         BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1694             .addReg(Exec);
1695     LIS->InsertMachineInstrInMaps(*MI);
1696     Changed = true;
1697   }
1698 
1699   LLVM_DEBUG(printInfo());
1700 
1701   Changed |= lowerLiveMaskQueries();
1702   Changed |= lowerCopyInstrs();
1703 
1704   if (!HasWaveModes) {
1705     // No wave mode execution; only kill instructions (if any) need lowering.
1706     Changed |= lowerKillInstrs(false);
1707   } else if (GlobalFlags == StateWQM) {
1708     // Shader only needs WQM
1709     auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1710                   .addReg(Exec);
1711     LIS->InsertMachineInstrInMaps(*MI);
1712     lowerKillInstrs(true);
1713     Changed = true;
1714   } else {
1715     // Wave mode switching requires full lowering pass.
1716     for (auto BII : Blocks)
1717       processBlock(*BII.first, BII.first == &Entry);
1718     // Lowering a block can split it, so perform lowering as a second pass.
1719     for (auto BII : Blocks)
1720       lowerBlock(*BII.first);
1721     Changed = true;
1722   }
1723 
1724   // Compute live range for live mask
1725   if (LiveMaskReg != Exec)
1726     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1727 
1728   // Physical registers like SCC aren't tracked by default anyway, so just
1729   // removing the ranges we computed is the simplest option for maintaining
1730   // the analysis results.
1731   LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1732 
1733   // If any kill or INIT_EXEC instructions were lowered, drop EXEC's cached ranges.
1734   if (!KillInstrs.empty() || !InitExecInstrs.empty())
1735     LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1736 
1737   return Changed;
1738 }
1739