xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
10b57cec5SDimitry Andric //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric /// \file
10fe6060f1SDimitry Andric /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11fe6060f1SDimitry Andric /// for pixel shaders, and strict whole wavefront mode for all programs.
12fe6060f1SDimitry Andric ///
13fe6060f1SDimitry Andric /// The "strict" prefix indicates that inactive lanes do not take part in
14fe6060f1SDimitry Andric /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
15fe6060f1SDimitry Andric /// always be enabled irrespective of control flow decisions. Conversely in
/// non-strict WQM inactive lanes may take part in control flow decisions.
170b57cec5SDimitry Andric ///
180b57cec5SDimitry Andric /// Whole quad mode is required for derivative computations, but it interferes
19fe6060f1SDimitry Andric /// with shader side effects (stores and atomics). It ensures that WQM is
20fe6060f1SDimitry Andric /// enabled when necessary, but disabled around stores and atomics.
210b57cec5SDimitry Andric ///
220b57cec5SDimitry Andric /// When necessary, this pass creates a function prolog
230b57cec5SDimitry Andric ///
240b57cec5SDimitry Andric ///   S_MOV_B64 LiveMask, EXEC
250b57cec5SDimitry Andric ///   S_WQM_B64 EXEC, EXEC
260b57cec5SDimitry Andric ///
270b57cec5SDimitry Andric /// to enter WQM at the top of the function and surrounds blocks of Exact
280b57cec5SDimitry Andric /// instructions by
290b57cec5SDimitry Andric ///
300b57cec5SDimitry Andric ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
310b57cec5SDimitry Andric ///   ...
320b57cec5SDimitry Andric ///   S_MOV_B64 EXEC, Tmp
330b57cec5SDimitry Andric ///
34fe6060f1SDimitry Andric /// We also compute when a sequence of instructions requires strict whole
35fe6060f1SDimitry Andric /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
360b57cec5SDimitry Andric ///
370b57cec5SDimitry Andric ///   S_OR_SAVEEXEC_B64 Tmp, -1
380b57cec5SDimitry Andric ///   ...
390b57cec5SDimitry Andric ///   S_MOV_B64 EXEC, Tmp
400b57cec5SDimitry Andric ///
41fe6060f1SDimitry Andric /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42fe6060f1SDimitry Andric /// we use a similar save and restore mechanism and force whole quad mode for
43fe6060f1SDimitry Andric /// those instructions:
44fe6060f1SDimitry Andric ///
45fe6060f1SDimitry Andric ///  S_MOV_B64 Tmp, EXEC
46fe6060f1SDimitry Andric ///  S_WQM_B64 EXEC, EXEC
47fe6060f1SDimitry Andric ///  ...
48fe6060f1SDimitry Andric ///  S_MOV_B64 EXEC, Tmp
49fe6060f1SDimitry Andric ///
500b57cec5SDimitry Andric /// In order to avoid excessive switching during sequences of Exact
510b57cec5SDimitry Andric /// instructions, the pass first analyzes which instructions must be run in WQM
520b57cec5SDimitry Andric /// (aka which instructions produce values that lead to derivative
530b57cec5SDimitry Andric /// computations).
540b57cec5SDimitry Andric ///
550b57cec5SDimitry Andric /// Basic blocks are always exited in WQM as long as some successor needs WQM.
560b57cec5SDimitry Andric ///
570b57cec5SDimitry Andric /// There is room for improvement given better control flow analysis:
580b57cec5SDimitry Andric ///
590b57cec5SDimitry Andric ///  (1) at the top level (outside of control flow statements, and as long as
600b57cec5SDimitry Andric ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
610b57cec5SDimitry Andric ///      the LiveMask (this is implemented for the entry block).
620b57cec5SDimitry Andric ///
630b57cec5SDimitry Andric ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
640b57cec5SDimitry Andric ///      consist of exact and don't-care instructions, the switch only has to
650b57cec5SDimitry Andric ///      be done at the entry and exit points rather than potentially in each
660b57cec5SDimitry Andric ///      block of the region.
670b57cec5SDimitry Andric ///
680b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
690b57cec5SDimitry Andric 
700b57cec5SDimitry Andric #include "AMDGPU.h"
71e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
72480093f4SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
735ffd83dbSDimitry Andric #include "llvm/ADT/MapVector.h"
740b57cec5SDimitry Andric #include "llvm/ADT/PostOrderIterator.h"
750b57cec5SDimitry Andric #include "llvm/CodeGen/LiveIntervals.h"
760b57cec5SDimitry Andric #include "llvm/CodeGen/MachineBasicBlock.h"
77fe6060f1SDimitry Andric #include "llvm/CodeGen/MachineDominators.h"
780b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
790b57cec5SDimitry Andric #include "llvm/CodeGen/MachineInstr.h"
80fe6060f1SDimitry Andric #include "llvm/CodeGen/MachinePostDominators.h"
810b57cec5SDimitry Andric #include "llvm/IR/CallingConv.h"
82480093f4SDimitry Andric #include "llvm/InitializePasses.h"
830b57cec5SDimitry Andric #include "llvm/Support/raw_ostream.h"
840b57cec5SDimitry Andric 
850b57cec5SDimitry Andric using namespace llvm;
860b57cec5SDimitry Andric 
870b57cec5SDimitry Andric #define DEBUG_TYPE "si-wqm"
880b57cec5SDimitry Andric 
890b57cec5SDimitry Andric namespace {
900b57cec5SDimitry Andric 
// Execution-mask states tracked by this pass, kept as bit flags so a single
// char can hold a set of states (see InstrInfo::Needs / BlockInfo::InNeeds
// and the bitwise masking in markInstruction).
enum {
  StateWQM = 0x1,
  StateStrictWWM = 0x2,
  StateStrictWQM = 0x4,
  StateExact = 0x8,
  // Convenience mask matching either of the two "strict" modes.
  StateStrict = StateStrictWWM | StateStrictWQM,
};
980b57cec5SDimitry Andric 
// Thin wrapper around a state bitmask; exists only so a state set can be
// streamed through the debug operator<< below.
struct PrintState {
public:
  int State;

  explicit PrintState(int S) : State(S) {}
};
1050b57cec5SDimitry Andric 
1060b57cec5SDimitry Andric #ifndef NDEBUG
operator <<(raw_ostream & OS,const PrintState & PS)1070b57cec5SDimitry Andric static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
1080b57cec5SDimitry Andric 
109fe6060f1SDimitry Andric   static const std::pair<char, const char *> Mapping[] = {
110bdd1243dSDimitry Andric       std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111bdd1243dSDimitry Andric       std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112fe6060f1SDimitry Andric   char State = PS.State;
113fe6060f1SDimitry Andric   for (auto M : Mapping) {
114fe6060f1SDimitry Andric     if (State & M.first) {
115fe6060f1SDimitry Andric       OS << M.second;
116fe6060f1SDimitry Andric       State &= ~M.first;
117fe6060f1SDimitry Andric 
118fe6060f1SDimitry Andric       if (State)
119fe6060f1SDimitry Andric         OS << '|';
120fe6060f1SDimitry Andric     }
121fe6060f1SDimitry Andric   }
122fe6060f1SDimitry Andric   assert(State == 0);
1230b57cec5SDimitry Andric   return OS;
1240b57cec5SDimitry Andric }
1250b57cec5SDimitry Andric #endif
1260b57cec5SDimitry Andric 
// Per-instruction analysis record (keyed by MachineInstr* in Instructions).
struct InstrInfo {
  char Needs = 0;    // state bits this instruction must run under
  char Disabled = 0; // state bits that must never be requested for it;
                     // markInstruction masks these out of incoming requests
  char OutNeeds = 0; // state bits required after the instruction (filled in
                     // by the propagate* routines; bodies not in this chunk)
};
1320b57cec5SDimitry Andric 
// Per-basic-block analysis/lowering record (keyed by MBB* in Blocks).
struct BlockInfo {
  char Needs = 0;    // states required somewhere inside the block
  char InNeeds = 0;  // states required on entry to the block
  char OutNeeds = 0; // states required on exit from the block
  char InitialState = 0;       // state chosen for the block start
                               // (set during processing; not visible here)
  bool NeedsLowering = false;  // block contains instructions that lowerBlock
                               // must rewrite
};
1400b57cec5SDimitry Andric 
// A unit of work for the analysis worklist: exactly one of MBB/MI is set,
// requesting (re)propagation of state flags for that block or instruction.
struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};
1490b57cec5SDimitry Andric 
class SIWholeQuadMode : public MachineFunctionPass {
private:
  // Target/analysis handles, cached per function.
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;

  // Exec-mask manipulation opcodes; presumably selected per wave size
  // (B32 vs B64 variants) when the function is set up — the assignment is
  // not in this chunk, so confirm in runOnMachineFunction.
  unsigned AndOpc;
  unsigned AndTermOpc;
  unsigned AndN2Opc;
  unsigned XorOpc;
  unsigned AndSaveExecOpc;
  unsigned AndSaveExecTermOpc;
  unsigned WQMOpc;
  Register Exec;        // the exec register for the current wave size
  Register LiveMaskReg; // holds the live mask saved in the prolog

  // Analysis results, keyed by instruction / block.
  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;

  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
  DenseMap<const MachineInstr *, char> StateTransition;

  // Instructions collected during scanning for later lowering.
  SmallVector<MachineInstr *, 2> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
  SmallVector<MachineInstr *, 4> KillInstrs;
  SmallVector<MachineInstr *, 4> InitExecInstrs;

  void printInfo();

  // Worklist-based marking: propagate required-state flags from uses back
  // to their defining instructions.
  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
  void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
                   std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  // Insertion-point helpers and state-transition emitters.
  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               Register SaveWQM);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             Register SavedWQM);
  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);

  MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);

  // Lowering of kill pseudos collected during scanning.
  MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
                            bool IsWQM);
  MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);

  void lowerBlock(MachineBasicBlock &MBB);
  void processBlock(MachineBasicBlock &MBB, bool IsEntry);

  // Each returns/records whether the function was changed.
  bool lowerLiveMaskQueries();
  bool lowerCopyInstrs();
  bool lowerKillInstrs(bool IsWQM);
  void lowerInitExec(MachineInstr &MI);
  MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
                                                  bool &Changed);

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervalsWrapperPass>();
    AU.addPreserved<SlotIndexesWrapperPass>();
    AU.addPreserved<LiveIntervalsWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getClearedProperties() const override {
    // The pass introduces multiple defs (exec save/restore), so SSA form is
    // no longer guaranteed afterwards.
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
};
2530b57cec5SDimitry Andric 
2540b57cec5SDimitry Andric } // end anonymous namespace
2550b57cec5SDimitry Andric 
char SIWholeQuadMode::ID = 0;

// Register the pass and its analysis dependencies with the legacy pass
// manager. The macro pair must bracket the INITIALIZE_PASS_DEPENDENCY lines.
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

// Factory used by the AMDGPU target to add this pass to the pipeline.
FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}
2710b57cec5SDimitry Andric 
2720b57cec5SDimitry Andric #ifndef NDEBUG
printInfo()2730b57cec5SDimitry Andric LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
2740b57cec5SDimitry Andric   for (const auto &BII : Blocks) {
2750b57cec5SDimitry Andric     dbgs() << "\n"
2760b57cec5SDimitry Andric            << printMBBReference(*BII.first) << ":\n"
2770b57cec5SDimitry Andric            << "  InNeeds = " << PrintState(BII.second.InNeeds)
2780b57cec5SDimitry Andric            << ", Needs = " << PrintState(BII.second.Needs)
2790b57cec5SDimitry Andric            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
2800b57cec5SDimitry Andric 
2810b57cec5SDimitry Andric     for (const MachineInstr &MI : *BII.first) {
2820b57cec5SDimitry Andric       auto III = Instructions.find(&MI);
283*0fca6ea1SDimitry Andric       if (III != Instructions.end()) {
2840b57cec5SDimitry Andric         dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
2850b57cec5SDimitry Andric                << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
2860b57cec5SDimitry Andric       }
2870b57cec5SDimitry Andric     }
2880b57cec5SDimitry Andric   }
289*0fca6ea1SDimitry Andric }
2900b57cec5SDimitry Andric #endif
2910b57cec5SDimitry Andric 
markInstruction(MachineInstr & MI,char Flag,std::vector<WorkItem> & Worklist)2920b57cec5SDimitry Andric void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
2930b57cec5SDimitry Andric                                       std::vector<WorkItem> &Worklist) {
2940b57cec5SDimitry Andric   InstrInfo &II = Instructions[&MI];
2950b57cec5SDimitry Andric 
2960b57cec5SDimitry Andric   assert(!(Flag & StateExact) && Flag != 0);
2970b57cec5SDimitry Andric 
2980b57cec5SDimitry Andric   // Remove any disabled states from the flag. The user that required it gets
2990b57cec5SDimitry Andric   // an undefined value in the helper lanes. For example, this can happen if
3000b57cec5SDimitry Andric   // the result of an atomic is used by instruction that requires WQM, where
3010b57cec5SDimitry Andric   // ignoring the request for WQM is correct as per the relevant specs.
3020b57cec5SDimitry Andric   Flag &= ~II.Disabled;
3030b57cec5SDimitry Andric 
3040b57cec5SDimitry Andric   // Ignore if the flag is already encompassed by the existing needs, or we
3050b57cec5SDimitry Andric   // just disabled everything.
3060b57cec5SDimitry Andric   if ((II.Needs & Flag) == Flag)
3070b57cec5SDimitry Andric     return;
3080b57cec5SDimitry Andric 
309fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
3100b57cec5SDimitry Andric   II.Needs |= Flag;
311*0fca6ea1SDimitry Andric   Worklist.emplace_back(&MI);
3120b57cec5SDimitry Andric }
3130b57cec5SDimitry Andric 
/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
///
/// Walks the LiveRange value graph backwards from the value live-in at
/// \p UseMI, marking each defining instruction with \p Flag. For virtual
/// registers, lane masks are tracked so that walking a chain stops once all
/// lanes read by the use have been defined; for physical registers the first
/// definition terminates a branch. Phi values fan out into one subgraph per
/// predecessor, managed iteratively via an explicit stack (PhiStack) rather
/// than recursion.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               Register Reg, unsigned SubReg, char Flag,
                               std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  const VNInfo *Value = UseLRQ.valueIn();
  // No live-in value at the use (e.g. undef use): nothing to mark.
  if (!Value)
    return;

  // Note: this code assumes that lane masks on AMDGPU completely
  // cover registers.
  const LaneBitmask UseLanes =
      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
                                : LaneBitmask::getNone());

  // Perform a depth-first iteration of the LiveRange graph marking defs.
  // Stop processing of a given branch when all use lanes have been defined.
  // The first definition stops processing for a physical register.
  struct PhiEntry {
    const VNInfo *Phi;        // the phi value to resume
    unsigned PredIdx;         // next predecessor index to process
    LaneBitmask DefinedLanes; // lanes defined when the phi was suspended
    PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
  };
  // A value may be revisited with a different set of already-defined lanes,
  // so the visited key includes the lane mask, not just the value.
  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
  SmallVector<PhiEntry, 2> PhiStack;
  SmallSet<VisitKey, 4> Visited;
  LaneBitmask DefinedLanes;
  unsigned NextPredIdx = 0; // Only used for processing phi nodes
  do {
    const VNInfo *NextValue = nullptr;
    const VisitKey Key(Value, DefinedLanes);

    if (Visited.insert(Key).second) {
      // On first visit to a phi then start processing first predecessor
      NextPredIdx = 0;
    }

    if (Value->isPHIDef()) {
      // Each predecessor node in the phi must be processed as a subgraph
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");

      // Find next predecessor to process
      unsigned Idx = NextPredIdx;
      auto PI = MBB->pred_begin() + Idx;
      auto PE = MBB->pred_end();
      for (; PI != PE && !NextValue; ++PI, ++Idx) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
            NextValue = VN;
        }
      }

      // If there are more predecessors to process; add phi to stack
      if (PI != PE)
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");

      if (Reg.isVirtual()) {
        // Iterate over all operands to find relevant definitions
        bool HasDef = false;
        for (const MachineOperand &Op : MI->all_defs()) {
          if (Op.getReg() != Reg)
            continue;

          // Compute lanes defined and overlap with use
          LaneBitmask OpLanes =
              Op.isUndef() ? LaneBitmask::getAll()
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
          LaneBitmask Overlap = (UseLanes & OpLanes);

          // Record if this instruction defined any of use
          HasDef |= Overlap.any();

          // Mark any lanes defined
          DefinedLanes |= OpLanes;
        }

        // Check if all lanes of use have been defined
        if ((DefinedLanes & UseLanes) != UseLanes) {
          // Definition not complete; need to process input value
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
              NextValue = VN;
          }
        }

        // Only mark the instruction if it defines some part of the use
        if (HasDef)
          markInstruction(*MI, Flag, Worklist);
      } else {
        // For physical registers simply mark the defining instruction
        markInstruction(*MI, Flag, Worklist);
      }
    }

    if (!NextValue && !PhiStack.empty()) {
      // Reach end of chain; revert to processing last phi
      PhiEntry &Entry = PhiStack.back();
      NextValue = Entry.Phi;
      NextPredIdx = Entry.PredIdx;
      DefinedLanes = Entry.DefinedLanes;
      PhiStack.pop_back();
    }

    Value = NextValue;
  } while (Value);
}
431fe6060f1SDimitry Andric 
markOperand(const MachineInstr & MI,const MachineOperand & Op,char Flag,std::vector<WorkItem> & Worklist)432fe6060f1SDimitry Andric void SIWholeQuadMode::markOperand(const MachineInstr &MI,
433fe6060f1SDimitry Andric                                   const MachineOperand &Op, char Flag,
4340b57cec5SDimitry Andric                                   std::vector<WorkItem> &Worklist) {
435fe6060f1SDimitry Andric   assert(Op.isReg());
436fe6060f1SDimitry Andric   Register Reg = Op.getReg();
437e8d8bef9SDimitry Andric 
438fe6060f1SDimitry Andric   // Ignore some hardware registers
439fe6060f1SDimitry Andric   switch (Reg) {
440fe6060f1SDimitry Andric   case AMDGPU::EXEC:
441fe6060f1SDimitry Andric   case AMDGPU::EXEC_LO:
442fe6060f1SDimitry Andric     return;
443fe6060f1SDimitry Andric   default:
444fe6060f1SDimitry Andric     break;
445fe6060f1SDimitry Andric   }
446e8d8bef9SDimitry Andric 
447fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
448fe6060f1SDimitry Andric                     << " for " << MI);
449fe6060f1SDimitry Andric   if (Reg.isVirtual()) {
450fe6060f1SDimitry Andric     LiveRange &LR = LIS->getInterval(Reg);
451fe6060f1SDimitry Andric     markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
452fe6060f1SDimitry Andric   } else {
4530b57cec5SDimitry Andric     // Handle physical registers that we need to track; this is mostly relevant
4540b57cec5SDimitry Andric     // for VCC, which can appear as the (implicit) input of a uniform branch,
4550b57cec5SDimitry Andric     // e.g. when a loop counter is stored in a VGPR.
45606c3fb27SDimitry Andric     for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
45706c3fb27SDimitry Andric       LiveRange &LR = LIS->getRegUnit(Unit);
4580b57cec5SDimitry Andric       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
459*0fca6ea1SDimitry Andric       if (Value)
46006c3fb27SDimitry Andric         markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
461e8d8bef9SDimitry Andric     }
4620b57cec5SDimitry Andric   }
463fe6060f1SDimitry Andric }
4640b57cec5SDimitry Andric 
465fe6060f1SDimitry Andric /// Mark all instructions defining the uses in \p MI with \p Flag.
markInstructionUses(const MachineInstr & MI,char Flag,std::vector<WorkItem> & Worklist)466fe6060f1SDimitry Andric void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
467fe6060f1SDimitry Andric                                           std::vector<WorkItem> &Worklist) {
468fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
469fe6060f1SDimitry Andric                     << MI);
470fe6060f1SDimitry Andric 
47106c3fb27SDimitry Andric   for (const MachineOperand &Use : MI.all_uses())
472fe6060f1SDimitry Andric     markOperand(MI, Use, Flag, Worklist);
4730b57cec5SDimitry Andric }
4740b57cec5SDimitry Andric 
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
//
// Returns the union of all state flags (StateWQM / StateExact / StateStrict*)
// seeded anywhere in the function, so the caller can skip rewriting entirely
// when nothing WQM-related was requested. Also populates the side lists
// (LowerToCopyInstrs, LowerToMovInstrs, LiveMaskQueries, KillInstrs,
// InitExecInstrs) consumed by later lowering phases.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;
  // Only pixel shaders have implicit (quad-based) derivatives.
  bool HasImplicitDerivatives =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (MachineBasicBlock *MBB : RPOT) {
    BlockInfo &BBI = Blocks[MBB];

    for (MachineInstr &MI : *MBB) {
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // If LOD is not supported WQM is not needed.
        // Only generate implicit WQM if implicit derivatives are required.
        // This avoids inserting unintended WQM if a shader type without
        // implicit derivatives uses an image sampling instruction.
        if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
          // Sampling instructions don't need to produce results for all pixels
          // in a quad, they just require all inputs of a quad to have been
          // computed for derivatives.
          markInstructionUses(MI, StateWQM, Worklist);
          GlobalFlags |= StateWQM;
        }
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        // SOFT_WQM only needs WQM if WQM is used anywhere else in the
        // function; record it and decide after the scan (see end of function).
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
        // it needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::STRICT_WQM ||
                 TII->isDualSourceBlendEXP(MI)) {
        // STRICT_WQM is similar to STRICTWWM, but instead of enabling all
        // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in
        // quads that have at least one active thread.
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;

        if (Opcode == AMDGPU::STRICT_WQM) {
          LowerToMovInstrs.push_back(&MI);
        } else {
          // Dual source blend export acts as implicit strict-wqm, its sources
          // need to be shuffled in strict wqm, but the export itself needs to
          // run in exact mode.
          BBI.Needs |= StateExact;
          if (!(BBI.InNeeds & StateExact)) {
            BBI.InNeeds |= StateExact;
            Worklist.emplace_back(MBB);
          }
          GlobalFlags |= StateExact;
          III.Disabled = StateWQM | StateStrict;
        }
      } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
                 Opcode == AMDGPU::DS_PARAM_LOAD ||
                 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
                 Opcode == AMDGPU::DS_DIRECT_LOAD) {
        // Mark these STRICTWQM, but only for the instruction, not its operands.
        // This avoids unnecessarily marking M0 as requiring WQM.
        III.Needs |= StateStrictWQM;
        GlobalFlags |= StateStrictWQM;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        // Disable strict states for the instruction itself; the value for
        // inactive lanes (operand 2) is handled below.
        III.Disabled = StateStrict;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            // Inactive lanes carry no meaningful value; a plain copy suffices.
            LowerToCopyInstrs.push_back(&MI);
          } else {
            // The inactive-lane input must be computed in StrictWWM so it is
            // valid for all lanes of the wave.
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
          }
        }
        SetInactiveInstrs.push_back(&MI);
      } else if (TII->isDisableWQM(MI)) {
        // Stores/atomics with side effects must run Exact so inactive lanes
        // do not write memory.
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.emplace_back(MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
      } else if (Opcode == AMDGPU::SI_PS_LIVE ||
                 Opcode == AMDGPU::SI_LIVE_MASK) {
        LiveMaskQueries.push_back(&MI);
      } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                 Opcode == AMDGPU::SI_DEMOTE_I1) {
        KillInstrs.push_back(&MI);
        BBI.NeedsLowering = true;
      } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
        InitExecInstrs.push_back(&MI);
      } else if (WQMOutputs) {
        // The function is in machine SSA form, which means that physical
        // VGPRs correspond to shader inputs and outputs. Inputs are
        // only used, outputs are only defined.
        // FIXME: is this still valid?
        for (const MachineOperand &MO : MI.defs()) {
          Register Reg = MO.getReg();
          if (Reg.isPhysical() &&
              TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
            Flags = StateWQM;
            break;
          }
        }
      }

      if (Flags) {
        markInstruction(MI, Flags, Worklist);
        GlobalFlags |= Flags;
      }
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}
6220b57cec5SDimitry Andric 
// Propagate one instruction's state requirements: up to the containing
// block, backwards to the preceding instruction, and into the instruction's
// own uses. Re-queues any item whose recorded needs grew.
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    // Update both the map entry and the local copy so the logic below sees
    // the upgraded requirement.
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.emplace_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    // Strict states do not propagate backwards; only WQM/Exact flow upstream.
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.emplace_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
  // not require any WQM transitions.
  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
}
6710b57cec5SDimitry Andric 
propagateBlock(MachineBasicBlock & MBB,std::vector<WorkItem> & Worklist)6720b57cec5SDimitry Andric void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
6730b57cec5SDimitry Andric                                      std::vector<WorkItem>& Worklist) {
6740b57cec5SDimitry Andric   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
6750b57cec5SDimitry Andric 
6760b57cec5SDimitry Andric   // Propagate through instructions
6770b57cec5SDimitry Andric   if (!MBB.empty()) {
6780b57cec5SDimitry Andric     MachineInstr *LastMI = &*MBB.rbegin();
6790b57cec5SDimitry Andric     InstrInfo &LastII = Instructions[LastMI];
6800b57cec5SDimitry Andric     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
6810b57cec5SDimitry Andric       LastII.OutNeeds |= BI.OutNeeds;
682*0fca6ea1SDimitry Andric       Worklist.emplace_back(LastMI);
6830b57cec5SDimitry Andric     }
6840b57cec5SDimitry Andric   }
6850b57cec5SDimitry Andric 
6860b57cec5SDimitry Andric   // Predecessor blocks must provide for our WQM/Exact needs.
6870b57cec5SDimitry Andric   for (MachineBasicBlock *Pred : MBB.predecessors()) {
6880b57cec5SDimitry Andric     BlockInfo &PredBI = Blocks[Pred];
6890b57cec5SDimitry Andric     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
6900b57cec5SDimitry Andric       continue;
6910b57cec5SDimitry Andric 
6920b57cec5SDimitry Andric     PredBI.OutNeeds |= BI.InNeeds;
6930b57cec5SDimitry Andric     PredBI.InNeeds |= BI.InNeeds;
694*0fca6ea1SDimitry Andric     Worklist.emplace_back(Pred);
6950b57cec5SDimitry Andric   }
6960b57cec5SDimitry Andric 
6970b57cec5SDimitry Andric   // All successors must be prepared to accept the same set of WQM/Exact data.
6980b57cec5SDimitry Andric   for (MachineBasicBlock *Succ : MBB.successors()) {
6990b57cec5SDimitry Andric     BlockInfo &SuccBI = Blocks[Succ];
7000b57cec5SDimitry Andric     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
7010b57cec5SDimitry Andric       continue;
7020b57cec5SDimitry Andric 
7030b57cec5SDimitry Andric     SuccBI.InNeeds |= BI.OutNeeds;
704*0fca6ea1SDimitry Andric     Worklist.emplace_back(Succ);
7050b57cec5SDimitry Andric   }
7060b57cec5SDimitry Andric }
7070b57cec5SDimitry Andric 
analyzeFunction(MachineFunction & MF)7080b57cec5SDimitry Andric char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
7090b57cec5SDimitry Andric   std::vector<WorkItem> Worklist;
7100b57cec5SDimitry Andric   char GlobalFlags = scanInstructions(MF, Worklist);
7110b57cec5SDimitry Andric 
7120b57cec5SDimitry Andric   while (!Worklist.empty()) {
7130b57cec5SDimitry Andric     WorkItem WI = Worklist.back();
7140b57cec5SDimitry Andric     Worklist.pop_back();
7150b57cec5SDimitry Andric 
7160b57cec5SDimitry Andric     if (WI.MI)
7170b57cec5SDimitry Andric       propagateInstruction(*WI.MI, Worklist);
7180b57cec5SDimitry Andric     else
7190b57cec5SDimitry Andric       propagateBlock(*WI.MBB, Worklist);
7200b57cec5SDimitry Andric   }
7210b57cec5SDimitry Andric 
7220b57cec5SDimitry Andric   return GlobalFlags;
7230b57cec5SDimitry Andric }
7240b57cec5SDimitry Andric 
// Preserve SCC across code that will be inserted at \p Before: copy SCC into
// a fresh SGPR ahead of the insertion point and copy it back just before
// \p Before. Returns the restoring copy, i.e. the iterator before which the
// SCC-clobbering code must be inserted.
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  // Register both copies with the live-interval analysis and compute the
  // interval of the temporary.
  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}
7430b57cec5SDimitry Andric 
splitBlock(MachineBasicBlock * BB,MachineInstr * TermMI)744fe6060f1SDimitry Andric MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
745fe6060f1SDimitry Andric                                                MachineInstr *TermMI) {
746fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
747fe6060f1SDimitry Andric                     << *TermMI << "\n");
748fe6060f1SDimitry Andric 
749fe6060f1SDimitry Andric   MachineBasicBlock *SplitBB =
750fe6060f1SDimitry Andric       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
751fe6060f1SDimitry Andric 
752fe6060f1SDimitry Andric   // Convert last instruction in block to a terminator.
753fe6060f1SDimitry Andric   // Note: this only covers the expected patterns
754fe6060f1SDimitry Andric   unsigned NewOpcode = 0;
755fe6060f1SDimitry Andric   switch (TermMI->getOpcode()) {
756fe6060f1SDimitry Andric   case AMDGPU::S_AND_B32:
757fe6060f1SDimitry Andric     NewOpcode = AMDGPU::S_AND_B32_term;
758fe6060f1SDimitry Andric     break;
759fe6060f1SDimitry Andric   case AMDGPU::S_AND_B64:
760fe6060f1SDimitry Andric     NewOpcode = AMDGPU::S_AND_B64_term;
761fe6060f1SDimitry Andric     break;
762fe6060f1SDimitry Andric   case AMDGPU::S_MOV_B32:
763fe6060f1SDimitry Andric     NewOpcode = AMDGPU::S_MOV_B32_term;
764fe6060f1SDimitry Andric     break;
765fe6060f1SDimitry Andric   case AMDGPU::S_MOV_B64:
766fe6060f1SDimitry Andric     NewOpcode = AMDGPU::S_MOV_B64_term;
767fe6060f1SDimitry Andric     break;
768fe6060f1SDimitry Andric   default:
769fe6060f1SDimitry Andric     break;
770fe6060f1SDimitry Andric   }
771fe6060f1SDimitry Andric   if (NewOpcode)
772fe6060f1SDimitry Andric     TermMI->setDesc(TII->get(NewOpcode));
773fe6060f1SDimitry Andric 
774fe6060f1SDimitry Andric   if (SplitBB != BB) {
775fe6060f1SDimitry Andric     // Update dominator trees
776fe6060f1SDimitry Andric     using DomTreeT = DomTreeBase<MachineBasicBlock>;
777fe6060f1SDimitry Andric     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
778fe6060f1SDimitry Andric     for (MachineBasicBlock *Succ : SplitBB->successors()) {
779fe6060f1SDimitry Andric       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
780fe6060f1SDimitry Andric       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
781fe6060f1SDimitry Andric     }
782fe6060f1SDimitry Andric     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
783fe6060f1SDimitry Andric     if (MDT)
784fe6060f1SDimitry Andric       MDT->getBase().applyUpdates(DTUpdates);
785fe6060f1SDimitry Andric     if (PDT)
786*0fca6ea1SDimitry Andric       PDT->applyUpdates(DTUpdates);
787fe6060f1SDimitry Andric 
788fe6060f1SDimitry Andric     // Link blocks
789fe6060f1SDimitry Andric     MachineInstr *MI =
790fe6060f1SDimitry Andric         BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
791fe6060f1SDimitry Andric             .addMBB(SplitBB);
792fe6060f1SDimitry Andric     LIS->InsertMachineInstrInMaps(*MI);
793fe6060f1SDimitry Andric   }
794fe6060f1SDimitry Andric 
795fe6060f1SDimitry Andric   return SplitBB;
796fe6060f1SDimitry Andric }
797fe6060f1SDimitry Andric 
// Lower SI_KILL_F32_COND_IMM_TERMINATOR: emit a V_CMP computing the *killed*
// lanes, remove them from the live mask and from EXEC, insert the
// early-terminate marker, and end the block with an explicit branch.
// Returns the new terminator.
MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
                                            MachineInstr &MI) {
  assert(LiveMaskReg.isVirtual());

  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Opcode = 0;

  assert(MI.getOperand(0).isReg());

  // Comparison is for live lanes; however here we compute the inverse
  // (killed lanes).  This is because VCMP will always generate 0 bits
  // for inactive lanes so a mask of live lanes would not be correct
  // inside control flow.
  // Invert the comparison by swapping the operands and adjusting
  // the comparison codes.

  switch (MI.getOperand(2).getImm()) {
  case ISD::SETUEQ:
    Opcode = AMDGPU::V_CMP_LG_F32_e64;
    break;
  case ISD::SETUGT:
    Opcode = AMDGPU::V_CMP_GE_F32_e64;
    break;
  case ISD::SETUGE:
    Opcode = AMDGPU::V_CMP_GT_F32_e64;
    break;
  case ISD::SETULT:
    Opcode = AMDGPU::V_CMP_LE_F32_e64;
    break;
  case ISD::SETULE:
    Opcode = AMDGPU::V_CMP_LT_F32_e64;
    break;
  case ISD::SETUNE:
    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    break;
  case ISD::SETO:
    Opcode = AMDGPU::V_CMP_O_F32_e64;
    break;
  case ISD::SETUO:
    Opcode = AMDGPU::V_CMP_U_F32_e64;
    break;
  case ISD::SETOEQ:
  case ISD::SETEQ:
    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    break;
  case ISD::SETOGT:
  case ISD::SETGT:
    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    break;
  case ISD::SETOGE:
  case ISD::SETGE:
    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    break;
  case ISD::SETOLT:
  case ISD::SETLT:
    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    break;
  case ISD::SETOLE:
  case ISD::SETLE:
    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    break;
  case ISD::SETONE:
  case ISD::SETNE:
    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
    break;
  default:
    llvm_unreachable("invalid ISD:SET cond code");
  }

  // Pick opcode based on comparison type.
  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);

  // VCC represents lanes killed.
  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  if (TRI->isVGPR(*MRI, Op0.getReg())) {
    // VGPR source: the VOP2 (e32) encoding can be used with swapped operands.
    Opcode = AMDGPU::getVOPe32(Opcode);
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
  } else {
    // Otherwise use the VOP3 (e64) encoding with explicit modifier operands.
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(VCC, RegState::Define)
                 .addImm(0) // src0 modifiers
                 .add(Op1)
                 .addImm(0) // src1 modifiers
                 .add(Op0)
                 .addImm(0); // omod
  }

  // Clear the newly killed lanes from the live mask.
  MachineInstr *MaskUpdateMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
          .addReg(LiveMaskReg)
          .addReg(VCC);

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // Deactivate the killed lanes in EXEC.
  MachineInstr *ExecMaskMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);

  assert(MBB.succ_size() == 1);
  MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                              .addMBB(*MBB.succ_begin());

  // Update live intervals
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);

  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  return NewTerm;
}
916fe6060f1SDimitry Andric 
// Lower SI_KILL_I1_TERMINATOR / SI_DEMOTE_I1: update the live mask from the
// boolean condition (which may be a constant), insert the early-terminate
// marker, and update EXEC. For demote in WQM, helper quads are kept alive by
// applying WQM to the live mask before masking EXEC. Returns the new
// terminator, or nullptr when the kill was a statically-known no-op demote.
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
                                           MachineInstr &MI, bool IsWQM) {
  assert(LiveMaskReg.isVirtual());

  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
    } else {
      // Static: kill does nothing
      MachineInstr *NewTerm = nullptr;
      if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
        // A no-op demote can simply be deleted.
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        // A no-op kill terminator still ends the block; replace it with an
        // unconditional branch to the single successor.
        assert(MBB.succ_size() == 1);
        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                      .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return NewTerm;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // In the case we got this far some lanes are still live,
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demote - deactivate quads with only helper lanes
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
  } else {
    // Kill - deactivate lanes no longer in live mask
    if (Op.isImm()) {
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(EarlyTermMI);
  assert(MaskUpdateMI);
  assert(NewTerm);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  // Recompute intervals for any registers whose def/use set changed.
  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}
1029fe6060f1SDimitry Andric 
1030fe6060f1SDimitry Andric // Replace (or supplement) instructions accessing live mask.
1031fe6060f1SDimitry Andric // This can only happen once all the live mask registers have been created
1032fe6060f1SDimitry Andric // and the execute state (WQM/StrictWWM/Exact) of instructions is known.
lowerBlock(MachineBasicBlock & MBB)1033fe6060f1SDimitry Andric void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1034fe6060f1SDimitry Andric   auto BII = Blocks.find(&MBB);
1035fe6060f1SDimitry Andric   if (BII == Blocks.end())
1036fe6060f1SDimitry Andric     return;
1037fe6060f1SDimitry Andric 
1038fe6060f1SDimitry Andric   const BlockInfo &BI = BII->second;
1039fe6060f1SDimitry Andric   if (!BI.NeedsLowering)
1040fe6060f1SDimitry Andric     return;
1041fe6060f1SDimitry Andric 
1042fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1043fe6060f1SDimitry Andric 
1044fe6060f1SDimitry Andric   SmallVector<MachineInstr *, 4> SplitPoints;
1045fe6060f1SDimitry Andric   char State = BI.InitialState;
1046fe6060f1SDimitry Andric 
1047349cc55cSDimitry Andric   for (MachineInstr &MI : llvm::make_early_inc_range(
1048349cc55cSDimitry Andric            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1049fe6060f1SDimitry Andric     if (StateTransition.count(&MI))
1050fe6060f1SDimitry Andric       State = StateTransition[&MI];
1051fe6060f1SDimitry Andric 
1052fe6060f1SDimitry Andric     MachineInstr *SplitPoint = nullptr;
1053fe6060f1SDimitry Andric     switch (MI.getOpcode()) {
1054fe6060f1SDimitry Andric     case AMDGPU::SI_DEMOTE_I1:
1055fe6060f1SDimitry Andric     case AMDGPU::SI_KILL_I1_TERMINATOR:
1056fe6060f1SDimitry Andric       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1057fe6060f1SDimitry Andric       break;
1058fe6060f1SDimitry Andric     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1059fe6060f1SDimitry Andric       SplitPoint = lowerKillF32(MBB, MI);
1060fe6060f1SDimitry Andric       break;
1061fe6060f1SDimitry Andric     default:
1062fe6060f1SDimitry Andric       break;
1063fe6060f1SDimitry Andric     }
1064fe6060f1SDimitry Andric     if (SplitPoint)
1065fe6060f1SDimitry Andric       SplitPoints.push_back(SplitPoint);
1066fe6060f1SDimitry Andric   }
1067fe6060f1SDimitry Andric 
1068fe6060f1SDimitry Andric   // Perform splitting after instruction scan to simplify iteration.
1069fe6060f1SDimitry Andric   if (!SplitPoints.empty()) {
1070fe6060f1SDimitry Andric     MachineBasicBlock *BB = &MBB;
1071fe6060f1SDimitry Andric     for (MachineInstr *MI : SplitPoints) {
1072fe6060f1SDimitry Andric       BB = splitBlock(BB, MI);
1073fe6060f1SDimitry Andric     }
1074fe6060f1SDimitry Andric   }
1075fe6060f1SDimitry Andric }
1076fe6060f1SDimitry Andric 
// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
//
// When SaveSCC is set, the chosen point avoids splitting a live segment of
// SCC where possible; if no such point exists within the range, SCC is
// explicitly saved/restored around the insertion point via saveSCC().
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  // Without the SCC constraint either end of the range works directly.
  if (!SaveSCC)
    return PreferLast ? Last : First;

  // Live range of the (single) register unit backing SCC.
  LiveRange &LR =
      LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
  auto MBBE = MBB.end();
  // An end() iterator maps to the block-end slot index.
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  // While Idx falls inside a live segment of SCC, step it toward the
  // preferred end of [FirstIdx, LastIdx] until it escapes the segment or
  // would leave the allowed range.
  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      // Try just before the segment starts.
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      // Try just after the instruction where the segment ends.
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  // Convert the final slot index back to an iterator; a dead index must be
  // the block end.
  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->all_defs()) {
      IsExecDef |=
          MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
    }
    if (!IsExecDef)
      break;
    MBBI++;
    // Once we skip past an EXEC def, SCC no longer needs saving here.
    S = nullptr;
  }

  // Still inside a live SCC segment: save/restore SCC around the point.
  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}
11480b57cec5SDimitry Andric 
toExact(MachineBasicBlock & MBB,MachineBasicBlock::iterator Before,Register SaveWQM)11490b57cec5SDimitry Andric void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
11500b57cec5SDimitry Andric                               MachineBasicBlock::iterator Before,
1151fe6060f1SDimitry Andric                               Register SaveWQM) {
1152*0fca6ea1SDimitry Andric   assert(LiveMaskReg.isVirtual());
1153*0fca6ea1SDimitry Andric 
115406c3fb27SDimitry Andric   bool IsTerminator = Before == MBB.end();
115506c3fb27SDimitry Andric   if (!IsTerminator) {
115606c3fb27SDimitry Andric     auto FirstTerm = MBB.getFirstTerminator();
115706c3fb27SDimitry Andric     if (FirstTerm != MBB.end()) {
115806c3fb27SDimitry Andric       SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
115906c3fb27SDimitry Andric       SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
116006c3fb27SDimitry Andric       IsTerminator = BeforeIdx > FirstTermIdx;
116106c3fb27SDimitry Andric     }
116206c3fb27SDimitry Andric   }
116306c3fb27SDimitry Andric 
11640b57cec5SDimitry Andric   MachineInstr *MI;
11650b57cec5SDimitry Andric 
11660b57cec5SDimitry Andric   if (SaveWQM) {
116706c3fb27SDimitry Andric     unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
116806c3fb27SDimitry Andric     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
11690b57cec5SDimitry Andric              .addReg(LiveMaskReg);
11700b57cec5SDimitry Andric   } else {
117106c3fb27SDimitry Andric     unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
117206c3fb27SDimitry Andric     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
11730b57cec5SDimitry Andric              .addReg(Exec)
11740b57cec5SDimitry Andric              .addReg(LiveMaskReg);
11750b57cec5SDimitry Andric   }
11760b57cec5SDimitry Andric 
11770b57cec5SDimitry Andric   LIS->InsertMachineInstrInMaps(*MI);
1178fe6060f1SDimitry Andric   StateTransition[MI] = StateExact;
11790b57cec5SDimitry Andric }
11800b57cec5SDimitry Andric 
toWQM(MachineBasicBlock & MBB,MachineBasicBlock::iterator Before,Register SavedWQM)11810b57cec5SDimitry Andric void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
11820b57cec5SDimitry Andric                             MachineBasicBlock::iterator Before,
1183fe6060f1SDimitry Andric                             Register SavedWQM) {
11840b57cec5SDimitry Andric   MachineInstr *MI;
11850b57cec5SDimitry Andric 
11860b57cec5SDimitry Andric   if (SavedWQM) {
11870b57cec5SDimitry Andric     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
11880b57cec5SDimitry Andric              .addReg(SavedWQM);
11890b57cec5SDimitry Andric   } else {
1190fe6060f1SDimitry Andric     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
11910b57cec5SDimitry Andric   }
11920b57cec5SDimitry Andric 
11930b57cec5SDimitry Andric   LIS->InsertMachineInstrInMaps(*MI);
1194fe6060f1SDimitry Andric   StateTransition[MI] = StateWQM;
11950b57cec5SDimitry Andric }
11960b57cec5SDimitry Andric 
toStrictMode(MachineBasicBlock & MBB,MachineBasicBlock::iterator Before,Register SaveOrig,char StrictStateNeeded)1197fe6060f1SDimitry Andric void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
11980b57cec5SDimitry Andric                                    MachineBasicBlock::iterator Before,
1199fe6060f1SDimitry Andric                                    Register SaveOrig, char StrictStateNeeded) {
12000b57cec5SDimitry Andric   MachineInstr *MI;
12010b57cec5SDimitry Andric   assert(SaveOrig);
1202fe6060f1SDimitry Andric   assert(StrictStateNeeded == StateStrictWWM ||
1203fe6060f1SDimitry Andric          StrictStateNeeded == StateStrictWQM);
1204fe6060f1SDimitry Andric 
1205fe6060f1SDimitry Andric   if (StrictStateNeeded == StateStrictWWM) {
1206fe6060f1SDimitry Andric     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1207fe6060f1SDimitry Andric                  SaveOrig)
12080b57cec5SDimitry Andric              .addImm(-1);
1209fe6060f1SDimitry Andric   } else {
1210fe6060f1SDimitry Andric     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1211fe6060f1SDimitry Andric                  SaveOrig)
1212fe6060f1SDimitry Andric              .addImm(-1);
1213fe6060f1SDimitry Andric   }
12140b57cec5SDimitry Andric   LIS->InsertMachineInstrInMaps(*MI);
1215bdd1243dSDimitry Andric   StateTransition[MI] = StrictStateNeeded;
12160b57cec5SDimitry Andric }
12170b57cec5SDimitry Andric 
fromStrictMode(MachineBasicBlock & MBB,MachineBasicBlock::iterator Before,Register SavedOrig,char NonStrictState,char CurrentStrictState)1218fe6060f1SDimitry Andric void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
12190b57cec5SDimitry Andric                                      MachineBasicBlock::iterator Before,
1220fe6060f1SDimitry Andric                                      Register SavedOrig, char NonStrictState,
1221fe6060f1SDimitry Andric                                      char CurrentStrictState) {
12220b57cec5SDimitry Andric   MachineInstr *MI;
12230b57cec5SDimitry Andric 
12240b57cec5SDimitry Andric   assert(SavedOrig);
1225fe6060f1SDimitry Andric   assert(CurrentStrictState == StateStrictWWM ||
1226fe6060f1SDimitry Andric          CurrentStrictState == StateStrictWQM);
1227fe6060f1SDimitry Andric 
1228fe6060f1SDimitry Andric   if (CurrentStrictState == StateStrictWWM) {
1229fe6060f1SDimitry Andric     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1230fe6060f1SDimitry Andric                  Exec)
12310b57cec5SDimitry Andric              .addReg(SavedOrig);
1232fe6060f1SDimitry Andric   } else {
1233fe6060f1SDimitry Andric     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1234fe6060f1SDimitry Andric                  Exec)
1235fe6060f1SDimitry Andric              .addReg(SavedOrig);
1236fe6060f1SDimitry Andric   }
12370b57cec5SDimitry Andric   LIS->InsertMachineInstrInMaps(*MI);
1238fe6060f1SDimitry Andric   StateTransition[MI] = NonStrictState;
12390b57cec5SDimitry Andric }
12400b57cec5SDimitry Andric 
// Walk one basic block and insert the exec-mask transitions (Exact, WQM,
// StrictWWM, StrictWQM) required by the per-instruction needs collected in
// Instructions/Blocks. Also records the block's initial state in BlockInfo
// for use by lowerBlock().
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  Register SavedWQMReg;         // exec saved when leaving WQM (restored on re-entry)
  Register SavedNonStrictReg;   // exec saved when entering a strict mode
  bool WQMFromExec = IsEntry;   // in the entry block WQM is derived from exec itself
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;      // state to return to when leaving strict mode
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY &&
        II->getOperand(1).getReg() == TRI->getExec())
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from Strict
  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
  // be safe to switch to/from WQM as well.
  MachineBasicBlock::iterator FirstStrict = IE;

  // Record initial state in block information.
  BI.InitialState = State;

  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstStrict == IE)
      FirstStrict = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          // Strict requirements take priority, then WQM, otherwise any
          // state not explicitly disabled for this instruction.
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      // Exact mode exit can occur in terminators, but must be before branches.
      if (MI.isBranch() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from Strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC
        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
        // Exact/Strict -> Exact: no save
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
        break;
      }
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);

      // Leave strict mode first (if active) before any other transition.
      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;

      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          // Only save the WQM exec if it will be needed again later in the
          // block; otherwise it can be recomputed.
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, but we
          // shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    // Reset the safe-transition markers whenever the current instruction
    // constrains the allowed states.
    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  // All saved exec masks must have been consumed by the end of the block.
  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}
14320b57cec5SDimitry Andric 
lowerLiveMaskQueries()1433*0fca6ea1SDimitry Andric bool SIWholeQuadMode::lowerLiveMaskQueries() {
14340b57cec5SDimitry Andric   for (MachineInstr *MI : LiveMaskQueries) {
14350b57cec5SDimitry Andric     const DebugLoc &DL = MI->getDebugLoc();
14368bcb0991SDimitry Andric     Register Dest = MI->getOperand(0).getReg();
1437e8d8bef9SDimitry Andric 
14380b57cec5SDimitry Andric     MachineInstr *Copy =
14390b57cec5SDimitry Andric         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
14400b57cec5SDimitry Andric             .addReg(LiveMaskReg);
14410b57cec5SDimitry Andric 
14420b57cec5SDimitry Andric     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
14430b57cec5SDimitry Andric     MI->eraseFromParent();
14440b57cec5SDimitry Andric   }
1445*0fca6ea1SDimitry Andric   return !LiveMaskQueries.empty();
14460b57cec5SDimitry Andric }
14470b57cec5SDimitry Andric 
// Lower the recorded WQM/WWM copy-like pseudos into plain moves or copies.
// LowerToMovInstrs become real mov instructions (VGPR) or simplified copies
// (SGPR); LowerToCopyInstrs become COPY/mov. Returns true if anything was
// lowered.
bool SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();

    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec (like all VALU movs
      // should do).
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        // Recompute the interval after clearing the flag so LIS stays
        // consistent with the changed operand.
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      // Strip every implicit EXEC use; indices shift after removal, so
      // re-query each time.
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // the only reason we should be here is V_SET_INACTIVE has
      // an undef input so it is being replaced by a simple copy.
      // There should be a second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->removeOperand(2);
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    // Register source -> COPY; immediate source needs a real mov of the
    // destination's register class.
    unsigned CopyOp = MI->getOperand(1).isReg()
                          ? (unsigned)AMDGPU::COPY
                          : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                *MRI, MI->getOperand(0)));
    MI->setDesc(TII->get(CopyOp));
  }
  return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
}
15050b57cec5SDimitry Andric 
lowerKillInstrs(bool IsWQM)1506*0fca6ea1SDimitry Andric bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1507fe6060f1SDimitry Andric   for (MachineInstr *MI : KillInstrs) {
1508fe6060f1SDimitry Andric     MachineBasicBlock *MBB = MI->getParent();
1509fe6060f1SDimitry Andric     MachineInstr *SplitPoint = nullptr;
1510fe6060f1SDimitry Andric     switch (MI->getOpcode()) {
1511fe6060f1SDimitry Andric     case AMDGPU::SI_DEMOTE_I1:
1512fe6060f1SDimitry Andric     case AMDGPU::SI_KILL_I1_TERMINATOR:
1513fe6060f1SDimitry Andric       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1514fe6060f1SDimitry Andric       break;
1515fe6060f1SDimitry Andric     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1516fe6060f1SDimitry Andric       SplitPoint = lowerKillF32(*MBB, *MI);
1517fe6060f1SDimitry Andric       break;
1518fe6060f1SDimitry Andric     }
1519fe6060f1SDimitry Andric     if (SplitPoint)
1520fe6060f1SDimitry Andric       splitBlock(MBB, SplitPoint);
1521fe6060f1SDimitry Andric   }
1522*0fca6ea1SDimitry Andric   return !KillInstrs.empty();
1523*0fca6ea1SDimitry Andric }
1524*0fca6ea1SDimitry Andric 
/// Lower a single SI_INIT_EXEC or SI_INIT_EXEC_FROM_INPUT pseudo by
/// materializing the EXEC mask set-up at the beginning of \p MI's block,
/// then erasing \p MI. Keeps LiveIntervals (when available) in sync with
/// every instruction inserted, moved, or removed.
void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();
  bool IsWave32 = ST->isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // Immediate form: EXEC := literal. Emit the S_MOV at the very top of the
    // block; this should be before all vector instructions.
    MachineInstr *InitMI =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
                Exec)
            .addImm(MI.getOperand(0).getImm());
    if (LIS) {
      // Swap the pseudo for the real instruction in the slot index maps
      // before erasing it, so the analysis never sees an unmapped erase.
      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*InitMI);
    }
    MI.eraseFromParent();
    return;
  }

  // SI_INIT_EXEC_FROM_INPUT form:
  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    // The input is expected to be produced by a COPY (asserted below);
    // make sure its definition dominates the sequence we are about to
    // insert at the block start.
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If the `InputReg` is defined in current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If first instruction is definition then move pointer after it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST->getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  // BFE encoding packs {offset, width} into one immediate; 0x70000 sets a
  // width of 7 bits, enough to represent counts up to the wavefront size.
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  // count == WavefrontSize is the case BFM cannot express; CMOV overrides
  // EXEC with all-ones when SCC is set by the compare.
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    // No liveness analysis to maintain; just drop the pseudo.
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  // The input register gained a new use (the BFE) and possibly moved its
  // def; recompute its interval from scratch, and compute one for the new
  // count register.
  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}
1610*0fca6ea1SDimitry Andric 
1611*0fca6ea1SDimitry Andric /// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
1612*0fca6ea1SDimitry Andric /// for instructions that depend on EXEC.
1613*0fca6ea1SDimitry Andric MachineBasicBlock::iterator
lowerInitExecInstrs(MachineBasicBlock & Entry,bool & Changed)1614*0fca6ea1SDimitry Andric SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
1615*0fca6ea1SDimitry Andric   MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
1616*0fca6ea1SDimitry Andric 
1617*0fca6ea1SDimitry Andric   for (MachineInstr *MI : InitExecInstrs) {
1618*0fca6ea1SDimitry Andric     // Try to handle undefined cases gracefully:
1619*0fca6ea1SDimitry Andric     // - multiple INIT_EXEC instructions
1620*0fca6ea1SDimitry Andric     // - INIT_EXEC instructions not in the entry block
1621*0fca6ea1SDimitry Andric     if (MI->getParent() == &Entry)
1622*0fca6ea1SDimitry Andric       InsertPt = std::next(MI->getIterator());
1623*0fca6ea1SDimitry Andric 
1624*0fca6ea1SDimitry Andric     lowerInitExec(*MI);
1625*0fca6ea1SDimitry Andric     Changed = true;
1626*0fca6ea1SDimitry Andric   }
1627*0fca6ea1SDimitry Andric 
1628*0fca6ea1SDimitry Andric   return InsertPt;
1629fe6060f1SDimitry Andric }
1630fe6060f1SDimitry Andric 
/// Pass entry point: analyze the function's WQM/WWM/Exact requirements and
/// rewrite it accordingly. Returns true if the function was modified.
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  // Reset all per-function state; the pass object is reused across functions.
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  InitExecInstrs.clear();
  StateTransition.clear();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  // Dominator trees are optional; lowering falls back when they are absent.
  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
  MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
  auto *PDTWrapper =
      getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
  PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;

  // Select the wave-size-specific opcodes and EXEC register once up front.
  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndTermOpc = AMDGPU::S_AND_B32_term;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndTermOpc = AMDGPU::S_AND_B64_term;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  // Scan the function and classify every instruction/block by the execution
  // states (Exact/WQM/strict modes) it needs.
  const char GlobalFlags = analyzeFunction(MF);
  bool Changed = false;

  // Until a copy is made below, the live mask is simply EXEC itself.
  LiveMaskReg = Exec;

  MachineBasicBlock &Entry = MF.front();
  // INIT_EXEC pseudos must be lowered first; EntryMI is the earliest point
  // in the entry block where EXEC already has its final initial value.
  MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);

  // Store a copy of the original live mask when required
  const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
  const bool HasWaveModes = GlobalFlags & ~StateExact;
  const bool HasKills = !KillInstrs.empty();
  const bool UsesWQM = GlobalFlags & StateWQM;
  if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    Changed = true;
  }

  LLVM_DEBUG(printInfo());

  Changed |= lowerLiveMaskQueries();
  Changed |= lowerCopyInstrs();

  if (!HasWaveModes) {
    // No wave mode execution
    Changed |= lowerKillInstrs(false);
  } else if (GlobalFlags == StateWQM) {
    // Shader only needs WQM: enter it once in the prolog and stay there;
    // no per-block state switching is required.
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
    Changed = true;
  } else {
    // Wave mode switching requires full lowering pass.
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
    Changed = true;
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);

  // If we performed any kills then recompute EXEC
  if (!KillInstrs.empty() || !InitExecInstrs.empty())
    LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);

  return Changed;
}
1739