10b57cec5SDimitry Andric //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric /// \file
10fe6060f1SDimitry Andric /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11fe6060f1SDimitry Andric /// for pixel shaders, and strict whole wavefront mode for all programs.
12fe6060f1SDimitry Andric ///
13fe6060f1SDimitry Andric /// The "strict" prefix indicates that inactive lanes do not take part in
14fe6060f1SDimitry Andric /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
15fe6060f1SDimitry Andric /// always be enabled irrespective of control flow decisions. Conversely in
16fe6060f1SDimitry Andric /// non-strict WQM inactive lanes may control flow decisions.
170b57cec5SDimitry Andric ///
180b57cec5SDimitry Andric /// Whole quad mode is required for derivative computations, but it interferes
19fe6060f1SDimitry Andric /// with shader side effects (stores and atomics). It ensures that WQM is
20fe6060f1SDimitry Andric /// enabled when necessary, but disabled around stores and atomics.
210b57cec5SDimitry Andric ///
220b57cec5SDimitry Andric /// When necessary, this pass creates a function prolog
230b57cec5SDimitry Andric ///
240b57cec5SDimitry Andric /// S_MOV_B64 LiveMask, EXEC
250b57cec5SDimitry Andric /// S_WQM_B64 EXEC, EXEC
260b57cec5SDimitry Andric ///
270b57cec5SDimitry Andric /// to enter WQM at the top of the function and surrounds blocks of Exact
280b57cec5SDimitry Andric /// instructions by
290b57cec5SDimitry Andric ///
300b57cec5SDimitry Andric /// S_AND_SAVEEXEC_B64 Tmp, LiveMask
310b57cec5SDimitry Andric /// ...
320b57cec5SDimitry Andric /// S_MOV_B64 EXEC, Tmp
330b57cec5SDimitry Andric ///
34fe6060f1SDimitry Andric /// We also compute when a sequence of instructions requires strict whole
35fe6060f1SDimitry Andric /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
360b57cec5SDimitry Andric ///
370b57cec5SDimitry Andric /// S_OR_SAVEEXEC_B64 Tmp, -1
380b57cec5SDimitry Andric /// ...
390b57cec5SDimitry Andric /// S_MOV_B64 EXEC, Tmp
400b57cec5SDimitry Andric ///
41fe6060f1SDimitry Andric /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42fe6060f1SDimitry Andric /// we use a similar save and restore mechanism and force whole quad mode for
43fe6060f1SDimitry Andric /// those instructions:
44fe6060f1SDimitry Andric ///
45fe6060f1SDimitry Andric /// S_MOV_B64 Tmp, EXEC
46fe6060f1SDimitry Andric /// S_WQM_B64 EXEC, EXEC
47fe6060f1SDimitry Andric /// ...
48fe6060f1SDimitry Andric /// S_MOV_B64 EXEC, Tmp
49fe6060f1SDimitry Andric ///
500b57cec5SDimitry Andric /// In order to avoid excessive switching during sequences of Exact
510b57cec5SDimitry Andric /// instructions, the pass first analyzes which instructions must be run in WQM
520b57cec5SDimitry Andric /// (aka which instructions produce values that lead to derivative
530b57cec5SDimitry Andric /// computations).
540b57cec5SDimitry Andric ///
550b57cec5SDimitry Andric /// Basic blocks are always exited in WQM as long as some successor needs WQM.
560b57cec5SDimitry Andric ///
570b57cec5SDimitry Andric /// There is room for improvement given better control flow analysis:
580b57cec5SDimitry Andric ///
590b57cec5SDimitry Andric /// (1) at the top level (outside of control flow statements, and as long as
600b57cec5SDimitry Andric /// kill hasn't been used), one SGPR can be saved by recovering WQM from
610b57cec5SDimitry Andric /// the LiveMask (this is implemented for the entry block).
620b57cec5SDimitry Andric ///
630b57cec5SDimitry Andric /// (2) when entire regions (e.g. if-else blocks or entire loops) only
640b57cec5SDimitry Andric /// consist of exact and don't-care instructions, the switch only has to
650b57cec5SDimitry Andric /// be done at the entry and exit points rather than potentially in each
660b57cec5SDimitry Andric /// block of the region.
670b57cec5SDimitry Andric ///
680b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
690b57cec5SDimitry Andric
700b57cec5SDimitry Andric #include "AMDGPU.h"
71e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
72480093f4SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
735ffd83dbSDimitry Andric #include "llvm/ADT/MapVector.h"
740b57cec5SDimitry Andric #include "llvm/ADT/PostOrderIterator.h"
750b57cec5SDimitry Andric #include "llvm/CodeGen/LiveIntervals.h"
760b57cec5SDimitry Andric #include "llvm/CodeGen/MachineBasicBlock.h"
77fe6060f1SDimitry Andric #include "llvm/CodeGen/MachineDominators.h"
780b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
790b57cec5SDimitry Andric #include "llvm/CodeGen/MachineInstr.h"
80fe6060f1SDimitry Andric #include "llvm/CodeGen/MachinePostDominators.h"
810b57cec5SDimitry Andric #include "llvm/IR/CallingConv.h"
82480093f4SDimitry Andric #include "llvm/InitializePasses.h"
830b57cec5SDimitry Andric #include "llvm/Support/raw_ostream.h"
840b57cec5SDimitry Andric
850b57cec5SDimitry Andric using namespace llvm;
860b57cec5SDimitry Andric
870b57cec5SDimitry Andric #define DEBUG_TYPE "si-wqm"
880b57cec5SDimitry Andric
890b57cec5SDimitry Andric namespace {
900b57cec5SDimitry Andric
910b57cec5SDimitry Andric enum {
920b57cec5SDimitry Andric StateWQM = 0x1,
93fe6060f1SDimitry Andric StateStrictWWM = 0x2,
94fe6060f1SDimitry Andric StateStrictWQM = 0x4,
95fe6060f1SDimitry Andric StateExact = 0x8,
96fe6060f1SDimitry Andric StateStrict = StateStrictWWM | StateStrictWQM,
970b57cec5SDimitry Andric };
980b57cec5SDimitry Andric
990b57cec5SDimitry Andric struct PrintState {
1000b57cec5SDimitry Andric public:
1010b57cec5SDimitry Andric int State;
1020b57cec5SDimitry Andric
PrintState__anonf56fbe7e0111::PrintState1030b57cec5SDimitry Andric explicit PrintState(int State) : State(State) {}
1040b57cec5SDimitry Andric };
1050b57cec5SDimitry Andric
1060b57cec5SDimitry Andric #ifndef NDEBUG
operator <<(raw_ostream & OS,const PrintState & PS)1070b57cec5SDimitry Andric static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
1080b57cec5SDimitry Andric
109fe6060f1SDimitry Andric static const std::pair<char, const char *> Mapping[] = {
110bdd1243dSDimitry Andric std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111bdd1243dSDimitry Andric std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112fe6060f1SDimitry Andric char State = PS.State;
113fe6060f1SDimitry Andric for (auto M : Mapping) {
114fe6060f1SDimitry Andric if (State & M.first) {
115fe6060f1SDimitry Andric OS << M.second;
116fe6060f1SDimitry Andric State &= ~M.first;
117fe6060f1SDimitry Andric
118fe6060f1SDimitry Andric if (State)
119fe6060f1SDimitry Andric OS << '|';
120fe6060f1SDimitry Andric }
121fe6060f1SDimitry Andric }
122fe6060f1SDimitry Andric assert(State == 0);
1230b57cec5SDimitry Andric return OS;
1240b57cec5SDimitry Andric }
1250b57cec5SDimitry Andric #endif
1260b57cec5SDimitry Andric
1270b57cec5SDimitry Andric struct InstrInfo {
1280b57cec5SDimitry Andric char Needs = 0;
1290b57cec5SDimitry Andric char Disabled = 0;
1300b57cec5SDimitry Andric char OutNeeds = 0;
1310b57cec5SDimitry Andric };
1320b57cec5SDimitry Andric
1330b57cec5SDimitry Andric struct BlockInfo {
1340b57cec5SDimitry Andric char Needs = 0;
1350b57cec5SDimitry Andric char InNeeds = 0;
1360b57cec5SDimitry Andric char OutNeeds = 0;
137fe6060f1SDimitry Andric char InitialState = 0;
138fe6060f1SDimitry Andric bool NeedsLowering = false;
1390b57cec5SDimitry Andric };
1400b57cec5SDimitry Andric
1410b57cec5SDimitry Andric struct WorkItem {
1420b57cec5SDimitry Andric MachineBasicBlock *MBB = nullptr;
1430b57cec5SDimitry Andric MachineInstr *MI = nullptr;
1440b57cec5SDimitry Andric
1450b57cec5SDimitry Andric WorkItem() = default;
WorkItem__anonf56fbe7e0111::WorkItem1460b57cec5SDimitry Andric WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
WorkItem__anonf56fbe7e0111::WorkItem1470b57cec5SDimitry Andric WorkItem(MachineInstr *MI) : MI(MI) {}
1480b57cec5SDimitry Andric };
1490b57cec5SDimitry Andric
1500b57cec5SDimitry Andric class SIWholeQuadMode : public MachineFunctionPass {
1510b57cec5SDimitry Andric private:
1520b57cec5SDimitry Andric const SIInstrInfo *TII;
1530b57cec5SDimitry Andric const SIRegisterInfo *TRI;
1540b57cec5SDimitry Andric const GCNSubtarget *ST;
1550b57cec5SDimitry Andric MachineRegisterInfo *MRI;
1560b57cec5SDimitry Andric LiveIntervals *LIS;
157fe6060f1SDimitry Andric MachineDominatorTree *MDT;
158fe6060f1SDimitry Andric MachinePostDominatorTree *PDT;
1590b57cec5SDimitry Andric
160e8d8bef9SDimitry Andric unsigned AndOpc;
16106c3fb27SDimitry Andric unsigned AndTermOpc;
162fe6060f1SDimitry Andric unsigned AndN2Opc;
163fe6060f1SDimitry Andric unsigned XorOpc;
164fe6060f1SDimitry Andric unsigned AndSaveExecOpc;
16506c3fb27SDimitry Andric unsigned AndSaveExecTermOpc;
166fe6060f1SDimitry Andric unsigned WQMOpc;
167fe6060f1SDimitry Andric Register Exec;
168fe6060f1SDimitry Andric Register LiveMaskReg;
169e8d8bef9SDimitry Andric
1700b57cec5SDimitry Andric DenseMap<const MachineInstr *, InstrInfo> Instructions;
1715ffd83dbSDimitry Andric MapVector<MachineBasicBlock *, BlockInfo> Blocks;
172fe6060f1SDimitry Andric
173fe6060f1SDimitry Andric // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
174fe6060f1SDimitry Andric DenseMap<const MachineInstr *, char> StateTransition;
175fe6060f1SDimitry Andric
176fe6060f1SDimitry Andric SmallVector<MachineInstr *, 2> LiveMaskQueries;
177480093f4SDimitry Andric SmallVector<MachineInstr *, 4> LowerToMovInstrs;
1780b57cec5SDimitry Andric SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
179fe6060f1SDimitry Andric SmallVector<MachineInstr *, 4> KillInstrs;
180*0fca6ea1SDimitry Andric SmallVector<MachineInstr *, 4> InitExecInstrs;
1810b57cec5SDimitry Andric
1820b57cec5SDimitry Andric void printInfo();
1830b57cec5SDimitry Andric
1840b57cec5SDimitry Andric void markInstruction(MachineInstr &MI, char Flag,
1850b57cec5SDimitry Andric std::vector<WorkItem> &Worklist);
186e8d8bef9SDimitry Andric void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
187e8d8bef9SDimitry Andric unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
188fe6060f1SDimitry Andric void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
189fe6060f1SDimitry Andric std::vector<WorkItem> &Worklist);
1900b57cec5SDimitry Andric void markInstructionUses(const MachineInstr &MI, char Flag,
1910b57cec5SDimitry Andric std::vector<WorkItem> &Worklist);
1920b57cec5SDimitry Andric char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
1930b57cec5SDimitry Andric void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
1940b57cec5SDimitry Andric void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
1950b57cec5SDimitry Andric char analyzeFunction(MachineFunction &MF);
1960b57cec5SDimitry Andric
1970b57cec5SDimitry Andric MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
1980b57cec5SDimitry Andric MachineBasicBlock::iterator Before);
1990b57cec5SDimitry Andric MachineBasicBlock::iterator
2000b57cec5SDimitry Andric prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
2010b57cec5SDimitry Andric MachineBasicBlock::iterator Last, bool PreferLast,
2020b57cec5SDimitry Andric bool SaveSCC);
2030b57cec5SDimitry Andric void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
204fe6060f1SDimitry Andric Register SaveWQM);
2050b57cec5SDimitry Andric void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206fe6060f1SDimitry Andric Register SavedWQM);
207fe6060f1SDimitry Andric void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
208fe6060f1SDimitry Andric Register SaveOrig, char StrictStateNeeded);
209fe6060f1SDimitry Andric void fromStrictMode(MachineBasicBlock &MBB,
210fe6060f1SDimitry Andric MachineBasicBlock::iterator Before, Register SavedOrig,
211fe6060f1SDimitry Andric char NonStrictState, char CurrentStrictState);
2120b57cec5SDimitry Andric
213fe6060f1SDimitry Andric MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
214fe6060f1SDimitry Andric
215fe6060f1SDimitry Andric MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
216fe6060f1SDimitry Andric bool IsWQM);
217fe6060f1SDimitry Andric MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
218fe6060f1SDimitry Andric
219fe6060f1SDimitry Andric void lowerBlock(MachineBasicBlock &MBB);
220fe6060f1SDimitry Andric void processBlock(MachineBasicBlock &MBB, bool IsEntry);
221fe6060f1SDimitry Andric
222*0fca6ea1SDimitry Andric bool lowerLiveMaskQueries();
223*0fca6ea1SDimitry Andric bool lowerCopyInstrs();
224*0fca6ea1SDimitry Andric bool lowerKillInstrs(bool IsWQM);
225*0fca6ea1SDimitry Andric void lowerInitExec(MachineInstr &MI);
226*0fca6ea1SDimitry Andric MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
227*0fca6ea1SDimitry Andric bool &Changed);
2280b57cec5SDimitry Andric
2290b57cec5SDimitry Andric public:
2300b57cec5SDimitry Andric static char ID;
2310b57cec5SDimitry Andric
SIWholeQuadMode()2320b57cec5SDimitry Andric SIWholeQuadMode() :
2330b57cec5SDimitry Andric MachineFunctionPass(ID) { }
2340b57cec5SDimitry Andric
2350b57cec5SDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override;
2360b57cec5SDimitry Andric
getPassName() const2370b57cec5SDimitry Andric StringRef getPassName() const override { return "SI Whole Quad Mode"; }
2380b57cec5SDimitry Andric
getAnalysisUsage(AnalysisUsage & AU) const2390b57cec5SDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override {
240*0fca6ea1SDimitry Andric AU.addRequired<LiveIntervalsWrapperPass>();
241*0fca6ea1SDimitry Andric AU.addPreserved<SlotIndexesWrapperPass>();
242*0fca6ea1SDimitry Andric AU.addPreserved<LiveIntervalsWrapperPass>();
243*0fca6ea1SDimitry Andric AU.addPreserved<MachineDominatorTreeWrapperPass>();
244*0fca6ea1SDimitry Andric AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
2450b57cec5SDimitry Andric MachineFunctionPass::getAnalysisUsage(AU);
2460b57cec5SDimitry Andric }
247fe6060f1SDimitry Andric
getClearedProperties() const248fe6060f1SDimitry Andric MachineFunctionProperties getClearedProperties() const override {
249fe6060f1SDimitry Andric return MachineFunctionProperties().set(
250fe6060f1SDimitry Andric MachineFunctionProperties::Property::IsSSA);
251fe6060f1SDimitry Andric }
2520b57cec5SDimitry Andric };
2530b57cec5SDimitry Andric
2540b57cec5SDimitry Andric } // end anonymous namespace
2550b57cec5SDimitry Andric
2560b57cec5SDimitry Andric char SIWholeQuadMode::ID = 0;
2570b57cec5SDimitry Andric
2580b57cec5SDimitry Andric INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
2590b57cec5SDimitry Andric false)
260*0fca6ea1SDimitry Andric INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
261*0fca6ea1SDimitry Andric INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
262*0fca6ea1SDimitry Andric INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
2630b57cec5SDimitry Andric INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
2640b57cec5SDimitry Andric false)
2650b57cec5SDimitry Andric
2660b57cec5SDimitry Andric char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
2670b57cec5SDimitry Andric
createSIWholeQuadModePass()2680b57cec5SDimitry Andric FunctionPass *llvm::createSIWholeQuadModePass() {
2690b57cec5SDimitry Andric return new SIWholeQuadMode;
2700b57cec5SDimitry Andric }
2710b57cec5SDimitry Andric
2720b57cec5SDimitry Andric #ifndef NDEBUG
printInfo()2730b57cec5SDimitry Andric LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
2740b57cec5SDimitry Andric for (const auto &BII : Blocks) {
2750b57cec5SDimitry Andric dbgs() << "\n"
2760b57cec5SDimitry Andric << printMBBReference(*BII.first) << ":\n"
2770b57cec5SDimitry Andric << " InNeeds = " << PrintState(BII.second.InNeeds)
2780b57cec5SDimitry Andric << ", Needs = " << PrintState(BII.second.Needs)
2790b57cec5SDimitry Andric << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
2800b57cec5SDimitry Andric
2810b57cec5SDimitry Andric for (const MachineInstr &MI : *BII.first) {
2820b57cec5SDimitry Andric auto III = Instructions.find(&MI);
283*0fca6ea1SDimitry Andric if (III != Instructions.end()) {
2840b57cec5SDimitry Andric dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
2850b57cec5SDimitry Andric << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
2860b57cec5SDimitry Andric }
2870b57cec5SDimitry Andric }
2880b57cec5SDimitry Andric }
289*0fca6ea1SDimitry Andric }
2900b57cec5SDimitry Andric #endif
2910b57cec5SDimitry Andric
markInstruction(MachineInstr & MI,char Flag,std::vector<WorkItem> & Worklist)2920b57cec5SDimitry Andric void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
2930b57cec5SDimitry Andric std::vector<WorkItem> &Worklist) {
2940b57cec5SDimitry Andric InstrInfo &II = Instructions[&MI];
2950b57cec5SDimitry Andric
2960b57cec5SDimitry Andric assert(!(Flag & StateExact) && Flag != 0);
2970b57cec5SDimitry Andric
2980b57cec5SDimitry Andric // Remove any disabled states from the flag. The user that required it gets
2990b57cec5SDimitry Andric // an undefined value in the helper lanes. For example, this can happen if
3000b57cec5SDimitry Andric // the result of an atomic is used by instruction that requires WQM, where
3010b57cec5SDimitry Andric // ignoring the request for WQM is correct as per the relevant specs.
3020b57cec5SDimitry Andric Flag &= ~II.Disabled;
3030b57cec5SDimitry Andric
3040b57cec5SDimitry Andric // Ignore if the flag is already encompassed by the existing needs, or we
3050b57cec5SDimitry Andric // just disabled everything.
3060b57cec5SDimitry Andric if ((II.Needs & Flag) == Flag)
3070b57cec5SDimitry Andric return;
3080b57cec5SDimitry Andric
309fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
3100b57cec5SDimitry Andric II.Needs |= Flag;
311*0fca6ea1SDimitry Andric Worklist.emplace_back(&MI);
3120b57cec5SDimitry Andric }
3130b57cec5SDimitry Andric
314e8d8bef9SDimitry Andric /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
markDefs(const MachineInstr & UseMI,LiveRange & LR,Register Reg,unsigned SubReg,char Flag,std::vector<WorkItem> & Worklist)315e8d8bef9SDimitry Andric void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
316e8d8bef9SDimitry Andric Register Reg, unsigned SubReg, char Flag,
317e8d8bef9SDimitry Andric std::vector<WorkItem> &Worklist) {
318e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
319e8d8bef9SDimitry Andric
320e8d8bef9SDimitry Andric LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
321fe6060f1SDimitry Andric const VNInfo *Value = UseLRQ.valueIn();
322fe6060f1SDimitry Andric if (!Value)
323e8d8bef9SDimitry Andric return;
324e8d8bef9SDimitry Andric
325fe6060f1SDimitry Andric // Note: this code assumes that lane masks on AMDGPU completely
326fe6060f1SDimitry Andric // cover registers.
327fe6060f1SDimitry Andric const LaneBitmask UseLanes =
328fe6060f1SDimitry Andric SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
329fe6060f1SDimitry Andric : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
330fe6060f1SDimitry Andric : LaneBitmask::getNone());
331fe6060f1SDimitry Andric
332fe6060f1SDimitry Andric // Perform a depth-first iteration of the LiveRange graph marking defs.
333fe6060f1SDimitry Andric // Stop processing of a given branch when all use lanes have been defined.
334fe6060f1SDimitry Andric // The first definition stops processing for a physical register.
335fe6060f1SDimitry Andric struct PhiEntry {
336fe6060f1SDimitry Andric const VNInfo *Phi;
337fe6060f1SDimitry Andric unsigned PredIdx;
338fe6060f1SDimitry Andric LaneBitmask DefinedLanes;
339fe6060f1SDimitry Andric
340fe6060f1SDimitry Andric PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
341fe6060f1SDimitry Andric : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
342fe6060f1SDimitry Andric };
343fe6060f1SDimitry Andric using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
344fe6060f1SDimitry Andric SmallVector<PhiEntry, 2> PhiStack;
345fe6060f1SDimitry Andric SmallSet<VisitKey, 4> Visited;
346fe6060f1SDimitry Andric LaneBitmask DefinedLanes;
347fe6060f1SDimitry Andric unsigned NextPredIdx = 0; // Only used for processing phi nodes
348e8d8bef9SDimitry Andric do {
349fe6060f1SDimitry Andric const VNInfo *NextValue = nullptr;
350fe6060f1SDimitry Andric const VisitKey Key(Value, DefinedLanes);
351fe6060f1SDimitry Andric
35281ad6265SDimitry Andric if (Visited.insert(Key).second) {
353fe6060f1SDimitry Andric // On first visit to a phi then start processing first predecessor
354fe6060f1SDimitry Andric NextPredIdx = 0;
355fe6060f1SDimitry Andric }
356e8d8bef9SDimitry Andric
357e8d8bef9SDimitry Andric if (Value->isPHIDef()) {
358fe6060f1SDimitry Andric // Each predecessor node in the phi must be processed as a subgraph
359e8d8bef9SDimitry Andric const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
360e8d8bef9SDimitry Andric assert(MBB && "Phi-def has no defining MBB");
361fe6060f1SDimitry Andric
362fe6060f1SDimitry Andric // Find next predecessor to process
363fe6060f1SDimitry Andric unsigned Idx = NextPredIdx;
364fe6060f1SDimitry Andric auto PI = MBB->pred_begin() + Idx;
365fe6060f1SDimitry Andric auto PE = MBB->pred_end();
366fe6060f1SDimitry Andric for (; PI != PE && !NextValue; ++PI, ++Idx) {
367e8d8bef9SDimitry Andric if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
368fe6060f1SDimitry Andric if (!Visited.count(VisitKey(VN, DefinedLanes)))
369fe6060f1SDimitry Andric NextValue = VN;
370e8d8bef9SDimitry Andric }
371e8d8bef9SDimitry Andric }
372fe6060f1SDimitry Andric
373fe6060f1SDimitry Andric // If there are more predecessors to process; add phi to stack
374fe6060f1SDimitry Andric if (PI != PE)
375fe6060f1SDimitry Andric PhiStack.emplace_back(Value, Idx, DefinedLanes);
376e8d8bef9SDimitry Andric } else {
377e8d8bef9SDimitry Andric MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
378e8d8bef9SDimitry Andric assert(MI && "Def has no defining instruction");
379e8d8bef9SDimitry Andric
380fe6060f1SDimitry Andric if (Reg.isVirtual()) {
381e8d8bef9SDimitry Andric // Iterate over all operands to find relevant definitions
382fe6060f1SDimitry Andric bool HasDef = false;
38306c3fb27SDimitry Andric for (const MachineOperand &Op : MI->all_defs()) {
38406c3fb27SDimitry Andric if (Op.getReg() != Reg)
385e8d8bef9SDimitry Andric continue;
386e8d8bef9SDimitry Andric
387fe6060f1SDimitry Andric // Compute lanes defined and overlap with use
388fe6060f1SDimitry Andric LaneBitmask OpLanes =
389fe6060f1SDimitry Andric Op.isUndef() ? LaneBitmask::getAll()
390fe6060f1SDimitry Andric : TRI->getSubRegIndexLaneMask(Op.getSubReg());
391fe6060f1SDimitry Andric LaneBitmask Overlap = (UseLanes & OpLanes);
392fe6060f1SDimitry Andric
393fe6060f1SDimitry Andric // Record if this instruction defined any of use
394fe6060f1SDimitry Andric HasDef |= Overlap.any();
395fe6060f1SDimitry Andric
396fe6060f1SDimitry Andric // Mark any lanes defined
397fe6060f1SDimitry Andric DefinedLanes |= OpLanes;
398fe6060f1SDimitry Andric }
399fe6060f1SDimitry Andric
400fe6060f1SDimitry Andric // Check if all lanes of use have been defined
401fe6060f1SDimitry Andric if ((DefinedLanes & UseLanes) != UseLanes) {
402fe6060f1SDimitry Andric // Definition not complete; need to process input value
403e8d8bef9SDimitry Andric LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
404e8d8bef9SDimitry Andric if (const VNInfo *VN = LRQ.valueIn()) {
405fe6060f1SDimitry Andric if (!Visited.count(VisitKey(VN, DefinedLanes)))
406fe6060f1SDimitry Andric NextValue = VN;
407e8d8bef9SDimitry Andric }
408e8d8bef9SDimitry Andric }
409e8d8bef9SDimitry Andric
410fe6060f1SDimitry Andric // Only mark the instruction if it defines some part of the use
411fe6060f1SDimitry Andric if (HasDef)
412fe6060f1SDimitry Andric markInstruction(*MI, Flag, Worklist);
413fe6060f1SDimitry Andric } else {
414fe6060f1SDimitry Andric // For physical registers simply mark the defining instruction
415fe6060f1SDimitry Andric markInstruction(*MI, Flag, Worklist);
416fe6060f1SDimitry Andric }
417fe6060f1SDimitry Andric }
418fe6060f1SDimitry Andric
419fe6060f1SDimitry Andric if (!NextValue && !PhiStack.empty()) {
420fe6060f1SDimitry Andric // Reach end of chain; revert to processing last phi
421fe6060f1SDimitry Andric PhiEntry &Entry = PhiStack.back();
422fe6060f1SDimitry Andric NextValue = Entry.Phi;
423fe6060f1SDimitry Andric NextPredIdx = Entry.PredIdx;
424fe6060f1SDimitry Andric DefinedLanes = Entry.DefinedLanes;
425fe6060f1SDimitry Andric PhiStack.pop_back();
426fe6060f1SDimitry Andric }
427fe6060f1SDimitry Andric
428fe6060f1SDimitry Andric Value = NextValue;
429fe6060f1SDimitry Andric } while (Value);
430fe6060f1SDimitry Andric }
431fe6060f1SDimitry Andric
markOperand(const MachineInstr & MI,const MachineOperand & Op,char Flag,std::vector<WorkItem> & Worklist)432fe6060f1SDimitry Andric void SIWholeQuadMode::markOperand(const MachineInstr &MI,
433fe6060f1SDimitry Andric const MachineOperand &Op, char Flag,
4340b57cec5SDimitry Andric std::vector<WorkItem> &Worklist) {
435fe6060f1SDimitry Andric assert(Op.isReg());
436fe6060f1SDimitry Andric Register Reg = Op.getReg();
437e8d8bef9SDimitry Andric
438fe6060f1SDimitry Andric // Ignore some hardware registers
439fe6060f1SDimitry Andric switch (Reg) {
440fe6060f1SDimitry Andric case AMDGPU::EXEC:
441fe6060f1SDimitry Andric case AMDGPU::EXEC_LO:
442fe6060f1SDimitry Andric return;
443fe6060f1SDimitry Andric default:
444fe6060f1SDimitry Andric break;
445fe6060f1SDimitry Andric }
446e8d8bef9SDimitry Andric
447fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
448fe6060f1SDimitry Andric << " for " << MI);
449fe6060f1SDimitry Andric if (Reg.isVirtual()) {
450fe6060f1SDimitry Andric LiveRange &LR = LIS->getInterval(Reg);
451fe6060f1SDimitry Andric markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
452fe6060f1SDimitry Andric } else {
4530b57cec5SDimitry Andric // Handle physical registers that we need to track; this is mostly relevant
4540b57cec5SDimitry Andric // for VCC, which can appear as the (implicit) input of a uniform branch,
4550b57cec5SDimitry Andric // e.g. when a loop counter is stored in a VGPR.
45606c3fb27SDimitry Andric for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
45706c3fb27SDimitry Andric LiveRange &LR = LIS->getRegUnit(Unit);
4580b57cec5SDimitry Andric const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
459*0fca6ea1SDimitry Andric if (Value)
46006c3fb27SDimitry Andric markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
461e8d8bef9SDimitry Andric }
4620b57cec5SDimitry Andric }
463fe6060f1SDimitry Andric }
4640b57cec5SDimitry Andric
465fe6060f1SDimitry Andric /// Mark all instructions defining the uses in \p MI with \p Flag.
markInstructionUses(const MachineInstr & MI,char Flag,std::vector<WorkItem> & Worklist)466fe6060f1SDimitry Andric void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
467fe6060f1SDimitry Andric std::vector<WorkItem> &Worklist) {
468fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
469fe6060f1SDimitry Andric << MI);
470fe6060f1SDimitry Andric
47106c3fb27SDimitry Andric for (const MachineOperand &Use : MI.all_uses())
472fe6060f1SDimitry Andric markOperand(MI, Use, Flag, Worklist);
4730b57cec5SDimitry Andric }
4740b57cec5SDimitry Andric
4750b57cec5SDimitry Andric // Scan instructions to determine which ones require an Exact execmask and
4760b57cec5SDimitry Andric // which ones seed WQM requirements.
scanInstructions(MachineFunction & MF,std::vector<WorkItem> & Worklist)4770b57cec5SDimitry Andric char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
4780b57cec5SDimitry Andric std::vector<WorkItem> &Worklist) {
4790b57cec5SDimitry Andric char GlobalFlags = 0;
4800b57cec5SDimitry Andric bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
4810b57cec5SDimitry Andric SmallVector<MachineInstr *, 4> SetInactiveInstrs;
4828bcb0991SDimitry Andric SmallVector<MachineInstr *, 4> SoftWQMInstrs;
4834824e7fdSDimitry Andric bool HasImplicitDerivatives =
4844824e7fdSDimitry Andric MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
4850b57cec5SDimitry Andric
4860b57cec5SDimitry Andric // We need to visit the basic blocks in reverse post-order so that we visit
4870b57cec5SDimitry Andric // defs before uses, in particular so that we don't accidentally mark an
4880b57cec5SDimitry Andric // instruction as needing e.g. WQM before visiting it and realizing it needs
4890b57cec5SDimitry Andric // WQM disabled.
4900b57cec5SDimitry Andric ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
4910eae32dcSDimitry Andric for (MachineBasicBlock *MBB : RPOT) {
4920eae32dcSDimitry Andric BlockInfo &BBI = Blocks[MBB];
4930b57cec5SDimitry Andric
4940eae32dcSDimitry Andric for (MachineInstr &MI : *MBB) {
4950b57cec5SDimitry Andric InstrInfo &III = Instructions[&MI];
4960b57cec5SDimitry Andric unsigned Opcode = MI.getOpcode();
4970b57cec5SDimitry Andric char Flags = 0;
4980b57cec5SDimitry Andric
4990b57cec5SDimitry Andric if (TII->isWQM(Opcode)) {
500fe6060f1SDimitry Andric // If LOD is not supported WQM is not needed.
5014824e7fdSDimitry Andric // Only generate implicit WQM if implicit derivatives are required.
5024824e7fdSDimitry Andric // This avoids inserting unintended WQM if a shader type without
5034824e7fdSDimitry Andric // implicit derivatives uses an image sampling instruction.
504*0fca6ea1SDimitry Andric if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
5050b57cec5SDimitry Andric // Sampling instructions don't need to produce results for all pixels
5060b57cec5SDimitry Andric // in a quad, they just require all inputs of a quad to have been
5070b57cec5SDimitry Andric // computed for derivatives.
5080b57cec5SDimitry Andric markInstructionUses(MI, StateWQM, Worklist);
5090b57cec5SDimitry Andric GlobalFlags |= StateWQM;
510*0fca6ea1SDimitry Andric }
5110b57cec5SDimitry Andric } else if (Opcode == AMDGPU::WQM) {
5120b57cec5SDimitry Andric // The WQM intrinsic requires its output to have all the helper lanes
5130b57cec5SDimitry Andric // correct, so we need it to be in WQM.
5140b57cec5SDimitry Andric Flags = StateWQM;
5150b57cec5SDimitry Andric LowerToCopyInstrs.push_back(&MI);
5168bcb0991SDimitry Andric } else if (Opcode == AMDGPU::SOFT_WQM) {
5178bcb0991SDimitry Andric LowerToCopyInstrs.push_back(&MI);
5188bcb0991SDimitry Andric SoftWQMInstrs.push_back(&MI);
519fe6060f1SDimitry Andric } else if (Opcode == AMDGPU::STRICT_WWM) {
520fe6060f1SDimitry Andric // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
521fe6060f1SDimitry Andric // it needs to be executed in WQM or Exact so that its copy doesn't
522fe6060f1SDimitry Andric // clobber inactive lanes.
523fe6060f1SDimitry Andric markInstructionUses(MI, StateStrictWWM, Worklist);
524fe6060f1SDimitry Andric GlobalFlags |= StateStrictWWM;
525fe6060f1SDimitry Andric LowerToMovInstrs.push_back(&MI);
52681ad6265SDimitry Andric } else if (Opcode == AMDGPU::STRICT_WQM ||
52781ad6265SDimitry Andric TII->isDualSourceBlendEXP(MI)) {
528fe6060f1SDimitry Andric // STRICT_WQM is similar to STRICTWWM, but instead of enabling all
529fe6060f1SDimitry Andric // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in
530fe6060f1SDimitry Andric // quads that have at least one active thread.
531fe6060f1SDimitry Andric markInstructionUses(MI, StateStrictWQM, Worklist);
532fe6060f1SDimitry Andric GlobalFlags |= StateStrictWQM;
53381ad6265SDimitry Andric
53481ad6265SDimitry Andric if (Opcode == AMDGPU::STRICT_WQM) {
535480093f4SDimitry Andric LowerToMovInstrs.push_back(&MI);
53681ad6265SDimitry Andric } else {
53781ad6265SDimitry Andric // Dual source blend export acts as implicit strict-wqm, its sources
53881ad6265SDimitry Andric // need to be shuffled in strict wqm, but the export itself needs to
53981ad6265SDimitry Andric // run in exact mode.
54081ad6265SDimitry Andric BBI.Needs |= StateExact;
54181ad6265SDimitry Andric if (!(BBI.InNeeds & StateExact)) {
54281ad6265SDimitry Andric BBI.InNeeds |= StateExact;
543*0fca6ea1SDimitry Andric Worklist.emplace_back(MBB);
54481ad6265SDimitry Andric }
54581ad6265SDimitry Andric GlobalFlags |= StateExact;
54681ad6265SDimitry Andric III.Disabled = StateWQM | StateStrict;
54781ad6265SDimitry Andric }
54881ad6265SDimitry Andric } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
5491db9f3b2SDimitry Andric Opcode == AMDGPU::DS_PARAM_LOAD ||
5501db9f3b2SDimitry Andric Opcode == AMDGPU::LDS_DIRECT_LOAD ||
5511db9f3b2SDimitry Andric Opcode == AMDGPU::DS_DIRECT_LOAD) {
55281ad6265SDimitry Andric // Mark these STRICTWQM, but only for the instruction, not its operands.
      // This avoids unnecessarily marking M0 as requiring WQM.
554*0fca6ea1SDimitry Andric III.Needs |= StateStrictWQM;
55581ad6265SDimitry Andric GlobalFlags |= StateStrictWQM;
5560b57cec5SDimitry Andric } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
5570b57cec5SDimitry Andric Opcode == AMDGPU::V_SET_INACTIVE_B64) {
558fe6060f1SDimitry Andric III.Disabled = StateStrict;
5590b57cec5SDimitry Andric MachineOperand &Inactive = MI.getOperand(2);
5600b57cec5SDimitry Andric if (Inactive.isReg()) {
5610b57cec5SDimitry Andric if (Inactive.isUndef()) {
5620b57cec5SDimitry Andric LowerToCopyInstrs.push_back(&MI);
5630b57cec5SDimitry Andric } else {
564fe6060f1SDimitry Andric markOperand(MI, Inactive, StateStrictWWM, Worklist);
5650b57cec5SDimitry Andric }
5660b57cec5SDimitry Andric }
5670b57cec5SDimitry Andric SetInactiveInstrs.push_back(&MI);
5680b57cec5SDimitry Andric } else if (TII->isDisableWQM(MI)) {
5690b57cec5SDimitry Andric BBI.Needs |= StateExact;
5700b57cec5SDimitry Andric if (!(BBI.InNeeds & StateExact)) {
5710b57cec5SDimitry Andric BBI.InNeeds |= StateExact;
572*0fca6ea1SDimitry Andric Worklist.emplace_back(MBB);
5730b57cec5SDimitry Andric }
5740b57cec5SDimitry Andric GlobalFlags |= StateExact;
575fe6060f1SDimitry Andric III.Disabled = StateWQM | StateStrict;
576*0fca6ea1SDimitry Andric } else if (Opcode == AMDGPU::SI_PS_LIVE ||
577*0fca6ea1SDimitry Andric Opcode == AMDGPU::SI_LIVE_MASK) {
5780b57cec5SDimitry Andric LiveMaskQueries.push_back(&MI);
579fe6060f1SDimitry Andric } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
580fe6060f1SDimitry Andric Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
581fe6060f1SDimitry Andric Opcode == AMDGPU::SI_DEMOTE_I1) {
582fe6060f1SDimitry Andric KillInstrs.push_back(&MI);
583fe6060f1SDimitry Andric BBI.NeedsLowering = true;
584*0fca6ea1SDimitry Andric } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
585*0fca6ea1SDimitry Andric Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
586*0fca6ea1SDimitry Andric InitExecInstrs.push_back(&MI);
5870b57cec5SDimitry Andric } else if (WQMOutputs) {
5880b57cec5SDimitry Andric // The function is in machine SSA form, which means that physical
5890b57cec5SDimitry Andric // VGPRs correspond to shader inputs and outputs. Inputs are
5900b57cec5SDimitry Andric // only used, outputs are only defined.
591fe6060f1SDimitry Andric // FIXME: is this still valid?
5920b57cec5SDimitry Andric for (const MachineOperand &MO : MI.defs()) {
5938bcb0991SDimitry Andric Register Reg = MO.getReg();
594*0fca6ea1SDimitry Andric if (Reg.isPhysical() &&
595bdd1243dSDimitry Andric TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
5960b57cec5SDimitry Andric Flags = StateWQM;
5970b57cec5SDimitry Andric break;
5980b57cec5SDimitry Andric }
5990b57cec5SDimitry Andric }
6000b57cec5SDimitry Andric }
6010b57cec5SDimitry Andric
602*0fca6ea1SDimitry Andric if (Flags) {
6030b57cec5SDimitry Andric markInstruction(MI, Flags, Worklist);
6040b57cec5SDimitry Andric GlobalFlags |= Flags;
6050b57cec5SDimitry Andric }
6060b57cec5SDimitry Andric }
607*0fca6ea1SDimitry Andric }
6080b57cec5SDimitry Andric
  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
6100b57cec5SDimitry Andric // ever used anywhere in the function. This implements the corresponding
6110b57cec5SDimitry Andric // semantics of @llvm.amdgcn.set.inactive.
6128bcb0991SDimitry Andric // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
6130b57cec5SDimitry Andric if (GlobalFlags & StateWQM) {
6140b57cec5SDimitry Andric for (MachineInstr *MI : SetInactiveInstrs)
6150b57cec5SDimitry Andric markInstruction(*MI, StateWQM, Worklist);
6168bcb0991SDimitry Andric for (MachineInstr *MI : SoftWQMInstrs)
6178bcb0991SDimitry Andric markInstruction(*MI, StateWQM, Worklist);
6180b57cec5SDimitry Andric }
6190b57cec5SDimitry Andric
6200b57cec5SDimitry Andric return GlobalFlags;
6210b57cec5SDimitry Andric }
6220b57cec5SDimitry Andric
// Re-evaluate the state requirements of one instruction and push the
// consequences onto the worklist: the containing block's needs, the previous
// instruction's OutNeeds, and WQM marking of the instruction's uses.
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    // Update both the stored entry and the local copy used below.
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.emplace_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    // Strict states apply only to the instruction itself, so mask them out
    // of what flows upward as the previous instruction's OutNeeds.
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.emplace_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
  // not require any WQM transitions.
  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
}
6710b57cec5SDimitry Andric
// Propagate block-level needs across the CFG: seed backward intra-block
// propagation by pushing the block's OutNeeds into its last instruction,
// require predecessors to provide this block's InNeeds, and require
// successors to accept its OutNeeds. Anything whose flags change is
// re-enqueued on the worklist.
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem>& Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.emplace_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.emplace_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.emplace_back(Succ);
  }
}
7070b57cec5SDimitry Andric
analyzeFunction(MachineFunction & MF)7080b57cec5SDimitry Andric char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
7090b57cec5SDimitry Andric std::vector<WorkItem> Worklist;
7100b57cec5SDimitry Andric char GlobalFlags = scanInstructions(MF, Worklist);
7110b57cec5SDimitry Andric
7120b57cec5SDimitry Andric while (!Worklist.empty()) {
7130b57cec5SDimitry Andric WorkItem WI = Worklist.back();
7140b57cec5SDimitry Andric Worklist.pop_back();
7150b57cec5SDimitry Andric
7160b57cec5SDimitry Andric if (WI.MI)
7170b57cec5SDimitry Andric propagateInstruction(*WI.MI, Worklist);
7180b57cec5SDimitry Andric else
7190b57cec5SDimitry Andric propagateBlock(*WI.MBB, Worklist);
7200b57cec5SDimitry Andric }
7210b57cec5SDimitry Andric
7220b57cec5SDimitry Andric return GlobalFlags;
7230b57cec5SDimitry Andric }
7240b57cec5SDimitry Andric
7250b57cec5SDimitry Andric MachineBasicBlock::iterator
saveSCC(MachineBasicBlock & MBB,MachineBasicBlock::iterator Before)7260b57cec5SDimitry Andric SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
7270b57cec5SDimitry Andric MachineBasicBlock::iterator Before) {
7288bcb0991SDimitry Andric Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7290b57cec5SDimitry Andric
7300b57cec5SDimitry Andric MachineInstr *Save =
7310b57cec5SDimitry Andric BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
7320b57cec5SDimitry Andric .addReg(AMDGPU::SCC);
7330b57cec5SDimitry Andric MachineInstr *Restore =
7340b57cec5SDimitry Andric BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
7350b57cec5SDimitry Andric .addReg(SaveReg);
7360b57cec5SDimitry Andric
7370b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*Save);
7380b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*Restore);
7390b57cec5SDimitry Andric LIS->createAndComputeVirtRegInterval(SaveReg);
7400b57cec5SDimitry Andric
7410b57cec5SDimitry Andric return Restore;
7420b57cec5SDimitry Andric }
7430b57cec5SDimitry Andric
// Split \p BB after \p TermMI, converting \p TermMI into its *_term
// terminator form when it matches one of the expected EXEC-update patterns,
// and keep the (post-)dominator trees and LiveIntervals up to date.
// Returns the block containing the instructions after the split point.
MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
                                               MachineInstr *TermMI) {
  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
                    << *TermMI << "\n");

  MachineBasicBlock *SplitBB =
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);

  // Convert last instruction in block to a terminator.
  // Note: this only covers the expected patterns
  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
  default:
    break;
  }
  if (NewOpcode)
    TermMI->setDesc(TII->get(NewOpcode));

  if (SplitBB != BB) {
    // Update dominator trees: SplitBB takes over BB's old successor edges,
    // and BB gains a single edge to SplitBB.
    using DomTreeT = DomTreeBase<MachineBasicBlock>;
    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
    for (MachineBasicBlock *Succ : SplitBB->successors()) {
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->getBase().applyUpdates(DTUpdates);
    if (PDT)
      PDT->applyUpdates(DTUpdates);

    // Link blocks with an explicit unconditional branch, registered with LIS.
    MachineInstr *MI =
        BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(SplitBB);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  return SplitBB;
}
797fe6060f1SDimitry Andric
// Lower SI_KILL_F32_COND_IMM_TERMINATOR: emit an inverted V_CMP computing the
// killed lanes into VCC, clear those lanes from the live mask and EXEC, and
// insert an early-terminate check. Returns the new block terminator (an
// S_BRANCH to the sole successor), which callers use as a split point.
MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
                                            MachineInstr &MI) {
  assert(LiveMaskReg.isVirtual());

  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Opcode = 0;

  assert(MI.getOperand(0).isReg());

  // Comparison is for live lanes; however here we compute the inverse
  // (killed lanes). This is because VCMP will always generate 0 bits
  // for inactive lanes so a mask of live lanes would not be correct
  // inside control flow.
  // Invert the comparison by swapping the operands and adjusting
  // the comparison codes.

  switch (MI.getOperand(2).getImm()) {
  case ISD::SETUEQ:
    Opcode = AMDGPU::V_CMP_LG_F32_e64;
    break;
  case ISD::SETUGT:
    Opcode = AMDGPU::V_CMP_GE_F32_e64;
    break;
  case ISD::SETUGE:
    Opcode = AMDGPU::V_CMP_GT_F32_e64;
    break;
  case ISD::SETULT:
    Opcode = AMDGPU::V_CMP_LE_F32_e64;
    break;
  case ISD::SETULE:
    Opcode = AMDGPU::V_CMP_LT_F32_e64;
    break;
  case ISD::SETUNE:
    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    break;
  case ISD::SETO:
    Opcode = AMDGPU::V_CMP_O_F32_e64;
    break;
  case ISD::SETUO:
    Opcode = AMDGPU::V_CMP_U_F32_e64;
    break;
  case ISD::SETOEQ:
  case ISD::SETEQ:
    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    break;
  case ISD::SETOGT:
  case ISD::SETGT:
    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    break;
  case ISD::SETOGE:
  case ISD::SETGE:
    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    break;
  case ISD::SETOLT:
  case ISD::SETLT:
    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    break;
  case ISD::SETOLE:
  case ISD::SETLE:
    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    break;
  case ISD::SETONE:
  case ISD::SETNE:
    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
    break;
  default:
    llvm_unreachable("invalid ISD:SET cond code");
  }

  // Pick opcode based on comparison type.
  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);

  // VCC represents lanes killed.
  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  if (TRI->isVGPR(*MRI, Op0.getReg())) {
    // VOPC e32 form: the result is written to VCC implicitly.
    // Operands are swapped here as part of the inversion described above.
    Opcode = AMDGPU::getVOPe32(Opcode);
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
  } else {
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(VCC, RegState::Define)
                 .addImm(0) // src0 modifiers
                 .add(Op1)
                 .addImm(0) // src1 modifiers
                 .add(Op0)
                 .addImm(0); // omod
  }

  // LiveMask &= ~KilledLanes
  MachineInstr *MaskUpdateMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
          .addReg(LiveMaskReg)
          .addReg(VCC);

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // EXEC &= ~KilledLanes
  MachineInstr *ExecMaskMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);

  assert(MBB.succ_size() == 1);
  MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                              .addMBB(*MBB.succ_begin());

  // Update live intervals
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);

  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  return NewTerm;
}
916fe6060f1SDimitry Andric
// Lower SI_KILL_I1_TERMINATOR / SI_DEMOTE_I1: update the live lane mask,
// insert an early-terminate check, and rewrite EXEC to deactivate the killed
// lanes (for demote in WQM: quads with only helper lanes left).
// Returns the new terminator to split the block at, or null when the kill
// folded away (e.g. a no-op demote).
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
                                           MachineInstr &MI, bool IsWQM) {
  assert(LiveMaskReg.isVirtual());

  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  // A demote only needs special (WQM-preserving) handling when the block is
  // currently executing in WQM; otherwise it behaves like a plain kill.
  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
    } else {
      // Static: kill does nothing
      MachineInstr *NewTerm = nullptr;
      if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
        // A no-op demote simply disappears; nothing to split at.
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        // Replace the no-op kill terminator with an explicit branch.
        assert(MBB.succ_size() == 1);
        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                      .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return NewTerm;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // In the case we got this far some lanes are still live,
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demote - deactivate quads with only helper lanes
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
  } else {
    // Kill - deactivate lanes no longer in live mask
    if (Op.isImm()) {
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(EarlyTermMI);
  assert(MaskUpdateMI);
  assert(NewTerm);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  // Recompute intervals for any registers whose def/use set changed.
  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}
1029fe6060f1SDimitry Andric
// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;
  if (!BI.NeedsLowering)
    return;

  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");

  SmallVector<MachineInstr *, 4> SplitPoints;
  // Track the execution state at each instruction so kills/demotes know
  // whether they are running in WQM; StateTransition records the state after
  // each state-changing instruction.
  char State = BI.InitialState;

  // early_inc_range: the lowering helpers below remove MI from the block.
  for (MachineInstr &MI : llvm::make_early_inc_range(
           llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
    if (StateTransition.count(&MI))
      State = StateTransition[&MI];

    MachineInstr *SplitPoint = nullptr;
    switch (MI.getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MBB, MI);
      break;
    default:
      break;
    }
    if (SplitPoint)
      SplitPoints.push_back(SplitPoint);
  }

  // Perform splitting after instruction scan to simplify iteration.
  if (!SplitPoints.empty()) {
    MachineBasicBlock *BB = &MBB;
    for (MachineInstr *MI : SplitPoints) {
      BB = splitBlock(BB, MI);
    }
  }
}
1076fe6060f1SDimitry Andric
// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  // If SCC does not need preserving, any point in the range works; honor the
  // caller's preference for the front or the back of the window.
  if (!SaveSCC)
    return PreferLast ? Last : First;

  // Live range of the (single) SCC register unit; used to find gaps where
  // SCC is dead and can be clobbered freely.
  LiveRange &LR =
      LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
  auto MBBE = MBB.end();
  // An end() iterator has no instruction index; use the block-end index.
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  // Walk Idx out of any live SCC segment: backwards to the segment start when
  // PreferLast, forwards past the segment end otherwise. Stop when Idx leaves
  // the [FirstIdx, LastIdx] window (SCC is then live at the chosen point and
  // must be saved explicitly below).
  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  // Map the chosen slot index back to an insertion iterator.
  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->all_defs()) {
      IsExecDef |=
          MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
    }
    if (!IsExecDef)
      break;
    MBBI++;
    S = nullptr; // Past an EXEC def, SCC no longer needs to be preserved.
  }

  // Still inside a live SCC segment: insert save/restore code around the
  // insertion point.
  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}
11480b57cec5SDimitry Andric
toExact(MachineBasicBlock & MBB,MachineBasicBlock::iterator Before,Register SaveWQM)11490b57cec5SDimitry Andric void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
11500b57cec5SDimitry Andric MachineBasicBlock::iterator Before,
1151fe6060f1SDimitry Andric Register SaveWQM) {
1152*0fca6ea1SDimitry Andric assert(LiveMaskReg.isVirtual());
1153*0fca6ea1SDimitry Andric
115406c3fb27SDimitry Andric bool IsTerminator = Before == MBB.end();
115506c3fb27SDimitry Andric if (!IsTerminator) {
115606c3fb27SDimitry Andric auto FirstTerm = MBB.getFirstTerminator();
115706c3fb27SDimitry Andric if (FirstTerm != MBB.end()) {
115806c3fb27SDimitry Andric SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
115906c3fb27SDimitry Andric SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
116006c3fb27SDimitry Andric IsTerminator = BeforeIdx > FirstTermIdx;
116106c3fb27SDimitry Andric }
116206c3fb27SDimitry Andric }
116306c3fb27SDimitry Andric
11640b57cec5SDimitry Andric MachineInstr *MI;
11650b57cec5SDimitry Andric
11660b57cec5SDimitry Andric if (SaveWQM) {
116706c3fb27SDimitry Andric unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
116806c3fb27SDimitry Andric MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
11690b57cec5SDimitry Andric .addReg(LiveMaskReg);
11700b57cec5SDimitry Andric } else {
117106c3fb27SDimitry Andric unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
117206c3fb27SDimitry Andric MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
11730b57cec5SDimitry Andric .addReg(Exec)
11740b57cec5SDimitry Andric .addReg(LiveMaskReg);
11750b57cec5SDimitry Andric }
11760b57cec5SDimitry Andric
11770b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*MI);
1178fe6060f1SDimitry Andric StateTransition[MI] = StateExact;
11790b57cec5SDimitry Andric }
11800b57cec5SDimitry Andric
toWQM(MachineBasicBlock & MBB,MachineBasicBlock::iterator Before,Register SavedWQM)11810b57cec5SDimitry Andric void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
11820b57cec5SDimitry Andric MachineBasicBlock::iterator Before,
1183fe6060f1SDimitry Andric Register SavedWQM) {
11840b57cec5SDimitry Andric MachineInstr *MI;
11850b57cec5SDimitry Andric
11860b57cec5SDimitry Andric if (SavedWQM) {
11870b57cec5SDimitry Andric MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
11880b57cec5SDimitry Andric .addReg(SavedWQM);
11890b57cec5SDimitry Andric } else {
1190fe6060f1SDimitry Andric MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
11910b57cec5SDimitry Andric }
11920b57cec5SDimitry Andric
11930b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*MI);
1194fe6060f1SDimitry Andric StateTransition[MI] = StateWQM;
11950b57cec5SDimitry Andric }
11960b57cec5SDimitry Andric
toStrictMode(MachineBasicBlock & MBB,MachineBasicBlock::iterator Before,Register SaveOrig,char StrictStateNeeded)1197fe6060f1SDimitry Andric void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
11980b57cec5SDimitry Andric MachineBasicBlock::iterator Before,
1199fe6060f1SDimitry Andric Register SaveOrig, char StrictStateNeeded) {
12000b57cec5SDimitry Andric MachineInstr *MI;
12010b57cec5SDimitry Andric assert(SaveOrig);
1202fe6060f1SDimitry Andric assert(StrictStateNeeded == StateStrictWWM ||
1203fe6060f1SDimitry Andric StrictStateNeeded == StateStrictWQM);
1204fe6060f1SDimitry Andric
1205fe6060f1SDimitry Andric if (StrictStateNeeded == StateStrictWWM) {
1206fe6060f1SDimitry Andric MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1207fe6060f1SDimitry Andric SaveOrig)
12080b57cec5SDimitry Andric .addImm(-1);
1209fe6060f1SDimitry Andric } else {
1210fe6060f1SDimitry Andric MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1211fe6060f1SDimitry Andric SaveOrig)
1212fe6060f1SDimitry Andric .addImm(-1);
1213fe6060f1SDimitry Andric }
12140b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*MI);
1215bdd1243dSDimitry Andric StateTransition[MI] = StrictStateNeeded;
12160b57cec5SDimitry Andric }
12170b57cec5SDimitry Andric
fromStrictMode(MachineBasicBlock & MBB,MachineBasicBlock::iterator Before,Register SavedOrig,char NonStrictState,char CurrentStrictState)1218fe6060f1SDimitry Andric void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
12190b57cec5SDimitry Andric MachineBasicBlock::iterator Before,
1220fe6060f1SDimitry Andric Register SavedOrig, char NonStrictState,
1221fe6060f1SDimitry Andric char CurrentStrictState) {
12220b57cec5SDimitry Andric MachineInstr *MI;
12230b57cec5SDimitry Andric
12240b57cec5SDimitry Andric assert(SavedOrig);
1225fe6060f1SDimitry Andric assert(CurrentStrictState == StateStrictWWM ||
1226fe6060f1SDimitry Andric CurrentStrictState == StateStrictWQM);
1227fe6060f1SDimitry Andric
1228fe6060f1SDimitry Andric if (CurrentStrictState == StateStrictWWM) {
1229fe6060f1SDimitry Andric MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1230fe6060f1SDimitry Andric Exec)
12310b57cec5SDimitry Andric .addReg(SavedOrig);
1232fe6060f1SDimitry Andric } else {
1233fe6060f1SDimitry Andric MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1234fe6060f1SDimitry Andric Exec)
1235fe6060f1SDimitry Andric .addReg(SavedOrig);
1236fe6060f1SDimitry Andric }
12370b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*MI);
1238fe6060f1SDimitry Andric StateTransition[MI] = NonStrictState;
12390b57cec5SDimitry Andric }
12400b57cec5SDimitry Andric
// Walk one basic block and insert the EXEC-manipulation instructions needed
// to realize the per-instruction state requirements (Exact / WQM / strict
// WWM / strict WQM) computed by the analysis phase.
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  Register SavedWQMReg;       // Holds pre-Exact EXEC while in Exact mode.
  Register SavedNonStrictReg; // Holds pre-Strict EXEC while in Strict mode.
  bool WQMFromExec = IsEntry; // In the entry block WQM is derived from EXEC.
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;    // State to return to when leaving Strict mode.
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY &&
        II->getOperand(1).getReg() == TRI->getExec())
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from Strict
  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
  // be safe to switch to/from WQM as well.
  MachineBasicBlock::iterator FirstStrict = IE;

  // Record initial state in block information.
  BI.InitialState = State;

  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstStrict == IE)
      FirstStrict = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          // Strict requirements take precedence, then WQM; otherwise mask
          // out whatever states the analysis explicitly disabled.
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      // Exact mode exit can occur in terminators, but must be before branches.
      if (MI.isBranch() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from Strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC
        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
        // Exact/Strict -> Exact: no save
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
        break;
      }
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);

      // Leave Strict mode first (if active) before any other transition.
      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        // Entering Strict mode: remember the state to return to and save
        // the current EXEC in a fresh virtual register.
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;

      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          // Only save the WQM exec mask if WQM will be needed again later
          // and it cannot simply be recomputed from EXEC.
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, but we
          // shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    // Once an instruction restricts the allowed states, later transitions
    // may not be hoisted above it: reset the safe-switch points.
    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  // All saved exec masks must have been restored by the end of the block.
  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}
14320b57cec5SDimitry Andric
lowerLiveMaskQueries()1433*0fca6ea1SDimitry Andric bool SIWholeQuadMode::lowerLiveMaskQueries() {
14340b57cec5SDimitry Andric for (MachineInstr *MI : LiveMaskQueries) {
14350b57cec5SDimitry Andric const DebugLoc &DL = MI->getDebugLoc();
14368bcb0991SDimitry Andric Register Dest = MI->getOperand(0).getReg();
1437e8d8bef9SDimitry Andric
14380b57cec5SDimitry Andric MachineInstr *Copy =
14390b57cec5SDimitry Andric BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
14400b57cec5SDimitry Andric .addReg(LiveMaskReg);
14410b57cec5SDimitry Andric
14420b57cec5SDimitry Andric LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
14430b57cec5SDimitry Andric MI->eraseFromParent();
14440b57cec5SDimitry Andric }
1445*0fca6ea1SDimitry Andric return !LiveMaskQueries.empty();
14460b57cec5SDimitry Andric }
14470b57cec5SDimitry Andric
// Lower the copy-like pseudos collected during analysis: LowerToMovInstrs
// become real mov instructions, LowerToCopyInstrs become COPYs (or movs for
// immediate sources). Returns true if anything was lowered.
bool SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();

    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      // VGPR destination: rewrite to the appropriate VALU mov.
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec (like all VALU movs
      // should do).
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        // Recompute the interval after clearing the flag so LIS stays valid.
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      // Strip every implicit EXEC use; indexes shift after each removal, so
      // re-query from scratch each iteration.
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << " -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // the only reason we should be here is V_SET_INACTIVE has
      // an undef input so it is being replaced by a simple copy.
      // There should be a second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->removeOperand(2);
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    // Register sources become COPYs; non-register (e.g. immediate) sources
    // need a real mov of the destination's register class.
    unsigned CopyOp = MI->getOperand(1).isReg()
                          ? (unsigned)AMDGPU::COPY
                          : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                *MRI, MI->getOperand(0)));
    MI->setDesc(TII->get(CopyOp));
  }
  return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
}
15050b57cec5SDimitry Andric
lowerKillInstrs(bool IsWQM)1506*0fca6ea1SDimitry Andric bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1507fe6060f1SDimitry Andric for (MachineInstr *MI : KillInstrs) {
1508fe6060f1SDimitry Andric MachineBasicBlock *MBB = MI->getParent();
1509fe6060f1SDimitry Andric MachineInstr *SplitPoint = nullptr;
1510fe6060f1SDimitry Andric switch (MI->getOpcode()) {
1511fe6060f1SDimitry Andric case AMDGPU::SI_DEMOTE_I1:
1512fe6060f1SDimitry Andric case AMDGPU::SI_KILL_I1_TERMINATOR:
1513fe6060f1SDimitry Andric SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1514fe6060f1SDimitry Andric break;
1515fe6060f1SDimitry Andric case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1516fe6060f1SDimitry Andric SplitPoint = lowerKillF32(*MBB, *MI);
1517fe6060f1SDimitry Andric break;
1518fe6060f1SDimitry Andric }
1519fe6060f1SDimitry Andric if (SplitPoint)
1520fe6060f1SDimitry Andric splitBlock(MBB, SplitPoint);
1521fe6060f1SDimitry Andric }
1522*0fca6ea1SDimitry Andric return !KillInstrs.empty();
1523*0fca6ea1SDimitry Andric }
1524*0fca6ea1SDimitry Andric
// Lower a SI_INIT_EXEC or SI_INIT_EXEC_FROM_INPUT-style pseudo at the top of
// its block: either set EXEC to an immediate mask, or derive it from a
// thread-count value held in an SGPR input.
void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();
  bool IsWave32 = ST->isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    MachineInstr *InitMI =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
                Exec)
            .addImm(MI.getOperand(0).getImm());
    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*InitMI);
    }
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If the `InputReg` is defined in current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If first instruction is definition then move pointer after it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST->getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  // BFE operand encodes {offset, width}: width 7 bits (0x70000), offset from
  // the pseudo's second operand, clamped by Mask.
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  // Update the slot-index maps: remove the pseudo, register the new sequence.
  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  // InputReg's uses changed (and it may have been moved); recompute its
  // interval, and compute one for the newly created CountReg.
  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}
1610*0fca6ea1SDimitry Andric
1611*0fca6ea1SDimitry Andric /// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
1612*0fca6ea1SDimitry Andric /// for instructions that depend on EXEC.
1613*0fca6ea1SDimitry Andric MachineBasicBlock::iterator
lowerInitExecInstrs(MachineBasicBlock & Entry,bool & Changed)1614*0fca6ea1SDimitry Andric SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
1615*0fca6ea1SDimitry Andric MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
1616*0fca6ea1SDimitry Andric
1617*0fca6ea1SDimitry Andric for (MachineInstr *MI : InitExecInstrs) {
1618*0fca6ea1SDimitry Andric // Try to handle undefined cases gracefully:
1619*0fca6ea1SDimitry Andric // - multiple INIT_EXEC instructions
1620*0fca6ea1SDimitry Andric // - INIT_EXEC instructions not in the entry block
1621*0fca6ea1SDimitry Andric if (MI->getParent() == &Entry)
1622*0fca6ea1SDimitry Andric InsertPt = std::next(MI->getIterator());
1623*0fca6ea1SDimitry Andric
1624*0fca6ea1SDimitry Andric lowerInitExec(*MI);
1625*0fca6ea1SDimitry Andric Changed = true;
1626*0fca6ea1SDimitry Andric }
1627*0fca6ea1SDimitry Andric
1628*0fca6ea1SDimitry Andric return InsertPt;
1629fe6060f1SDimitry Andric }
1630fe6060f1SDimitry Andric
// Pass entry point. Analyzes which EXEC mode each instruction needs, then
// inserts the required mode switches, live-mask bookkeeping, and lowered
// kill / INIT_EXEC sequences, keeping LiveIntervals up to date throughout.
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  // Reset all per-function state; the pass object is reused across functions.
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  InitExecInstrs.clear();
  StateTransition.clear();

  // Cache subtarget-derived handles used throughout the pass.
  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  // The (post-)dominator trees are optional analyses: they may be absent, so
  // MDT/PDT can be null and downstream users must tolerate that.
  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
  MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
  auto *PDTWrapper =
      getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
  PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;

  // Select the wave-size-specific EXEC-manipulation opcodes and EXEC register
  // once, so the rest of the pass is wave32/wave64 agnostic.
  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndTermOpc = AMDGPU::S_AND_B32_term;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndTermOpc = AMDGPU::S_AND_B64_term;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  // Bitmask of State* flags describing every exec mode the function needs.
  const char GlobalFlags = analyzeFunction(MF);
  bool Changed = false;

  // Until we decide to snapshot it below, the live mask is simply EXEC.
  LiveMaskReg = Exec;

  MachineBasicBlock &Entry = MF.front();
  // Lower INIT_EXEC pseudos first; the returned iterator points past any
  // entry-block INIT_EXEC so that prolog code inserted below follows it.
  MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);

  // Store a copy of the original live mask when required
  const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
  const bool HasWaveModes = GlobalFlags & ~StateExact;
  const bool HasKills = !KillInstrs.empty();
  const bool UsesWQM = GlobalFlags & StateWQM;
  if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
    // EXEC will be rewritten below, so preserve the incoming live mask in a
    // virtual register and register the copy with LiveIntervals.
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    Changed = true;
  }

  LLVM_DEBUG(printInfo());

  Changed |= lowerLiveMaskQueries();
  Changed |= lowerCopyInstrs();

  if (!HasWaveModes) {
    // No wave mode execution
    Changed |= lowerKillInstrs(false);
  } else if (GlobalFlags == StateWQM) {
    // Shader only needs WQM
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
    Changed = true;
  } else {
    // Wave mode switching requires full lowering pass.
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
    Changed = true;
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);

  // If we performed any kills then recompute EXEC
  if (!KillInstrs.empty() || !InitExecInstrs.empty())
    LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);

  return Changed;
}
1739