//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode (strict or non-strict)
/// for pixel shaders, and strict whole wavefront mode for all programs.
///
/// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
/// always be enabled irrespective of control flow decisions. Conversely in
/// non-strict WQM inactive lanes may control flow decisions.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). It ensures that WQM is
/// enabled when necessary, but disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires strict whole
/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
/// we use a similar save and restore mechanism and force whole quad mode for
/// those instructions:
///
///   S_MOV_B64 Tmp, EXEC
///   S_WQM_B64 EXEC, EXEC
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//
69
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
84
85 using namespace llvm;
86
87 #define DEBUG_TYPE "si-wqm"
88
89 namespace {
90
91 enum {
92 StateWQM = 0x1,
93 StateStrictWWM = 0x2,
94 StateStrictWQM = 0x4,
95 StateExact = 0x8,
96 StateStrict = StateStrictWWM | StateStrictWQM,
97 };
98
99 struct PrintState {
100 public:
101 int State;
102
PrintState__anonf56fbe7e0111::PrintState103 explicit PrintState(int State) : State(State) {}
104 };
105
106 #ifndef NDEBUG
operator <<(raw_ostream & OS,const PrintState & PS)107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108
109 static const std::pair<char, const char *> Mapping[] = {
110 std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111 std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112 char State = PS.State;
113 for (auto M : Mapping) {
114 if (State & M.first) {
115 OS << M.second;
116 State &= ~M.first;
117
118 if (State)
119 OS << '|';
120 }
121 }
122 assert(State == 0);
123 return OS;
124 }
125 #endif
126
127 struct InstrInfo {
128 char Needs = 0;
129 char Disabled = 0;
130 char OutNeeds = 0;
131 };
132
133 struct BlockInfo {
134 char Needs = 0;
135 char InNeeds = 0;
136 char OutNeeds = 0;
137 char InitialState = 0;
138 bool NeedsLowering = false;
139 };
140
141 struct WorkItem {
142 MachineBasicBlock *MBB = nullptr;
143 MachineInstr *MI = nullptr;
144
145 WorkItem() = default;
WorkItem__anonf56fbe7e0111::WorkItem146 WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
WorkItem__anonf56fbe7e0111::WorkItem147 WorkItem(MachineInstr *MI) : MI(MI) {}
148 };
149
150 class SIWholeQuadMode : public MachineFunctionPass {
151 private:
152 const SIInstrInfo *TII;
153 const SIRegisterInfo *TRI;
154 const GCNSubtarget *ST;
155 MachineRegisterInfo *MRI;
156 LiveIntervals *LIS;
157 MachineDominatorTree *MDT;
158 MachinePostDominatorTree *PDT;
159
160 unsigned AndOpc;
161 unsigned AndTermOpc;
162 unsigned AndN2Opc;
163 unsigned XorOpc;
164 unsigned AndSaveExecOpc;
165 unsigned AndSaveExecTermOpc;
166 unsigned WQMOpc;
167 Register Exec;
168 Register LiveMaskReg;
169
170 DenseMap<const MachineInstr *, InstrInfo> Instructions;
171 MapVector<MachineBasicBlock *, BlockInfo> Blocks;
172
173 // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
174 DenseMap<const MachineInstr *, char> StateTransition;
175
176 SmallVector<MachineInstr *, 2> LiveMaskQueries;
177 SmallVector<MachineInstr *, 4> LowerToMovInstrs;
178 SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
179 SmallVector<MachineInstr *, 4> KillInstrs;
180 SmallVector<MachineInstr *, 4> InitExecInstrs;
181
182 void printInfo();
183
184 void markInstruction(MachineInstr &MI, char Flag,
185 std::vector<WorkItem> &Worklist);
186 void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
187 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
188 void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
189 std::vector<WorkItem> &Worklist);
190 void markInstructionUses(const MachineInstr &MI, char Flag,
191 std::vector<WorkItem> &Worklist);
192 char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
193 void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
194 void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
195 char analyzeFunction(MachineFunction &MF);
196
197 MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
198 MachineBasicBlock::iterator Before);
199 MachineBasicBlock::iterator
200 prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
201 MachineBasicBlock::iterator Last, bool PreferLast,
202 bool SaveSCC);
203 void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
204 Register SaveWQM);
205 void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206 Register SavedWQM);
207 void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
208 Register SaveOrig, char StrictStateNeeded);
209 void fromStrictMode(MachineBasicBlock &MBB,
210 MachineBasicBlock::iterator Before, Register SavedOrig,
211 char NonStrictState, char CurrentStrictState);
212
213 MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
214
215 MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
216 bool IsWQM);
217 MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
218
219 void lowerBlock(MachineBasicBlock &MBB);
220 void processBlock(MachineBasicBlock &MBB, bool IsEntry);
221
222 bool lowerLiveMaskQueries();
223 bool lowerCopyInstrs();
224 bool lowerKillInstrs(bool IsWQM);
225 void lowerInitExec(MachineInstr &MI);
226 MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
227 bool &Changed);
228
229 public:
230 static char ID;
231
SIWholeQuadMode()232 SIWholeQuadMode() :
233 MachineFunctionPass(ID) { }
234
235 bool runOnMachineFunction(MachineFunction &MF) override;
236
getPassName() const237 StringRef getPassName() const override { return "SI Whole Quad Mode"; }
238
getAnalysisUsage(AnalysisUsage & AU) const239 void getAnalysisUsage(AnalysisUsage &AU) const override {
240 AU.addRequired<LiveIntervalsWrapperPass>();
241 AU.addPreserved<SlotIndexesWrapperPass>();
242 AU.addPreserved<LiveIntervalsWrapperPass>();
243 AU.addPreserved<MachineDominatorTreeWrapperPass>();
244 AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
245 MachineFunctionPass::getAnalysisUsage(AU);
246 }
247
getClearedProperties() const248 MachineFunctionProperties getClearedProperties() const override {
249 return MachineFunctionProperties().set(
250 MachineFunctionProperties::Property::IsSSA);
251 }
252 };
253
254 } // end anonymous namespace
255
256 char SIWholeQuadMode::ID = 0;
257
258 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
259 false)
260 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
261 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
262 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
263 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
264 false)
265
266 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
267
createSIWholeQuadModePass()268 FunctionPass *llvm::createSIWholeQuadModePass() {
269 return new SIWholeQuadMode;
270 }
271
272 #ifndef NDEBUG
printInfo()273 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
274 for (const auto &BII : Blocks) {
275 dbgs() << "\n"
276 << printMBBReference(*BII.first) << ":\n"
277 << " InNeeds = " << PrintState(BII.second.InNeeds)
278 << ", Needs = " << PrintState(BII.second.Needs)
279 << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
280
281 for (const MachineInstr &MI : *BII.first) {
282 auto III = Instructions.find(&MI);
283 if (III != Instructions.end()) {
284 dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
285 << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
286 }
287 }
288 }
289 }
290 #endif
291
markInstruction(MachineInstr & MI,char Flag,std::vector<WorkItem> & Worklist)292 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
293 std::vector<WorkItem> &Worklist) {
294 InstrInfo &II = Instructions[&MI];
295
296 assert(!(Flag & StateExact) && Flag != 0);
297
298 // Remove any disabled states from the flag. The user that required it gets
299 // an undefined value in the helper lanes. For example, this can happen if
300 // the result of an atomic is used by instruction that requires WQM, where
301 // ignoring the request for WQM is correct as per the relevant specs.
302 Flag &= ~II.Disabled;
303
304 // Ignore if the flag is already encompassed by the existing needs, or we
305 // just disabled everything.
306 if ((II.Needs & Flag) == Flag)
307 return;
308
309 LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
310 II.Needs |= Flag;
311 Worklist.emplace_back(&MI);
312 }
313
314 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
markDefs(const MachineInstr & UseMI,LiveRange & LR,Register Reg,unsigned SubReg,char Flag,std::vector<WorkItem> & Worklist)315 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
316 Register Reg, unsigned SubReg, char Flag,
317 std::vector<WorkItem> &Worklist) {
318 LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
319
320 LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
321 const VNInfo *Value = UseLRQ.valueIn();
322 if (!Value)
323 return;
324
325 // Note: this code assumes that lane masks on AMDGPU completely
326 // cover registers.
327 const LaneBitmask UseLanes =
328 SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
329 : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
330 : LaneBitmask::getNone());
331
332 // Perform a depth-first iteration of the LiveRange graph marking defs.
333 // Stop processing of a given branch when all use lanes have been defined.
334 // The first definition stops processing for a physical register.
335 struct PhiEntry {
336 const VNInfo *Phi;
337 unsigned PredIdx;
338 LaneBitmask DefinedLanes;
339
340 PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
341 : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
342 };
343 using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
344 SmallVector<PhiEntry, 2> PhiStack;
345 SmallSet<VisitKey, 4> Visited;
346 LaneBitmask DefinedLanes;
347 unsigned NextPredIdx = 0; // Only used for processing phi nodes
348 do {
349 const VNInfo *NextValue = nullptr;
350 const VisitKey Key(Value, DefinedLanes);
351
352 if (Visited.insert(Key).second) {
353 // On first visit to a phi then start processing first predecessor
354 NextPredIdx = 0;
355 }
356
357 if (Value->isPHIDef()) {
358 // Each predecessor node in the phi must be processed as a subgraph
359 const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
360 assert(MBB && "Phi-def has no defining MBB");
361
362 // Find next predecessor to process
363 unsigned Idx = NextPredIdx;
364 auto PI = MBB->pred_begin() + Idx;
365 auto PE = MBB->pred_end();
366 for (; PI != PE && !NextValue; ++PI, ++Idx) {
367 if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
368 if (!Visited.count(VisitKey(VN, DefinedLanes)))
369 NextValue = VN;
370 }
371 }
372
373 // If there are more predecessors to process; add phi to stack
374 if (PI != PE)
375 PhiStack.emplace_back(Value, Idx, DefinedLanes);
376 } else {
377 MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
378 assert(MI && "Def has no defining instruction");
379
380 if (Reg.isVirtual()) {
381 // Iterate over all operands to find relevant definitions
382 bool HasDef = false;
383 for (const MachineOperand &Op : MI->all_defs()) {
384 if (Op.getReg() != Reg)
385 continue;
386
387 // Compute lanes defined and overlap with use
388 LaneBitmask OpLanes =
389 Op.isUndef() ? LaneBitmask::getAll()
390 : TRI->getSubRegIndexLaneMask(Op.getSubReg());
391 LaneBitmask Overlap = (UseLanes & OpLanes);
392
393 // Record if this instruction defined any of use
394 HasDef |= Overlap.any();
395
396 // Mark any lanes defined
397 DefinedLanes |= OpLanes;
398 }
399
400 // Check if all lanes of use have been defined
401 if ((DefinedLanes & UseLanes) != UseLanes) {
402 // Definition not complete; need to process input value
403 LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
404 if (const VNInfo *VN = LRQ.valueIn()) {
405 if (!Visited.count(VisitKey(VN, DefinedLanes)))
406 NextValue = VN;
407 }
408 }
409
410 // Only mark the instruction if it defines some part of the use
411 if (HasDef)
412 markInstruction(*MI, Flag, Worklist);
413 } else {
414 // For physical registers simply mark the defining instruction
415 markInstruction(*MI, Flag, Worklist);
416 }
417 }
418
419 if (!NextValue && !PhiStack.empty()) {
420 // Reach end of chain; revert to processing last phi
421 PhiEntry &Entry = PhiStack.back();
422 NextValue = Entry.Phi;
423 NextPredIdx = Entry.PredIdx;
424 DefinedLanes = Entry.DefinedLanes;
425 PhiStack.pop_back();
426 }
427
428 Value = NextValue;
429 } while (Value);
430 }
431
markOperand(const MachineInstr & MI,const MachineOperand & Op,char Flag,std::vector<WorkItem> & Worklist)432 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
433 const MachineOperand &Op, char Flag,
434 std::vector<WorkItem> &Worklist) {
435 assert(Op.isReg());
436 Register Reg = Op.getReg();
437
438 // Ignore some hardware registers
439 switch (Reg) {
440 case AMDGPU::EXEC:
441 case AMDGPU::EXEC_LO:
442 return;
443 default:
444 break;
445 }
446
447 LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
448 << " for " << MI);
449 if (Reg.isVirtual()) {
450 LiveRange &LR = LIS->getInterval(Reg);
451 markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
452 } else {
453 // Handle physical registers that we need to track; this is mostly relevant
454 // for VCC, which can appear as the (implicit) input of a uniform branch,
455 // e.g. when a loop counter is stored in a VGPR.
456 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
457 LiveRange &LR = LIS->getRegUnit(Unit);
458 const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
459 if (Value)
460 markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
461 }
462 }
463 }
464
465 /// Mark all instructions defining the uses in \p MI with \p Flag.
markInstructionUses(const MachineInstr & MI,char Flag,std::vector<WorkItem> & Worklist)466 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
467 std::vector<WorkItem> &Worklist) {
468 LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
469 << MI);
470
471 for (const MachineOperand &Use : MI.all_uses())
472 markOperand(MI, Use, Flag, Worklist);
473 }
474
475 // Scan instructions to determine which ones require an Exact execmask and
476 // which ones seed WQM requirements.
scanInstructions(MachineFunction & MF,std::vector<WorkItem> & Worklist)477 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
478 std::vector<WorkItem> &Worklist) {
479 char GlobalFlags = 0;
480 bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
481 SmallVector<MachineInstr *, 4> SetInactiveInstrs;
482 SmallVector<MachineInstr *, 4> SoftWQMInstrs;
483 bool HasImplicitDerivatives =
484 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
485
486 // We need to visit the basic blocks in reverse post-order so that we visit
487 // defs before uses, in particular so that we don't accidentally mark an
488 // instruction as needing e.g. WQM before visiting it and realizing it needs
489 // WQM disabled.
490 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
491 for (MachineBasicBlock *MBB : RPOT) {
492 BlockInfo &BBI = Blocks[MBB];
493
494 for (MachineInstr &MI : *MBB) {
495 InstrInfo &III = Instructions[&MI];
496 unsigned Opcode = MI.getOpcode();
497 char Flags = 0;
498
499 if (TII->isWQM(Opcode)) {
500 // If LOD is not supported WQM is not needed.
501 // Only generate implicit WQM if implicit derivatives are required.
502 // This avoids inserting unintended WQM if a shader type without
503 // implicit derivatives uses an image sampling instruction.
504 if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
505 // Sampling instructions don't need to produce results for all pixels
506 // in a quad, they just require all inputs of a quad to have been
507 // computed for derivatives.
508 markInstructionUses(MI, StateWQM, Worklist);
509 GlobalFlags |= StateWQM;
510 }
511 } else if (Opcode == AMDGPU::WQM) {
512 // The WQM intrinsic requires its output to have all the helper lanes
513 // correct, so we need it to be in WQM.
514 Flags = StateWQM;
515 LowerToCopyInstrs.push_back(&MI);
516 } else if (Opcode == AMDGPU::SOFT_WQM) {
517 LowerToCopyInstrs.push_back(&MI);
518 SoftWQMInstrs.push_back(&MI);
519 } else if (Opcode == AMDGPU::STRICT_WWM) {
520 // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
521 // it needs to be executed in WQM or Exact so that its copy doesn't
522 // clobber inactive lanes.
523 markInstructionUses(MI, StateStrictWWM, Worklist);
524 GlobalFlags |= StateStrictWWM;
525 LowerToMovInstrs.push_back(&MI);
526 } else if (Opcode == AMDGPU::STRICT_WQM ||
527 TII->isDualSourceBlendEXP(MI)) {
528 // STRICT_WQM is similar to STRICTWWM, but instead of enabling all
529 // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in
530 // quads that have at least one active thread.
531 markInstructionUses(MI, StateStrictWQM, Worklist);
532 GlobalFlags |= StateStrictWQM;
533
534 if (Opcode == AMDGPU::STRICT_WQM) {
535 LowerToMovInstrs.push_back(&MI);
536 } else {
537 // Dual source blend export acts as implicit strict-wqm, its sources
538 // need to be shuffled in strict wqm, but the export itself needs to
539 // run in exact mode.
540 BBI.Needs |= StateExact;
541 if (!(BBI.InNeeds & StateExact)) {
542 BBI.InNeeds |= StateExact;
543 Worklist.emplace_back(MBB);
544 }
545 GlobalFlags |= StateExact;
546 III.Disabled = StateWQM | StateStrict;
547 }
548 } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
549 Opcode == AMDGPU::DS_PARAM_LOAD ||
550 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
551 Opcode == AMDGPU::DS_DIRECT_LOAD) {
552 // Mark these STRICTWQM, but only for the instruction, not its operands.
553 // This avoid unnecessarily marking M0 as requiring WQM.
554 III.Needs |= StateStrictWQM;
555 GlobalFlags |= StateStrictWQM;
556 } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
557 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
558 III.Disabled = StateStrict;
559 MachineOperand &Inactive = MI.getOperand(2);
560 if (Inactive.isReg()) {
561 if (Inactive.isUndef()) {
562 LowerToCopyInstrs.push_back(&MI);
563 } else {
564 markOperand(MI, Inactive, StateStrictWWM, Worklist);
565 }
566 }
567 SetInactiveInstrs.push_back(&MI);
568 } else if (TII->isDisableWQM(MI)) {
569 BBI.Needs |= StateExact;
570 if (!(BBI.InNeeds & StateExact)) {
571 BBI.InNeeds |= StateExact;
572 Worklist.emplace_back(MBB);
573 }
574 GlobalFlags |= StateExact;
575 III.Disabled = StateWQM | StateStrict;
576 } else if (Opcode == AMDGPU::SI_PS_LIVE ||
577 Opcode == AMDGPU::SI_LIVE_MASK) {
578 LiveMaskQueries.push_back(&MI);
579 } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
580 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
581 Opcode == AMDGPU::SI_DEMOTE_I1) {
582 KillInstrs.push_back(&MI);
583 BBI.NeedsLowering = true;
584 } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
585 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
586 InitExecInstrs.push_back(&MI);
587 } else if (WQMOutputs) {
588 // The function is in machine SSA form, which means that physical
589 // VGPRs correspond to shader inputs and outputs. Inputs are
590 // only used, outputs are only defined.
591 // FIXME: is this still valid?
592 for (const MachineOperand &MO : MI.defs()) {
593 Register Reg = MO.getReg();
594 if (Reg.isPhysical() &&
595 TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
596 Flags = StateWQM;
597 break;
598 }
599 }
600 }
601
602 if (Flags) {
603 markInstruction(MI, Flags, Worklist);
604 GlobalFlags |= Flags;
605 }
606 }
607 }
608
609 // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is
610 // ever used anywhere in the function. This implements the corresponding
611 // semantics of @llvm.amdgcn.set.inactive.
612 // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
613 if (GlobalFlags & StateWQM) {
614 for (MachineInstr *MI : SetInactiveInstrs)
615 markInstruction(*MI, StateWQM, Worklist);
616 for (MachineInstr *MI : SoftWQMInstrs)
617 markInstruction(*MI, StateWQM, Worklist);
618 }
619
620 return GlobalFlags;
621 }
622
propagateInstruction(MachineInstr & MI,std::vector<WorkItem> & Worklist)623 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
624 std::vector<WorkItem>& Worklist) {
625 MachineBasicBlock *MBB = MI.getParent();
626 InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
627 BlockInfo &BI = Blocks[MBB];
628
629 // Control flow-type instructions and stores to temporary memory that are
630 // followed by WQM computations must themselves be in WQM.
631 if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
632 (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
633 Instructions[&MI].Needs = StateWQM;
634 II.Needs = StateWQM;
635 }
636
637 // Propagate to block level
638 if (II.Needs & StateWQM) {
639 BI.Needs |= StateWQM;
640 if (!(BI.InNeeds & StateWQM)) {
641 BI.InNeeds |= StateWQM;
642 Worklist.emplace_back(MBB);
643 }
644 }
645
646 // Propagate backwards within block
647 if (MachineInstr *PrevMI = MI.getPrevNode()) {
648 char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
649 if (!PrevMI->isPHI()) {
650 InstrInfo &PrevII = Instructions[PrevMI];
651 if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
652 PrevII.OutNeeds |= InNeeds;
653 Worklist.emplace_back(PrevMI);
654 }
655 }
656 }
657
658 // Propagate WQM flag to instruction inputs
659 assert(!(II.Needs & StateExact));
660
661 if (II.Needs != 0)
662 markInstructionUses(MI, II.Needs, Worklist);
663
664 // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
665 // not require any WQM transitions.
666 if (II.Needs & StateStrictWWM)
667 BI.Needs |= StateStrictWWM;
668 if (II.Needs & StateStrictWQM)
669 BI.Needs |= StateStrictWQM;
670 }
671
propagateBlock(MachineBasicBlock & MBB,std::vector<WorkItem> & Worklist)672 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
673 std::vector<WorkItem>& Worklist) {
674 BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
675
676 // Propagate through instructions
677 if (!MBB.empty()) {
678 MachineInstr *LastMI = &*MBB.rbegin();
679 InstrInfo &LastII = Instructions[LastMI];
680 if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
681 LastII.OutNeeds |= BI.OutNeeds;
682 Worklist.emplace_back(LastMI);
683 }
684 }
685
686 // Predecessor blocks must provide for our WQM/Exact needs.
687 for (MachineBasicBlock *Pred : MBB.predecessors()) {
688 BlockInfo &PredBI = Blocks[Pred];
689 if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
690 continue;
691
692 PredBI.OutNeeds |= BI.InNeeds;
693 PredBI.InNeeds |= BI.InNeeds;
694 Worklist.emplace_back(Pred);
695 }
696
697 // All successors must be prepared to accept the same set of WQM/Exact data.
698 for (MachineBasicBlock *Succ : MBB.successors()) {
699 BlockInfo &SuccBI = Blocks[Succ];
700 if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
701 continue;
702
703 SuccBI.InNeeds |= BI.OutNeeds;
704 Worklist.emplace_back(Succ);
705 }
706 }
707
analyzeFunction(MachineFunction & MF)708 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
709 std::vector<WorkItem> Worklist;
710 char GlobalFlags = scanInstructions(MF, Worklist);
711
712 while (!Worklist.empty()) {
713 WorkItem WI = Worklist.back();
714 Worklist.pop_back();
715
716 if (WI.MI)
717 propagateInstruction(*WI.MI, Worklist);
718 else
719 propagateBlock(*WI.MBB, Worklist);
720 }
721
722 return GlobalFlags;
723 }
724
725 MachineBasicBlock::iterator
saveSCC(MachineBasicBlock & MBB,MachineBasicBlock::iterator Before)726 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
727 MachineBasicBlock::iterator Before) {
728 Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
729
730 MachineInstr *Save =
731 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
732 .addReg(AMDGPU::SCC);
733 MachineInstr *Restore =
734 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
735 .addReg(SaveReg);
736
737 LIS->InsertMachineInstrInMaps(*Save);
738 LIS->InsertMachineInstrInMaps(*Restore);
739 LIS->createAndComputeVirtRegInterval(SaveReg);
740
741 return Restore;
742 }
743
splitBlock(MachineBasicBlock * BB,MachineInstr * TermMI)744 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
745 MachineInstr *TermMI) {
746 LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
747 << *TermMI << "\n");
748
749 MachineBasicBlock *SplitBB =
750 BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
751
752 // Convert last instruction in block to a terminator.
753 // Note: this only covers the expected patterns
754 unsigned NewOpcode = 0;
755 switch (TermMI->getOpcode()) {
756 case AMDGPU::S_AND_B32:
757 NewOpcode = AMDGPU::S_AND_B32_term;
758 break;
759 case AMDGPU::S_AND_B64:
760 NewOpcode = AMDGPU::S_AND_B64_term;
761 break;
762 case AMDGPU::S_MOV_B32:
763 NewOpcode = AMDGPU::S_MOV_B32_term;
764 break;
765 case AMDGPU::S_MOV_B64:
766 NewOpcode = AMDGPU::S_MOV_B64_term;
767 break;
768 default:
769 break;
770 }
771 if (NewOpcode)
772 TermMI->setDesc(TII->get(NewOpcode));
773
774 if (SplitBB != BB) {
775 // Update dominator trees
776 using DomTreeT = DomTreeBase<MachineBasicBlock>;
777 SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
778 for (MachineBasicBlock *Succ : SplitBB->successors()) {
779 DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
780 DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
781 }
782 DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
783 if (MDT)
784 MDT->getBase().applyUpdates(DTUpdates);
785 if (PDT)
786 PDT->applyUpdates(DTUpdates);
787
788 // Link blocks
789 MachineInstr *MI =
790 BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
791 .addMBB(SplitBB);
792 LIS->InsertMachineInstrInMaps(*MI);
793 }
794
795 return SplitBB;
796 }
797
lowerKillF32(MachineBasicBlock & MBB,MachineInstr & MI)798 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
799 MachineInstr &MI) {
800 assert(LiveMaskReg.isVirtual());
801
802 const DebugLoc &DL = MI.getDebugLoc();
803 unsigned Opcode = 0;
804
805 assert(MI.getOperand(0).isReg());
806
807 // Comparison is for live lanes; however here we compute the inverse
808 // (killed lanes). This is because VCMP will always generate 0 bits
809 // for inactive lanes so a mask of live lanes would not be correct
810 // inside control flow.
811 // Invert the comparison by swapping the operands and adjusting
812 // the comparison codes.
813
814 switch (MI.getOperand(2).getImm()) {
815 case ISD::SETUEQ:
816 Opcode = AMDGPU::V_CMP_LG_F32_e64;
817 break;
818 case ISD::SETUGT:
819 Opcode = AMDGPU::V_CMP_GE_F32_e64;
820 break;
821 case ISD::SETUGE:
822 Opcode = AMDGPU::V_CMP_GT_F32_e64;
823 break;
824 case ISD::SETULT:
825 Opcode = AMDGPU::V_CMP_LE_F32_e64;
826 break;
827 case ISD::SETULE:
828 Opcode = AMDGPU::V_CMP_LT_F32_e64;
829 break;
830 case ISD::SETUNE:
831 Opcode = AMDGPU::V_CMP_EQ_F32_e64;
832 break;
833 case ISD::SETO:
834 Opcode = AMDGPU::V_CMP_O_F32_e64;
835 break;
836 case ISD::SETUO:
837 Opcode = AMDGPU::V_CMP_U_F32_e64;
838 break;
839 case ISD::SETOEQ:
840 case ISD::SETEQ:
841 Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
842 break;
843 case ISD::SETOGT:
844 case ISD::SETGT:
845 Opcode = AMDGPU::V_CMP_NLT_F32_e64;
846 break;
847 case ISD::SETOGE:
848 case ISD::SETGE:
849 Opcode = AMDGPU::V_CMP_NLE_F32_e64;
850 break;
851 case ISD::SETOLT:
852 case ISD::SETLT:
853 Opcode = AMDGPU::V_CMP_NGT_F32_e64;
854 break;
855 case ISD::SETOLE:
856 case ISD::SETLE:
857 Opcode = AMDGPU::V_CMP_NGE_F32_e64;
858 break;
859 case ISD::SETONE:
860 case ISD::SETNE:
861 Opcode = AMDGPU::V_CMP_NLG_F32_e64;
862 break;
863 default:
864 llvm_unreachable("invalid ISD:SET cond code");
865 }
866
867 // Pick opcode based on comparison type.
868 MachineInstr *VcmpMI;
869 const MachineOperand &Op0 = MI.getOperand(0);
870 const MachineOperand &Op1 = MI.getOperand(1);
871
872 // VCC represents lanes killed.
873 Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
874
875 if (TRI->isVGPR(*MRI, Op0.getReg())) {
876 Opcode = AMDGPU::getVOPe32(Opcode);
877 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
878 } else {
879 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
880 .addReg(VCC, RegState::Define)
881 .addImm(0) // src0 modifiers
882 .add(Op1)
883 .addImm(0) // src1 modifiers
884 .add(Op0)
885 .addImm(0); // omod
886 }
887
888 MachineInstr *MaskUpdateMI =
889 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
890 .addReg(LiveMaskReg)
891 .addReg(VCC);
892
893 // State of SCC represents whether any lanes are live in mask,
894 // if SCC is 0 then no lanes will be alive anymore.
895 MachineInstr *EarlyTermMI =
896 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
897
898 MachineInstr *ExecMaskMI =
899 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
900
901 assert(MBB.succ_size() == 1);
902 MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
903 .addMBB(*MBB.succ_begin());
904
905 // Update live intervals
906 LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
907 MBB.remove(&MI);
908
909 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
910 LIS->InsertMachineInstrInMaps(*ExecMaskMI);
911 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
912 LIS->InsertMachineInstrInMaps(*NewTerm);
913
914 return NewTerm;
915 }
916
// Lower SI_KILL_I1_TERMINATOR / SI_DEMOTE_I1: fold the kill condition into
// the live mask, emit an early-terminate check, and deactivate EXEC lanes.
// Returns a new terminator after which the block must be split, or nullptr
// if no split is required (e.g. a no-op demote).
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
                                           MachineInstr &MI, bool IsWQM) {
  assert(LiveMaskReg.isVirtual());

  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  // A demote only behaves differently from a kill while in WQM.
  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  // KillVal is the condition value for which lanes are killed.
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
    } else {
      // Static: kill does nothing
      MachineInstr *NewTerm = nullptr;
      if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        // A kill terminator still ends the block; replace it with an
        // unconditional branch to the single successor.
        assert(MBB.succ_size() == 1);
        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                      .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return NewTerm;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // In the case we got this far some lanes are still live,
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demote - deactivate quads with only helper lanes
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
  } else {
    // Kill - deactivate lanes no longer in live mask
    if (Op.isImm()) {
      // Static kill of all lanes: clear EXEC entirely.
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      // In Exact mode the live mask is exactly the set of surviving lanes.
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      // In WQM, apply the condition to EXEC directly so helper lanes of
      // surviving quads stay enabled.
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(EarlyTermMI);
  assert(MaskUpdateMI);
  assert(NewTerm);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  // Recompute ranges for registers whose uses/defs changed here.
  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}
1029
1030 // Replace (or supplement) instructions accessing live mask.
1031 // This can only happen once all the live mask registers have been created
1032 // and the execute state (WQM/StrictWWM/Exact) of instructions is known.
lowerBlock(MachineBasicBlock & MBB)1033 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1034 auto BII = Blocks.find(&MBB);
1035 if (BII == Blocks.end())
1036 return;
1037
1038 const BlockInfo &BI = BII->second;
1039 if (!BI.NeedsLowering)
1040 return;
1041
1042 LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1043
1044 SmallVector<MachineInstr *, 4> SplitPoints;
1045 char State = BI.InitialState;
1046
1047 for (MachineInstr &MI : llvm::make_early_inc_range(
1048 llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1049 if (StateTransition.count(&MI))
1050 State = StateTransition[&MI];
1051
1052 MachineInstr *SplitPoint = nullptr;
1053 switch (MI.getOpcode()) {
1054 case AMDGPU::SI_DEMOTE_I1:
1055 case AMDGPU::SI_KILL_I1_TERMINATOR:
1056 SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1057 break;
1058 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1059 SplitPoint = lowerKillF32(MBB, MI);
1060 break;
1061 default:
1062 break;
1063 }
1064 if (SplitPoint)
1065 SplitPoints.push_back(SplitPoint);
1066 }
1067
1068 // Perform splitting after instruction scan to simplify iteration.
1069 if (!SplitPoints.empty()) {
1070 MachineBasicBlock *BB = &MBB;
1071 for (MachineInstr *MI : SplitPoints) {
1072 BB = splitBlock(BB, MI);
1073 }
1074 }
1075 }
1076
1077 // Return an iterator in the (inclusive) range [First, Last] at which
1078 // instructions can be safely inserted, keeping in mind that some of the
1079 // instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  // Walk the live range of SCC to find a point in [First, Last] where SCC
  // is dead, so the inserted exec-mask instructions do not clobber a live
  // value.
  LiveRange &LR =
      LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
  auto MBBE = MBB.end();
  // An end() iterator maps to the block's end slot index.
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  // Step Idx backwards (PreferLast) or forwards out of any segment where SCC
  // is live, stopping if we would leave the [First, Last] range.
  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->all_defs()) {
      IsExecDef |=
          MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
    }
    if (!IsExecDef)
      break;
    MBBI++;
    // Skipping past an EXEC writer means SCC no longer needs saving here.
    S = nullptr;
  }

  // If SCC is still live at the chosen point, explicitly save/restore it.
  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}
1148
// Switch from WQM to Exact mode at \p Before. If \p SaveWQM is a valid
// register, the current (WQM) exec mask is saved into it so WQM can be
// restored later; otherwise EXEC is simply ANDed with the live mask.
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              Register SaveWQM) {
  assert(LiveMaskReg.isVirtual());

  // Use the *_term opcode variants when inserting at the block end or after
  // the first terminator, so the machine verifier's terminator ordering
  // rules are respected.
  bool IsTerminator = Before == MBB.end();
  if (!IsTerminator) {
    auto FirstTerm = MBB.getFirstTerminator();
    if (FirstTerm != MBB.end()) {
      SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
      SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
      // Insertion past the first terminator must itself be a terminator.
      IsTerminator = BeforeIdx > FirstTermIdx;
    }
  }

  MachineInstr *MI;

  if (SaveWQM) {
    unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateExact;
}
1180
toWQM(MachineBasicBlock & MBB,MachineBasicBlock::iterator Before,Register SavedWQM)1181 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1182 MachineBasicBlock::iterator Before,
1183 Register SavedWQM) {
1184 MachineInstr *MI;
1185
1186 if (SavedWQM) {
1187 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1188 .addReg(SavedWQM);
1189 } else {
1190 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1191 }
1192
1193 LIS->InsertMachineInstrInMaps(*MI);
1194 StateTransition[MI] = StateWQM;
1195 }
1196
toStrictMode(MachineBasicBlock & MBB,MachineBasicBlock::iterator Before,Register SaveOrig,char StrictStateNeeded)1197 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1198 MachineBasicBlock::iterator Before,
1199 Register SaveOrig, char StrictStateNeeded) {
1200 MachineInstr *MI;
1201 assert(SaveOrig);
1202 assert(StrictStateNeeded == StateStrictWWM ||
1203 StrictStateNeeded == StateStrictWQM);
1204
1205 if (StrictStateNeeded == StateStrictWWM) {
1206 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1207 SaveOrig)
1208 .addImm(-1);
1209 } else {
1210 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1211 SaveOrig)
1212 .addImm(-1);
1213 }
1214 LIS->InsertMachineInstrInMaps(*MI);
1215 StateTransition[MI] = StrictStateNeeded;
1216 }
1217
fromStrictMode(MachineBasicBlock & MBB,MachineBasicBlock::iterator Before,Register SavedOrig,char NonStrictState,char CurrentStrictState)1218 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1219 MachineBasicBlock::iterator Before,
1220 Register SavedOrig, char NonStrictState,
1221 char CurrentStrictState) {
1222 MachineInstr *MI;
1223
1224 assert(SavedOrig);
1225 assert(CurrentStrictState == StateStrictWWM ||
1226 CurrentStrictState == StateStrictWQM);
1227
1228 if (CurrentStrictState == StateStrictWWM) {
1229 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1230 Exec)
1231 .addReg(SavedOrig);
1232 } else {
1233 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1234 Exec)
1235 .addReg(SavedOrig);
1236 }
1237 LIS->InsertMachineInstrInMaps(*MI);
1238 StateTransition[MI] = NonStrictState;
1239 }
1240
// Walk one basic block and insert the exec-mask transitions (Exact / WQM /
// StrictWWM / StrictWQM) required by the per-instruction analysis results.
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  Register SavedWQMReg;
  Register SavedNonStrictReg;
  // In the entry block WQM can be derived directly from EXEC, so no save
  // register is needed for the first WQM transition.
  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY &&
        II->getOperand(1).getReg() == TRI->getExec())
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from Strict
  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
  // be safe to switch to/from WQM as well.
  MachineBasicBlock::iterator FirstStrict = IE;

  // Record initial state in block information.
  BI.InitialState = State;

  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstStrict == IE)
      FirstStrict = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          // Strict requirements take precedence over WQM, which takes
          // precedence over the per-instruction disabled states.
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      // Exact mode exit can occur in terminators, but must be before branches.
      if (MI.isBranch() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from Strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC
        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
        // Exact/Strict -> Exact: no save
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
        break;
      }
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);

      // Leave strict mode first (if active) before any Exact/WQM change.
      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;

      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          // Only save the WQM mask if WQM will be re-entered later and it
          // cannot be recomputed from EXEC.
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, but we
          // shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    // Once a constrained region starts, later transitions may not be
    // hoisted before it.
    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}
1432
lowerLiveMaskQueries()1433 bool SIWholeQuadMode::lowerLiveMaskQueries() {
1434 for (MachineInstr *MI : LiveMaskQueries) {
1435 const DebugLoc &DL = MI->getDebugLoc();
1436 Register Dest = MI->getOperand(0).getReg();
1437
1438 MachineInstr *Copy =
1439 BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1440 .addReg(LiveMaskReg);
1441
1442 LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1443 MI->eraseFromParent();
1444 }
1445 return !LiveMaskQueries.empty();
1446 }
1447
// Lower recorded WQM/WWM pseudo-copies: LowerToMovInstrs become real mov
// instructions (or simplified SGPR copies), LowerToCopyInstrs become plain
// COPY/mov. Returns true if anything was changed.
bool SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();

    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec (like all VALU movs
      // should do).
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        // Recompute the def's live interval after dropping early-clobber,
        // since that changes the def slot.
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      // Strip every implicit EXEC use; indices shift after removal, so
      // re-query until none remain.
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << " -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // the only reason we should be here is V_SET_INACTIVE has
      // an undef input so it is being replaced by a simple copy.
      // There should be a second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->removeOperand(2);
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    // Use a real COPY for register sources; immediates need a mov of the
    // destination's register class.
    unsigned CopyOp = MI->getOperand(1).isReg()
                          ? (unsigned)AMDGPU::COPY
                          : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                *MRI, MI->getOperand(0)));
    MI->setDesc(TII->get(CopyOp));
  }
  return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
}
1505
lowerKillInstrs(bool IsWQM)1506 bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1507 for (MachineInstr *MI : KillInstrs) {
1508 MachineBasicBlock *MBB = MI->getParent();
1509 MachineInstr *SplitPoint = nullptr;
1510 switch (MI->getOpcode()) {
1511 case AMDGPU::SI_DEMOTE_I1:
1512 case AMDGPU::SI_KILL_I1_TERMINATOR:
1513 SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1514 break;
1515 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1516 SplitPoint = lowerKillF32(*MBB, *MI);
1517 break;
1518 }
1519 if (SplitPoint)
1520 splitBlock(MBB, SplitPoint);
1521 }
1522 return !KillInstrs.empty();
1523 }
1524
// Lower SI_INIT_EXEC / SI_INIT_EXEC_FROM_INPUT pseudos into real EXEC setup
// code at the start of the block, updating LiveIntervals when available.
void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();
  bool IsWave32 = ST->isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    MachineInstr *InitMI =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
                Exec)
            .addImm(MI.getOperand(0).getImm());
    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*InitMI);
    }
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If the `InputReg` is defined in current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If first instruction is definition then move pointer after it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST->getWavefrontSize();
  // Mask allows offsets up to twice the wave size; width is fixed at 7 bits
  // (the 0x70000 field of the S_BFE operand).
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  // InputReg's uses changed (the pseudo was removed), so recompute it and
  // register the new CountReg interval.
  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}
1610
1611 /// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
1612 /// for instructions that depend on EXEC.
1613 MachineBasicBlock::iterator
lowerInitExecInstrs(MachineBasicBlock & Entry,bool & Changed)1614 SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
1615 MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
1616
1617 for (MachineInstr *MI : InitExecInstrs) {
1618 // Try to handle undefined cases gracefully:
1619 // - multiple INIT_EXEC instructions
1620 // - INIT_EXEC instructions not in the entry block
1621 if (MI->getParent() == &Entry)
1622 InsertPt = std::next(MI->getIterator());
1623
1624 lowerInitExec(*MI);
1625 Changed = true;
1626 }
1627
1628 return InsertPt;
1629 }
1630
// Pass entry point: analyze the function, then insert/lower all exec-mask
// related instructions. Returns true if the function was modified.
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  // Reset per-function state (the pass object is reused across functions).
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  InitExecInstrs.clear();
  StateTransition.clear();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  // Dominator trees are optional; used only if already computed.
  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
  MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
  auto *PDTWrapper =
      getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
  PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;

  // Select the wave-size-dependent opcodes and the EXEC register once.
  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndTermOpc = AMDGPU::S_AND_B32_term;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndTermOpc = AMDGPU::S_AND_B64_term;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  const char GlobalFlags = analyzeFunction(MF);
  bool Changed = false;

  LiveMaskReg = Exec;

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);

  // Store a copy of the original live mask when required
  const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
  const bool HasWaveModes = GlobalFlags & ~StateExact;
  const bool HasKills = !KillInstrs.empty();
  const bool UsesWQM = GlobalFlags & StateWQM;
  if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    Changed = true;
  }

  LLVM_DEBUG(printInfo());

  Changed |= lowerLiveMaskQueries();
  Changed |= lowerCopyInstrs();

  if (!HasWaveModes) {
    // No wave mode execution
    Changed |= lowerKillInstrs(false);
  } else if (GlobalFlags == StateWQM) {
    // Shader only needs WQM
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
    Changed = true;
  } else {
    // Wave mode switching requires full lowering pass.
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
    Changed = true;
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);

  // If we performed any kills then recompute EXEC
  if (!KillInstrs.empty() || !InitExecInstrs.empty())
    LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);

  return Changed;
}
1739