//===- SIInstrInfo.cpp - SI Instruction Information  ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-instr-info"

#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm::AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
} // namespace llvm::AMDGPU

// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
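// (For example, a test can pass something like -amdgpu-s-branch-bits=5 to
// force the long-branch expansion paths without needing huge blocks of filler
// instructions; the specific value used is test-dependent.)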
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

static cl::opt<bool> Fix16BitCopies(
  "amdgpu-fix-16-bit-physreg-copies",
  cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
  cl::init(true),
  cl::ReallyHidden);

SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
    RI(ST), ST(ST) {
  SchedModel.init(&ST);
}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// Returns true if both nodes have the same value for the given
///        operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
                                      AMDGPU::OpName OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
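  // (For instance, if a named operand is at MachineInstr operand index 2, the
  // matching MachineSDNode operand is at index 1, hence the decrements below.)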
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

static bool canRemat(const MachineInstr &MI) {

  if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
      SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
      SIInstrInfo::isSALU(MI))
    return true;

  if (SIInstrInfo::isSMRD(MI)) {
    return !MI.memoperands_empty() &&
           llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
             return MMO->isLoad() && MMO->isInvariant();
           });
  }

  return false;
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(
    const MachineInstr &MI) const {

  if (canRemat(MI)) {
    // Normally a VALU use of exec would block rematerialization, but an
    // implicit exec read is OK here since all VALU instructions have one.
    // We really want all of the generic logic except for that.

    // Another potential implicit use is the mode register. The core logic of
    // the RA will not attempt rematerialization if mode is set anywhere in
    // the function; otherwise it is safe since mode is not changed.

    // This differs from the generic method, which does not allow
    // rematerialization if there are any virtual register uses. We allow it,
    // which is why this method also covers SOP instructions.
    if (!MI.hasImplicitDef() &&
        MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
        !MI.mayRaiseFPException())
      return true;
  }

  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
}

// Returns true if the scalar result of a VALU instruction depends on exec.
bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
  // Ignore comparisons which are only used masked with exec.
  // This allows some hoisting/sinking of VALU comparisons.
  if (MI.isCompare()) {
    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
    if (!Dst)
      return true;

    Register DstReg = Dst->getReg();
    if (!DstReg.isVirtual())
      return true;

    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
      switch (Use.getOpcode()) {
      case AMDGPU::S_AND_SAVEEXEC_B32:
      case AMDGPU::S_AND_SAVEEXEC_B64:
        break;
      case AMDGPU::S_AND_B32:
      case AMDGPU::S_AND_B64:
        if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
          return true;
        break;
      default:
        return true;
      }
    }
    return false;
  }

  switch (MI.getOpcode()) {
  default:
    break;
  case AMDGPU::V_READFIRSTLANE_B32:
    return true;
  }

  return false;
}

bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
  // Any implicit use of exec by VALU is not a real register read.
  return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
         isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
}

bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
                               MachineBasicBlock *SuccToSinkTo,
                               MachineCycleInfo *CI) const {
  // Allow sinking if MI edits the lane mask (divergent i1 in sgpr).
  if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
    return true;

  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  // Check if sinking of MI would create a temporally divergent use.
  for (auto Op : MI.uses()) {
    if (Op.isReg() && Op.getReg().isVirtual() &&
        RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
      MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());

      // SgprDef is defined inside a cycle.
      MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
      if (FromCycle == nullptr)
        continue;

      MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
      // Check if there is a FromCycle that contains SgprDef's basic block but
      // does not contain SuccToSinkTo and also has a divergent exit condition.
      while (FromCycle && !FromCycle->contains(ToCycle)) {
        SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
        FromCycle->getExitingBlocks(ExitingBlocks);

        // FromCycle has a divergent exit condition.
        for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
          if (hasDivergentBranch(ExitingBlock))
            return false;
        }

        FromCycle = FromCycle->getParentCycle();
      }
    }
  }

  return true;
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  // A mayLoad instruction without a def is not a load. Likely a prefetch.
  if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // st64 versions).
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;

    // XXX - be careful of dataless loads
    // getNamedOperandIdx returns the index for MachineInstrs.  Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 = Load0->getConstantOperandVal(Offset0Idx);
    Offset1 = Load1->getConstantOperandVal(Offset1Idx);
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
        !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
      return false;

    unsigned NumOps = getNumOperandsNoGlue(Load0);
    if (NumOps != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Match register offsets, if both register and immediate offsets are
    // present.
    assert(NumOps == 4 || NumOps == 5);
    if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));

    if (!Load0Offset || !Load1Offset)
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs.  Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    OffIdx0 -= get(Opc0).NumDefs;
    OffIdx1 -= get(Opc1).NumDefs;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = Off0->getAsZExtVal();
    Offset1 = Off1->getAsZExtVal();
    return true;
  }

  return false;
}

static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
    const TargetRegisterInfo *TRI) const {
  if (!LdSt.mayLoadOrStore())
    return false;

  unsigned Opc = LdSt.getOpcode();
  OffsetIsScalable = false;
  const MachineOperand *BaseOp, *OffsetOp;
  int DataOpIdx;

  if (isDS(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetOp) {
      // Normal, single offset LDS instruction.
      if (!BaseOp) {
        // DS_CONSUME/DS_APPEND use M0 for the base address.
        // TODO: find the implicit use operand for M0 and use that as BaseOp?
        return false;
      }
      BaseOps.push_back(BaseOp);
      Offset = OffsetOp->getImm();
      // Get the appropriate operand, and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1)
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
      if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
        Width = LocationSize::precise(64);
      else
        Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    } else {
      // The 2-offset instructions use offset0 and offset1 instead. We can
      // treat these as a load with a single offset if the 2 offsets are
      // consecutive. We will use this for some partially aligned loads.
      const MachineOperand *Offset0Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset0);
      const MachineOperand *Offset1Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset1);

      unsigned Offset0 = Offset0Op->getImm() & 0xff;
      unsigned Offset1 = Offset1Op->getImm() & 0xff;
      if (Offset0 + 1 != Offset1)
        return false;

      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.
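      // (For example, a ds_read2_b32 with offset0 = 2 and offset1 = 3 has a
      // 4-byte element size, so it covers 8 bytes starting at byte offset 8
      // from the base address.)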

      unsigned EltSize;
      if (LdSt.mayLoad())
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
      else {
        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
      }

      if (isStride64(Opc))
        EltSize *= 64;

      BaseOps.push_back(BaseOp);
      Offset = EltSize * Offset0;
      // Get the appropriate operand(s), and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1) {
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
        Width = LocationSize::precise(
            Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
      } else {
        Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
      }
    }
    return true;
  }

  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
    if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
      return false;
    BaseOps.push_back(RSrc);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp && !BaseOp->isFI())
      BaseOps.push_back(BaseOp);
    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetImm->getImm();
    const MachineOperand *SOffset =
        getNamedOperand(LdSt, AMDGPU::OpName::soffset);
    if (SOffset) {
      if (SOffset->isReg())
        BaseOps.push_back(SOffset);
      else
        Offset += SOffset->getImm();
    }
    // Get the appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1) // LDS DMA
      return false;
    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    return true;
  }

  if (isImage(LdSt)) {
    auto RsrcOpName =
        isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
    BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      // GFX10 possible NSA encoding.
      for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
        BaseOps.push_back(&LdSt.getOperand(I));
    } else {
      BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
    }
    Offset = 0;
    // Get the appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1)
      return false; // no return sampler
    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    return true;
  }

  if (isSMRD(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    if (!BaseOp) // e.g. S_MEMTIME
      return false;
    BaseOps.push_back(BaseOp);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetOp ? OffsetOp->getImm() : 0;
    // Get the appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
    if (DataOpIdx == -1)
      return false;
    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    return true;
  }

  if (isFLAT(LdSt)) {
    // Instructions have either vaddr or saddr or both or none.
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    // Get the appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1) // LDS DMA
      return false;
    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    return true;
  }

  return false;
}

static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  ArrayRef<const MachineOperand *> BaseOps1,
                                  const MachineInstr &MI2,
                                  ArrayRef<const MachineOperand *> BaseOps2) {
  // Only examine the first "base" operand of each instruction, on the
  // assumption that it represents the real base address of the memory access.
  // Other operands are typically offsets or indices from this base address.
  if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
    return true;

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    return false;

  auto *MO1 = *MI1.memoperands_begin();
  auto *MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())
    return false;

  const auto *Base1 = MO1->getValue();
  const auto *Base2 = MO2->getValue();
  if (!Base1 || !Base2)
    return false;
  Base1 = getUnderlyingObject(Base1);
  Base2 = getUnderlyingObject(Base2);

  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    return false;

  return Base1 == Base2;
}

bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      int64_t Offset1, bool OffsetIsScalable1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      int64_t Offset2, bool OffsetIsScalable2,
                                      unsigned ClusterSize,
                                      unsigned NumBytes) const {
  // If the mem ops (to be clustered) do not have the same base ptr, then they
  // should not be clustered.
  unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
  if (!BaseOps1.empty() && !BaseOps2.empty()) {
    const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
    const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
    if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
      return false;

    const SIMachineFunctionInfo *MFI =
        FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
    MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
  } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
    // If only one base op is empty, they do not have the same base ptr.
    return false;
  }

  // In order to avoid register pressure, on average, the number of DWORDs
  // loaded together by all clustered mem ops should not exceed
  // MaxMemoryClusterDWords. This is an empirical value based on certain
  // observations and performance related experiments.
  // The good thing about this heuristic is that it avoids clustering too many
  // sub-word loads, and also avoids clustering wide loads. Below is a brief
  // summary of how the heuristic behaves for various `LoadSize`s when
  // MaxMemoryClusterDWords is 8.
  //
  // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
  // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
  // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
  // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
  // (5) LoadSize >= 17: do not cluster
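  //
  // (Worked example, assuming MaxMemoryClusterDWords is 8: four 8-byte loads
  // give LoadSize = 32 / 4 = 8, so NumDWords = ((8 + 3) / 4) * 4 = 8, which
  // still fits; a fifth such load would push NumDWords to 10 and stop the
  // clustering.)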
  const unsigned LoadSize = NumBytes / ClusterSize;
  const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
  return NumDWords <= MaxMemoryClusterDWords;
}

// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into two batches of 16.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to deal with saying different
// address space loads should be clustered, and ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have fewer than 16 loads in a row, and the offsets are within 64
  // bytes, then schedule together.

  // A cacheline is 64 bytes (for global memory).
  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}

static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              const char *Msg = "illegal VGPR to SGPR copy") {
  MachineFunction *MF = MBB.getParent();

  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
}

/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
/// possible to have a direct copy in these cases on GFX908, so an intermediate
/// VGPR copy is required.
static void indirectCopyToAGPR(const SIInstrInfo &TII,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, MCRegister DestReg,
                               MCRegister SrcReg, bool KillSrc,
                               RegScavenger &RS, bool RegsOverlap,
                               Register ImpDefSuperReg = Register(),
                               Register ImpUseSuperReg = Register()) {
  assert((TII.getSubtarget().hasMAIInsts() &&
          !TII.getSubtarget().hasGFX90AInsts()) &&
         "Expected GFX908 subtarget.");

  assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
          AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
         "Source register of the copy should be either an SGPR or an AGPR.");

  assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
         "Destination register of the copy should be an AGPR.");

  const SIRegisterInfo &RI = TII.getRegisterInfo();

  // First try to find a defining accvgpr_write to avoid temporary registers.
  // In the case of copies of overlapping AGPRs, we conservatively do not
  // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
  // an accvgpr_write used for this same copy due to implicit-defs.
  if (!RegsOverlap) {
    for (auto Def = MI, E = MBB.begin(); Def != E; ) {
      --Def;

      if (!Def->modifiesRegister(SrcReg, &RI))
        continue;

      if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          Def->getOperand(0).getReg() != SrcReg)
        break;

      MachineOperand &DefOp = Def->getOperand(1);
      assert(DefOp.isReg() || DefOp.isImm());

      if (DefOp.isReg()) {
        bool SafeToPropagate = true;
        // Check that the register source operand is not clobbered before MI.
        // Immediate operands are always safe to propagate.
        for (auto I = Def; I != MI && SafeToPropagate; ++I)
          if (I->modifiesRegister(DefOp.getReg(), &RI))
            SafeToPropagate = false;

        if (!SafeToPropagate)
          break;

        DefOp.setIsKill(false);
      }

      MachineInstrBuilder Builder =
        BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
        .add(DefOp);
      if (ImpDefSuperReg)
        Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);

      if (ImpUseSuperReg) {
        Builder.addReg(ImpUseSuperReg,
                       getKillRegState(KillSrc) | RegState::Implicit);
      }

      return;
    }
  }

  RS.enterBasicBlockEnd(MBB);
  RS.backward(std::next(MI));

  // Ideally we want to have three registers for a long reg_sequence copy
  // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
  unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                             *MBB.getParent());

  // Registers in the sequence are allocated contiguously so we can just
  // use the register number to pick one of three round-robin temps.
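  // (For example, AGPR0, AGPR1 and AGPR2 map to RegNo 0, 1 and 2 below, so a
  // long copy of consecutive AGPRs rotates through up to three temporaries.)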
  unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
  Register Tmp =
      MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
  assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
         "VGPR used for an intermediate copy should have been reserved.");

  // Only loop through if there are any free registers left. We don't want to
  // spill.
  while (RegNo--) {
    Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
                                                 /* RestoreAfter */ false, 0,
                                                 /* AllowSpill */ false);
    if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
      break;
    Tmp = Tmp2;
    RS.setRegUsed(Tmp);
  }

  // Insert copy to temporary VGPR.
  unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
  if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
    TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
  } else {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
  }

  MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
    .addReg(SrcReg, getKillRegState(KillSrc));
  if (ImpUseSuperReg) {
    UseBuilder.addReg(ImpUseSuperReg,
                      getKillRegState(KillSrc) | RegState::Implicit);
  }

  MachineInstrBuilder DefBuilder
    = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
    .addReg(Tmp, RegState::Kill);

  if (ImpDefSuperReg)
    DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
}

static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MI, const DebugLoc &DL,
                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
                           const TargetRegisterClass *RC, bool Forward) {
  const SIRegisterInfo &RI = TII.getRegisterInfo();
  ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
  MachineBasicBlock::iterator I = MI;
  MachineInstr *FirstMI = nullptr, *LastMI = nullptr;

  for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
    int16_t SubIdx = BaseIndices[Idx];
    Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
    Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
    assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
    unsigned Opcode = AMDGPU::S_MOV_B32;

    // Is the SGPR aligned? If so, try to combine with the next one.
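    // (e.g. an aligned copy of a 128-bit SGPR tuple can be emitted as two
    // S_MOV_B64s instead of four S_MOV_B32s.)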
    bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
    bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
    if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
      // Can use an SGPR64 copy.
      unsigned Channel = RI.getChannelFromSubReg(SubIdx);
      SubIdx = RI.getSubRegFromChannel(Channel, 2);
      DestSubReg = RI.getSubReg(DestReg, SubIdx);
      SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
      assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
      Opcode = AMDGPU::S_MOV_B64;
      Idx++;
    }

    LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
                 .addReg(SrcSubReg)
                 .addReg(SrcReg, RegState::Implicit);

    if (!FirstMI)
      FirstMI = LastMI;

    if (!Forward)
      I--;
  }

  assert(FirstMI && LastMI);
  if (!Forward)
    std::swap(FirstMI, LastMI);

  FirstMI->addOperand(
      MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));

  if (KillSrc)
    LastMI->addRegisterKilled(SrcReg, &RI);
}

void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, Register DestReg,
                              Register SrcReg, bool KillSrc, bool RenamableDest,
                              bool RenamableSrc) const {
  const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
  unsigned Size = RI.getRegSizeInBits(*RC);
  const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
  unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);

  // The rest of copyPhysReg assumes Src and Dst are the same size.
  // TODO-GFX11_16BIT: once all true 16-bit instruction patterns are complete,
  // can we remove Fix16BitCopies and this code block?
  if (Fix16BitCopies) {
    if (((Size == 16) != (SrcSize == 16))) {
      // Non-VGPR Src and Dst will later be expanded back to 32 bits.
      assert(ST.useRealTrue16Insts());
      Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
      MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
      RegToFix = SubReg;

      if (DestReg == SrcReg) {
        // Identity copy. Insert an empty bundle since ExpandPostRA expects an
        // instruction here.
        BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
        return;
      }
      RC = RI.getPhysRegBaseClass(DestReg);
      Size = RI.getRegSizeInBits(*RC);
      SrcRC = RI.getPhysRegBaseClass(SrcReg);
      SrcSize = RI.getRegSizeInBits(*SrcRC);
    }
  }

  if (RC == &AMDGPU::VGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
                     AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
    BuildMI(MBB, MI, DL, get(Opc), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
      RC == &AMDGPU::SReg_32RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (DestReg == AMDGPU::VCC_LO) {
      if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 is removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_64RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 is removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AMDGPU::SCC) {
    // Copying 64-bit or 32-bit sources to SCC barely makes sense,
    // but SelectionDAG emits such copies for i1 sources.
    if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
      // This copy can only be produced by patterns
      // with explicit SCC, which are known to be enabled
      // only for subtargets with S_CMP_LG_U64 present.
      assert(ST.hasScalarCompareEq64());
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0);
    } else {
      assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0);
    }

    return;
  }

  if (RC == &AMDGPU::AGPR_32RegClass) {
    if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
        (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }

    if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }

    // FIXME: Pass should maintain scavenger to avoid scan through the block on
    // every AGPR spill.
    RegScavenger RS;
    const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
    indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS,
                       Overlap);
    return;
  }

  if (Size == 16) {
    assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
           AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_LO16RegClass.contains(SrcReg));

    bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
    bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
    bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
    bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
    MCRegister NewDestReg = RI.get32BitRegister(DestReg);
    MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);

    if (IsSGPRDst) {
      if (!IsSGPRSrc) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
        return;
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
        .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    if (IsAGPRDst || IsAGPRSrc) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg with an AGPR!");
      }

      copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
      return;
    }

    if (ST.useRealTrue16Insts()) {
      if (IsSGPRSrc) {
        assert(SrcLow);
        SrcReg = NewSrcReg;
      }
      // Use the smaller instruction encoding if possible.
      if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
          (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
            .addReg(SrcReg);
      } else {
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
            .addImm(0) // src0_modifiers
            .addReg(SrcReg)
            .addImm(0); // op_sel
      }
      return;
    }

    if (IsSGPRSrc && !ST.hasSDWAScalar()) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg on VI!");
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
        .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
      .addImm(0) // src0_modifiers
      .addReg(NewSrcReg)
      .addImm(0) // clamp
      .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                     : AMDGPU::SDWA::SdwaSel::WORD_1)
      .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
      .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                     : AMDGPU::SDWA::SdwaSel::WORD_1)
      .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
    // First implicit operand is $exec.
    MIB->tieOperands(0, MIB->getNumOperands() - 1);
    return;
  }

  if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
    if (ST.hasMovB64()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }
    if (ST.hasPkMovB32()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
        .addImm(SISrcMods::OP_SEL_1)
        .addReg(SrcReg)
        .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
        .addReg(SrcReg)
        .addImm(0) // op_sel_lo
        .addImm(0) // op_sel_hi
        .addImm(0) // neg_lo
        .addImm(0) // neg_hi
        .addImm(0) // clamp
        .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
      return;
    }
  }

  const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
  if (RI.isSGPRClass(RC)) {
    if (!RI.isSGPRClass(SrcRC)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }
    const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
    expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
                   Forward);
    return;
  }

  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isAGPRClass(RC)) {
    if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
      Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
    else if (RI.hasVGPRs(SrcRC) ||
             (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
      Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
    else
      Opcode = AMDGPU::INSTRUCTION_LIST_END;
  } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
    Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
  } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
             (RI.isProperlyAlignedRC(*RC) &&
              (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
    // TODO: In the 96-bit case, we could do a 64-bit mov and then a 32-bit mov.
    if (ST.hasMovB64()) {
      Opcode = AMDGPU::V_MOV_B64_e32;
      EltSize = 8;
    } else if (ST.hasPkMovB32()) {
      Opcode = AMDGPU::V_PK_MOV_B32;
      EltSize = 8;
    }
  }

  // For the cases where we need an intermediate instruction/temporary register
  // (destination is an AGPR), we need a scavenger.
  //
  // FIXME: The pass should maintain this for us so we don't have to re-scan the
  // whole block for every handled copy.
  std::unique_ptr<RegScavenger> RS;
  if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
    RS = std::make_unique<RegScavenger>();

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);

  // If there is an overlap, we can't kill the super-register on the last
  // instruction, since it will also kill the components made live by this def.
  const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
  const bool CanKillSuperReg = KillSrc && !Overlap;

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];
    Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
    Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
    assert(DestSubReg && SrcSubReg && "Failed to find subregs!");

    bool IsFirstSubreg = Idx == 0;
    bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;

    if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
      Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
      Register ImpUseSuper = SrcReg;
      indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
                         *RS, Overlap, ImpDefSuper, ImpUseSuper);
    } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
      MachineInstrBuilder MIB =
          BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
              .addImm(SISrcMods::OP_SEL_1)
              .addReg(SrcSubReg)
              .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
              .addReg(SrcSubReg)
              .addImm(0) // op_sel_lo
              .addImm(0) // op_sel_hi
              .addImm(0) // neg_lo
              .addImm(0) // neg_hi
              .addImm(0) // clamp
              .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
      if (IsFirstSubreg)
        MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
    } else {
      MachineInstrBuilder Builder =
          BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
      if (IsFirstSubreg)
        Builder.addReg(DestReg, RegState::Define | RegState::Implicit);

      Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
    }
  }
}

int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
  int NewOpc;

  // Try to map original to commuted opcode
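  // (e.g. a V_SUB opcode maps to its commuted V_SUBREV counterpart.)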
1169   NewOpc = AMDGPU::getCommuteRev(Opcode);
1170   if (NewOpc != -1)
1171     // Check if the commuted (REV) opcode exists on the target.
1172     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1173 
1174   // Try to map commuted to original opcode
1175   NewOpc = AMDGPU::getCommuteOrig(Opcode);
1176   if (NewOpc != -1)
1177     // Check if the original (non-REV) opcode exists on the target.
1178     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1179 
1180   return Opcode;
1181 }
1182 
1183 const TargetRegisterClass *
getPreferredSelectRegClass(unsigned Size) const1184 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1185   return &AMDGPU::VGPR_32RegClass;
1186 }
1187 
insertVectorSelect(MachineBasicBlock & MBB,MachineBasicBlock::iterator I,const DebugLoc & DL,Register DstReg,ArrayRef<MachineOperand> Cond,Register TrueReg,Register FalseReg) const1188 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1189                                      MachineBasicBlock::iterator I,
1190                                      const DebugLoc &DL, Register DstReg,
1191                                      ArrayRef<MachineOperand> Cond,
1192                                      Register TrueReg,
1193                                      Register FalseReg) const {
1194   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1195   const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1196   assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1197          "Not a VGPR32 reg");
1198 
1199   if (Cond.size() == 1) {
1200     Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1201     BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1202       .add(Cond[0]);
1203     BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1204       .addImm(0)
1205       .addReg(FalseReg)
1206       .addImm(0)
1207       .addReg(TrueReg)
1208       .addReg(SReg);
1209   } else if (Cond.size() == 2) {
1210     assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1211     switch (Cond[0].getImm()) {
1212     case SIInstrInfo::SCC_TRUE: {
1213       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1214       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1215                                             : AMDGPU::S_CSELECT_B64), SReg)
1216         .addImm(1)
1217         .addImm(0);
1218       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1219         .addImm(0)
1220         .addReg(FalseReg)
1221         .addImm(0)
1222         .addReg(TrueReg)
1223         .addReg(SReg);
1224       break;
1225     }
1226     case SIInstrInfo::SCC_FALSE: {
1227       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1228       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1229                                             : AMDGPU::S_CSELECT_B64), SReg)
1230         .addImm(0)
1231         .addImm(1);
1232       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1233         .addImm(0)
1234         .addReg(FalseReg)
1235         .addImm(0)
1236         .addReg(TrueReg)
1237         .addReg(SReg);
1238       break;
1239     }
1240     case SIInstrInfo::VCCNZ: {
1241       MachineOperand RegOp = Cond[1];
1242       RegOp.setImplicit(false);
1243       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1244       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1245         .add(RegOp);
1246       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1247           .addImm(0)
1248           .addReg(FalseReg)
1249           .addImm(0)
1250           .addReg(TrueReg)
1251           .addReg(SReg);
1252       break;
1253     }
1254     case SIInstrInfo::VCCZ: {
1255       MachineOperand RegOp = Cond[1];
1256       RegOp.setImplicit(false);
1257       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1258       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1259         .add(RegOp);
1260       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1261           .addImm(0)
1262           .addReg(TrueReg)
1263           .addImm(0)
1264           .addReg(FalseReg)
1265           .addReg(SReg);
1266       break;
1267     }
1268     case SIInstrInfo::EXECNZ: {
1269       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1270       Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1271       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1272                                             : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1273         .addImm(0);
1274       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1275                                             : AMDGPU::S_CSELECT_B64), SReg)
1276         .addImm(1)
1277         .addImm(0);
1278       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1279         .addImm(0)
1280         .addReg(FalseReg)
1281         .addImm(0)
1282         .addReg(TrueReg)
1283         .addReg(SReg);
1284       break;
1285     }
1286     case SIInstrInfo::EXECZ: {
1287       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1288       Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1289       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1290                                             : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1291         .addImm(0);
1292       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1293                                             : AMDGPU::S_CSELECT_B64), SReg)
1294         .addImm(0)
1295         .addImm(1);
1296       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1297         .addImm(0)
1298         .addReg(FalseReg)
1299         .addImm(0)
1300         .addReg(TrueReg)
1301         .addReg(SReg);
1302       llvm_unreachable("Unhandled branch predicate EXECZ");
1303       break;
1304     }
1305     default:
1306       llvm_unreachable("invalid branch predicate");
1307     }
1308   } else {
1309     llvm_unreachable("Can only handle Cond size 1 or 2");
1310   }
1311 }
1312 
insertEQ(MachineBasicBlock * MBB,MachineBasicBlock::iterator I,const DebugLoc & DL,Register SrcReg,int Value) const1313 Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1314                                MachineBasicBlock::iterator I,
1315                                const DebugLoc &DL,
1316                                Register SrcReg, int Value) const {
1317   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1318   Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1319   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1320     .addImm(Value)
1321     .addReg(SrcReg);
1322 
1323   return Reg;
1324 }
1325 
insertNE(MachineBasicBlock * MBB,MachineBasicBlock::iterator I,const DebugLoc & DL,Register SrcReg,int Value) const1326 Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1327                                MachineBasicBlock::iterator I,
1328                                const DebugLoc &DL,
1329                                Register SrcReg, int Value) const {
1330   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1331   Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1332   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1333     .addImm(Value)
1334     .addReg(SrcReg);
1335 
1336   return Reg;
1337 }
1338 
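// Return true if MI defines Reg with a compile-time constant, reporting the
// value through ImmVal. Besides plain moves, this also folds the bit-reverse
// (S_BREV/V_BFREV) and bitwise-not (S_NOT/V_NOT) of an immediate source.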
1339 bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1340                                           const Register Reg,
1341                                           int64_t &ImmVal) const {
1342   switch (MI.getOpcode()) {
1343   case AMDGPU::V_MOV_B32_e32:
1344   case AMDGPU::S_MOV_B32:
1345   case AMDGPU::S_MOVK_I32:
1346   case AMDGPU::S_MOV_B64:
1347   case AMDGPU::V_MOV_B64_e32:
1348   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1349   case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1350   case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1351   case AMDGPU::V_MOV_B64_PSEUDO: {
1352     const MachineOperand &Src0 = MI.getOperand(1);
1353     if (Src0.isImm()) {
1354       ImmVal = Src0.getImm();
1355       return MI.getOperand(0).getReg() == Reg;
1356     }
1357 
1358     return false;
1359   }
1360   case AMDGPU::S_BREV_B32:
1361   case AMDGPU::V_BFREV_B32_e32:
1362   case AMDGPU::V_BFREV_B32_e64: {
1363     const MachineOperand &Src0 = MI.getOperand(1);
1364     if (Src0.isImm()) {
1365       ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1366       return MI.getOperand(0).getReg() == Reg;
1367     }
1368 
1369     return false;
1370   }
1371   case AMDGPU::S_NOT_B32:
1372   case AMDGPU::V_NOT_B32_e32:
1373   case AMDGPU::V_NOT_B32_e64: {
1374     const MachineOperand &Src0 = MI.getOperand(1);
1375     if (Src0.isImm()) {
1376       ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1377       return MI.getOperand(0).getReg() == Reg;
1378     }
1379 
1380     return false;
1381   }
1382   default:
1383     return false;
1384   }
1385 }
1386 
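// Pick the move opcode matching the destination register class: scalar moves
// for SGPRs, VALU moves for VGPRs, and COPY for AGPRs and any size that is
// not handled explicitly.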
1387 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1388 
1389   if (RI.isAGPRClass(DstRC))
1390     return AMDGPU::COPY;
1391   if (RI.getRegSizeInBits(*DstRC) == 16) {
1392     // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1393     // before RA.
1394     return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1395   }
1396   if (RI.getRegSizeInBits(*DstRC) == 32)
1397     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1398   if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1399     return AMDGPU::S_MOV_B64;
1400   if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1401     return AMDGPU::V_MOV_B64_PSEUDO;
1402   return AMDGPU::COPY;
1403 }
1404 
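// Map a vector size in bits to the V_INDIRECT_REG_READ/WRITE_GPR_IDX pseudo
// that is wide enough to cover it.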
1405 const MCInstrDesc &
1406 SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1407                                      bool IsIndirectSrc) const {
1408   if (IsIndirectSrc) {
1409     if (VecSize <= 32) // 4 bytes
1410       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1411     if (VecSize <= 64) // 8 bytes
1412       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1413     if (VecSize <= 96) // 12 bytes
1414       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1415     if (VecSize <= 128) // 16 bytes
1416       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1417     if (VecSize <= 160) // 20 bytes
1418       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1419     if (VecSize <= 256) // 32 bytes
1420       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1421     if (VecSize <= 288) // 36 bytes
1422       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1423     if (VecSize <= 320) // 40 bytes
1424       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1425     if (VecSize <= 352) // 44 bytes
1426       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1427     if (VecSize <= 384) // 48 bytes
1428       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1429     if (VecSize <= 512) // 64 bytes
1430       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1431     if (VecSize <= 1024) // 128 bytes
1432       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1433 
1434     llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1435   }
1436 
1437   if (VecSize <= 32) // 4 bytes
1438     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1439   if (VecSize <= 64) // 8 bytes
1440     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1441   if (VecSize <= 96) // 12 bytes
1442     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1443   if (VecSize <= 128) // 16 bytes
1444     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1445   if (VecSize <= 160) // 20 bytes
1446     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1447   if (VecSize <= 256) // 32 bytes
1448     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1449   if (VecSize <= 288) // 36 bytes
1450     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1451   if (VecSize <= 320) // 40 bytes
1452     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1453   if (VecSize <= 352) // 44 bytes
1454     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1455   if (VecSize <= 384) // 48 bytes
1456     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1457   if (VecSize <= 512) // 64 bytes
1458     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1459   if (VecSize <= 1024) // 128 bytes
1460     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1461 
1462   llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1463 }
1464 
1465 static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1466   if (VecSize <= 32) // 4 bytes
1467     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1468   if (VecSize <= 64) // 8 bytes
1469     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1470   if (VecSize <= 96) // 12 bytes
1471     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1472   if (VecSize <= 128) // 16 bytes
1473     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1474   if (VecSize <= 160) // 20 bytes
1475     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1476   if (VecSize <= 256) // 32 bytes
1477     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1478   if (VecSize <= 288) // 36 bytes
1479     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1480   if (VecSize <= 320) // 40 bytes
1481     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1482   if (VecSize <= 352) // 44 bytes
1483     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1484   if (VecSize <= 384) // 48 bytes
1485     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1486   if (VecSize <= 512) // 64 bytes
1487     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1488   if (VecSize <= 1024) // 128 bytes
1489     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1490 
1491   llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1492 }
1493 
1494 static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1495   if (VecSize <= 32) // 4 bytes
1496     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1497   if (VecSize <= 64) // 8 bytes
1498     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1499   if (VecSize <= 96) // 12 bytes
1500     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1501   if (VecSize <= 128) // 16 bytes
1502     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1503   if (VecSize <= 160) // 20 bytes
1504     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1505   if (VecSize <= 256) // 32 bytes
1506     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1507   if (VecSize <= 288) // 36 bytes
1508     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1509   if (VecSize <= 320) // 40 bytes
1510     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1511   if (VecSize <= 352) // 44 bytes
1512     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1513   if (VecSize <= 384) // 48 bytes
1514     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1515   if (VecSize <= 512) // 64 bytes
1516     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1517   if (VecSize <= 1024) // 128 bytes
1518     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1519 
1520   llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1521 }
1522 
1523 static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1524   if (VecSize <= 64) // 8 bytes
1525     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1526   if (VecSize <= 128) // 16 bytes
1527     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1528   if (VecSize <= 256) // 32 bytes
1529     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1530   if (VecSize <= 512) // 64 bytes
1531     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1532   if (VecSize <= 1024) // 128 bytes
1533     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1534 
1535   llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1536 }
1537 
1538 const MCInstrDesc &
1539 SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1540                                              bool IsSGPR) const {
1541   if (IsSGPR) {
1542     switch (EltSize) {
1543     case 32:
1544       return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1545     case 64:
1546       return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1547     default:
1548       llvm_unreachable("invalid reg indexing elt size");
1549     }
1550   }
1551 
1552   assert(EltSize == 32 && "invalid reg indexing elt size");
1553   return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1554 }
1555 
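// The helpers below map a spill size in bytes to the matching
// SI_SPILL_*_SAVE pseudo for SGPRs, VGPRs, AGPRs, the AV superclass, and
// WWM registers.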
1556 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1557   switch (Size) {
1558   case 4:
1559     return AMDGPU::SI_SPILL_S32_SAVE;
1560   case 8:
1561     return AMDGPU::SI_SPILL_S64_SAVE;
1562   case 12:
1563     return AMDGPU::SI_SPILL_S96_SAVE;
1564   case 16:
1565     return AMDGPU::SI_SPILL_S128_SAVE;
1566   case 20:
1567     return AMDGPU::SI_SPILL_S160_SAVE;
1568   case 24:
1569     return AMDGPU::SI_SPILL_S192_SAVE;
1570   case 28:
1571     return AMDGPU::SI_SPILL_S224_SAVE;
1572   case 32:
1573     return AMDGPU::SI_SPILL_S256_SAVE;
1574   case 36:
1575     return AMDGPU::SI_SPILL_S288_SAVE;
1576   case 40:
1577     return AMDGPU::SI_SPILL_S320_SAVE;
1578   case 44:
1579     return AMDGPU::SI_SPILL_S352_SAVE;
1580   case 48:
1581     return AMDGPU::SI_SPILL_S384_SAVE;
1582   case 64:
1583     return AMDGPU::SI_SPILL_S512_SAVE;
1584   case 128:
1585     return AMDGPU::SI_SPILL_S1024_SAVE;
1586   default:
1587     llvm_unreachable("unknown register size");
1588   }
1589 }
1590 
1591 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1592   switch (Size) {
1593   case 2:
1594     return AMDGPU::SI_SPILL_V16_SAVE;
1595   case 4:
1596     return AMDGPU::SI_SPILL_V32_SAVE;
1597   case 8:
1598     return AMDGPU::SI_SPILL_V64_SAVE;
1599   case 12:
1600     return AMDGPU::SI_SPILL_V96_SAVE;
1601   case 16:
1602     return AMDGPU::SI_SPILL_V128_SAVE;
1603   case 20:
1604     return AMDGPU::SI_SPILL_V160_SAVE;
1605   case 24:
1606     return AMDGPU::SI_SPILL_V192_SAVE;
1607   case 28:
1608     return AMDGPU::SI_SPILL_V224_SAVE;
1609   case 32:
1610     return AMDGPU::SI_SPILL_V256_SAVE;
1611   case 36:
1612     return AMDGPU::SI_SPILL_V288_SAVE;
1613   case 40:
1614     return AMDGPU::SI_SPILL_V320_SAVE;
1615   case 44:
1616     return AMDGPU::SI_SPILL_V352_SAVE;
1617   case 48:
1618     return AMDGPU::SI_SPILL_V384_SAVE;
1619   case 64:
1620     return AMDGPU::SI_SPILL_V512_SAVE;
1621   case 128:
1622     return AMDGPU::SI_SPILL_V1024_SAVE;
1623   default:
1624     llvm_unreachable("unknown register size");
1625   }
1626 }
1627 
1628 static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1629   switch (Size) {
1630   case 4:
1631     return AMDGPU::SI_SPILL_A32_SAVE;
1632   case 8:
1633     return AMDGPU::SI_SPILL_A64_SAVE;
1634   case 12:
1635     return AMDGPU::SI_SPILL_A96_SAVE;
1636   case 16:
1637     return AMDGPU::SI_SPILL_A128_SAVE;
1638   case 20:
1639     return AMDGPU::SI_SPILL_A160_SAVE;
1640   case 24:
1641     return AMDGPU::SI_SPILL_A192_SAVE;
1642   case 28:
1643     return AMDGPU::SI_SPILL_A224_SAVE;
1644   case 32:
1645     return AMDGPU::SI_SPILL_A256_SAVE;
1646   case 36:
1647     return AMDGPU::SI_SPILL_A288_SAVE;
1648   case 40:
1649     return AMDGPU::SI_SPILL_A320_SAVE;
1650   case 44:
1651     return AMDGPU::SI_SPILL_A352_SAVE;
1652   case 48:
1653     return AMDGPU::SI_SPILL_A384_SAVE;
1654   case 64:
1655     return AMDGPU::SI_SPILL_A512_SAVE;
1656   case 128:
1657     return AMDGPU::SI_SPILL_A1024_SAVE;
1658   default:
1659     llvm_unreachable("unknown register size");
1660   }
1661 }
1662 
1663 static unsigned getAVSpillSaveOpcode(unsigned Size) {
1664   switch (Size) {
1665   case 4:
1666     return AMDGPU::SI_SPILL_AV32_SAVE;
1667   case 8:
1668     return AMDGPU::SI_SPILL_AV64_SAVE;
1669   case 12:
1670     return AMDGPU::SI_SPILL_AV96_SAVE;
1671   case 16:
1672     return AMDGPU::SI_SPILL_AV128_SAVE;
1673   case 20:
1674     return AMDGPU::SI_SPILL_AV160_SAVE;
1675   case 24:
1676     return AMDGPU::SI_SPILL_AV192_SAVE;
1677   case 28:
1678     return AMDGPU::SI_SPILL_AV224_SAVE;
1679   case 32:
1680     return AMDGPU::SI_SPILL_AV256_SAVE;
1681   case 36:
1682     return AMDGPU::SI_SPILL_AV288_SAVE;
1683   case 40:
1684     return AMDGPU::SI_SPILL_AV320_SAVE;
1685   case 44:
1686     return AMDGPU::SI_SPILL_AV352_SAVE;
1687   case 48:
1688     return AMDGPU::SI_SPILL_AV384_SAVE;
1689   case 64:
1690     return AMDGPU::SI_SPILL_AV512_SAVE;
1691   case 128:
1692     return AMDGPU::SI_SPILL_AV1024_SAVE;
1693   default:
1694     llvm_unreachable("unknown register size");
1695   }
1696 }
1697 
1698 static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1699                                          bool IsVectorSuperClass) {
1700   // Currently, only 32-bit WWM register spills are needed.
1701   if (Size != 4)
1702     llvm_unreachable("unknown wwm register spill size");
1703 
1704   if (IsVectorSuperClass)
1705     return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1706 
1707   return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1708 }
1709 
1710 static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1711                                             const TargetRegisterClass *RC,
1712                                             unsigned Size,
1713                                             const SIRegisterInfo &TRI,
1714                                             const SIMachineFunctionInfo &MFI) {
1715   bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1716 
1717   // Choose the right opcode if spilling a WWM register.
1718   if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1719     return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1720 
1721   if (IsVectorSuperClass)
1722     return getAVSpillSaveOpcode(Size);
1723 
1724   return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1725                              : getVGPRSpillSaveOpcode(Size);
1726 }
1727 
1728 void SIInstrInfo::storeRegToStackSlot(
1729     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1730     bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1731     const TargetRegisterInfo *TRI, Register VReg,
1732     MachineInstr::MIFlag Flags) const {
1733   MachineFunction *MF = MBB.getParent();
1734   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1735   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1736   const DebugLoc &DL = MBB.findDebugLoc(MI);
1737 
1738   MachinePointerInfo PtrInfo
1739     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1740   MachineMemOperand *MMO = MF->getMachineMemOperand(
1741       PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1742       FrameInfo.getObjectAlign(FrameIndex));
1743   unsigned SpillSize = TRI->getSpillSize(*RC);
1744 
1745   MachineRegisterInfo &MRI = MF->getRegInfo();
1746   if (RI.isSGPRClass(RC)) {
1747     MFI->setHasSpilledSGPRs();
1748     assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1749     assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1750            SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1751 
1752     // We are only allowed to create one new instruction when spilling
1753     // registers, so we need to use a pseudo instruction for spilling SGPRs.
1754     const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1755 
1756     // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1757     // to make sure we are using the correct register class.
1758     if (SrcReg.isVirtual() && SpillSize == 4) {
1759       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1760     }
1761 
1762     BuildMI(MBB, MI, DL, OpDesc)
1763       .addReg(SrcReg, getKillRegState(isKill)) // data
1764       .addFrameIndex(FrameIndex)               // addr
1765       .addMemOperand(MMO)
1766       .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1767 
1768     if (RI.spillSGPRToVGPR())
1769       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1770     return;
1771   }
1772 
1773   unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1774                                                 SpillSize, RI, *MFI);
1775   MFI->setHasSpilledVGPRs();
1776 
1777   BuildMI(MBB, MI, DL, get(Opcode))
1778     .addReg(SrcReg, getKillRegState(isKill)) // data
1779     .addFrameIndex(FrameIndex)               // addr
1780     .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
1781     .addImm(0)                               // offset
1782     .addMemOperand(MMO);
1783 }
1784 
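// Restore-side counterparts of the save tables above: map a spill size in
// bytes to the matching SI_SPILL_*_RESTORE pseudo.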
1785 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1786   switch (Size) {
1787   case 4:
1788     return AMDGPU::SI_SPILL_S32_RESTORE;
1789   case 8:
1790     return AMDGPU::SI_SPILL_S64_RESTORE;
1791   case 12:
1792     return AMDGPU::SI_SPILL_S96_RESTORE;
1793   case 16:
1794     return AMDGPU::SI_SPILL_S128_RESTORE;
1795   case 20:
1796     return AMDGPU::SI_SPILL_S160_RESTORE;
1797   case 24:
1798     return AMDGPU::SI_SPILL_S192_RESTORE;
1799   case 28:
1800     return AMDGPU::SI_SPILL_S224_RESTORE;
1801   case 32:
1802     return AMDGPU::SI_SPILL_S256_RESTORE;
1803   case 36:
1804     return AMDGPU::SI_SPILL_S288_RESTORE;
1805   case 40:
1806     return AMDGPU::SI_SPILL_S320_RESTORE;
1807   case 44:
1808     return AMDGPU::SI_SPILL_S352_RESTORE;
1809   case 48:
1810     return AMDGPU::SI_SPILL_S384_RESTORE;
1811   case 64:
1812     return AMDGPU::SI_SPILL_S512_RESTORE;
1813   case 128:
1814     return AMDGPU::SI_SPILL_S1024_RESTORE;
1815   default:
1816     llvm_unreachable("unknown register size");
1817   }
1818 }
1819 
1820 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1821   switch (Size) {
1822   case 2:
1823     return AMDGPU::SI_SPILL_V16_RESTORE;
1824   case 4:
1825     return AMDGPU::SI_SPILL_V32_RESTORE;
1826   case 8:
1827     return AMDGPU::SI_SPILL_V64_RESTORE;
1828   case 12:
1829     return AMDGPU::SI_SPILL_V96_RESTORE;
1830   case 16:
1831     return AMDGPU::SI_SPILL_V128_RESTORE;
1832   case 20:
1833     return AMDGPU::SI_SPILL_V160_RESTORE;
1834   case 24:
1835     return AMDGPU::SI_SPILL_V192_RESTORE;
1836   case 28:
1837     return AMDGPU::SI_SPILL_V224_RESTORE;
1838   case 32:
1839     return AMDGPU::SI_SPILL_V256_RESTORE;
1840   case 36:
1841     return AMDGPU::SI_SPILL_V288_RESTORE;
1842   case 40:
1843     return AMDGPU::SI_SPILL_V320_RESTORE;
1844   case 44:
1845     return AMDGPU::SI_SPILL_V352_RESTORE;
1846   case 48:
1847     return AMDGPU::SI_SPILL_V384_RESTORE;
1848   case 64:
1849     return AMDGPU::SI_SPILL_V512_RESTORE;
1850   case 128:
1851     return AMDGPU::SI_SPILL_V1024_RESTORE;
1852   default:
1853     llvm_unreachable("unknown register size");
1854   }
1855 }
1856 
1857 static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1858   switch (Size) {
1859   case 4:
1860     return AMDGPU::SI_SPILL_A32_RESTORE;
1861   case 8:
1862     return AMDGPU::SI_SPILL_A64_RESTORE;
1863   case 12:
1864     return AMDGPU::SI_SPILL_A96_RESTORE;
1865   case 16:
1866     return AMDGPU::SI_SPILL_A128_RESTORE;
1867   case 20:
1868     return AMDGPU::SI_SPILL_A160_RESTORE;
1869   case 24:
1870     return AMDGPU::SI_SPILL_A192_RESTORE;
1871   case 28:
1872     return AMDGPU::SI_SPILL_A224_RESTORE;
1873   case 32:
1874     return AMDGPU::SI_SPILL_A256_RESTORE;
1875   case 36:
1876     return AMDGPU::SI_SPILL_A288_RESTORE;
1877   case 40:
1878     return AMDGPU::SI_SPILL_A320_RESTORE;
1879   case 44:
1880     return AMDGPU::SI_SPILL_A352_RESTORE;
1881   case 48:
1882     return AMDGPU::SI_SPILL_A384_RESTORE;
1883   case 64:
1884     return AMDGPU::SI_SPILL_A512_RESTORE;
1885   case 128:
1886     return AMDGPU::SI_SPILL_A1024_RESTORE;
1887   default:
1888     llvm_unreachable("unknown register size");
1889   }
1890 }
1891 
1892 static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1893   switch (Size) {
1894   case 4:
1895     return AMDGPU::SI_SPILL_AV32_RESTORE;
1896   case 8:
1897     return AMDGPU::SI_SPILL_AV64_RESTORE;
1898   case 12:
1899     return AMDGPU::SI_SPILL_AV96_RESTORE;
1900   case 16:
1901     return AMDGPU::SI_SPILL_AV128_RESTORE;
1902   case 20:
1903     return AMDGPU::SI_SPILL_AV160_RESTORE;
1904   case 24:
1905     return AMDGPU::SI_SPILL_AV192_RESTORE;
1906   case 28:
1907     return AMDGPU::SI_SPILL_AV224_RESTORE;
1908   case 32:
1909     return AMDGPU::SI_SPILL_AV256_RESTORE;
1910   case 36:
1911     return AMDGPU::SI_SPILL_AV288_RESTORE;
1912   case 40:
1913     return AMDGPU::SI_SPILL_AV320_RESTORE;
1914   case 44:
1915     return AMDGPU::SI_SPILL_AV352_RESTORE;
1916   case 48:
1917     return AMDGPU::SI_SPILL_AV384_RESTORE;
1918   case 64:
1919     return AMDGPU::SI_SPILL_AV512_RESTORE;
1920   case 128:
1921     return AMDGPU::SI_SPILL_AV1024_RESTORE;
1922   default:
1923     llvm_unreachable("unknown register size");
1924   }
1925 }
1926 
1927 static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1928                                             bool IsVectorSuperClass) {
1929   // Currently, only 32-bit WWM register spills are needed.
1930   if (Size != 4)
1931     llvm_unreachable("unknown wwm register spill size");
1932 
1933   if (IsVectorSuperClass)
1934     return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1935 
1936   return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1937 }
1938 
1939 static unsigned
1940 getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1941                                unsigned Size, const SIRegisterInfo &TRI,
1942                                const SIMachineFunctionInfo &MFI) {
1943   bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1944 
1945   // Choose the right opcode if restoring a WWM register.
1946   if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1947     return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1948 
1949   if (IsVectorSuperClass)
1950     return getAVSpillRestoreOpcode(Size);
1951 
1952   return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1953                              : getVGPRSpillRestoreOpcode(Size);
1954 }
1955 
1956 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1957                                        MachineBasicBlock::iterator MI,
1958                                        Register DestReg, int FrameIndex,
1959                                        const TargetRegisterClass *RC,
1960                                        const TargetRegisterInfo *TRI,
1961                                        Register VReg,
1962                                        MachineInstr::MIFlag Flags) const {
1963   MachineFunction *MF = MBB.getParent();
1964   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1965   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1966   const DebugLoc &DL = MBB.findDebugLoc(MI);
1967   unsigned SpillSize = TRI->getSpillSize(*RC);
1968 
1969   MachinePointerInfo PtrInfo
1970     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1971 
1972   MachineMemOperand *MMO = MF->getMachineMemOperand(
1973       PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1974       FrameInfo.getObjectAlign(FrameIndex));
1975 
1976   if (RI.isSGPRClass(RC)) {
1977     MFI->setHasSpilledSGPRs();
1978     assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1979     assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1980            DestReg != AMDGPU::EXEC && "exec should not be spilled");
1981 
1982     // FIXME: Maybe this should not include a memoperand because it will be
1983     // lowered to non-memory instructions.
1984     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1985     if (DestReg.isVirtual() && SpillSize == 4) {
1986       MachineRegisterInfo &MRI = MF->getRegInfo();
1987       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1988     }
1989 
1990     if (RI.spillSGPRToVGPR())
1991       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1992     BuildMI(MBB, MI, DL, OpDesc, DestReg)
1993       .addFrameIndex(FrameIndex) // addr
1994       .addMemOperand(MMO)
1995       .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1996 
1997     return;
1998   }
1999 
2000   unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
2001                                                    SpillSize, RI, *MFI);
2002   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
2003       .addFrameIndex(FrameIndex)           // vaddr
2004       .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
2005       .addImm(0)                           // offset
2006       .addMemOperand(MMO);
2007 }
2008 
2009 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
2010                              MachineBasicBlock::iterator MI) const {
2011   insertNoops(MBB, MI, 1);
2012 }
2013 
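// Emit S_NOP instructions covering Quantity wait states. A single S_NOP can
// cover at most eight wait states (its immediate encodes count - 1), so
// larger requests are split into multiple nops.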
2014 void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2015                               MachineBasicBlock::iterator MI,
2016                               unsigned Quantity) const {
2017   DebugLoc DL = MBB.findDebugLoc(MI);
2018   while (Quantity > 0) {
2019     unsigned Arg = std::min(Quantity, 8u);
2020     Quantity -= Arg;
2021     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2022   }
2023 }
2024 
2025 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2026   auto *MF = MBB.getParent();
2027   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2028 
2029   assert(Info->isEntryFunction());
2030 
2031   if (MBB.succ_empty()) {
2032     bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2033     if (HasNoTerminator) {
2034       if (Info->returnsVoid()) {
2035         BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2036       } else {
2037         BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2038       }
2039     }
2040   }
2041 }
2042 
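// Expand a simulated trap: branch to a trap block that issues s_trap,
// notifies the host by sending an interrupt message tagged with the queue
// doorbell ID and the wave-abort bit, then parks the wave in a
// self-branching halt loop. Returns the block where execution continues.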
2043 MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2044                                                     MachineBasicBlock &MBB,
2045                                                     MachineInstr &MI,
2046                                                     const DebugLoc &DL) const {
2047   MachineFunction *MF = MBB.getParent();
2048   constexpr unsigned DoorbellIDMask = 0x3ff;
2049   constexpr unsigned ECQueueWaveAbort = 0x400;
2050 
2051   MachineBasicBlock *TrapBB = &MBB;
2052   MachineBasicBlock *ContBB = &MBB;
2053   MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2054 
2055   if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2056     ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2057     TrapBB = MF->CreateMachineBasicBlock();
2058     BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2059     MF->push_back(TrapBB);
2060     MBB.addSuccessor(TrapBB);
2061   }
2062 
2063   // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
2064   // this will be a nop.
2065   BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2066       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2067   Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2068   BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2069           DoorbellReg)
2070       .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2071   BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2072       .addUse(AMDGPU::M0);
2073   Register DoorbellRegMasked =
2074       MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2075   BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2076       .addUse(DoorbellReg)
2077       .addImm(DoorbellIDMask);
2078   Register SetWaveAbortBit =
2079       MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2080   BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2081       .addUse(DoorbellRegMasked)
2082       .addImm(ECQueueWaveAbort);
2083   BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2084       .addUse(SetWaveAbortBit);
2085   BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2086       .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2087   BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2088       .addUse(AMDGPU::TTMP2);
2089   BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2090   TrapBB->addSuccessor(HaltLoopBB);
2091 
2092   BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2093   BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2094       .addMBB(HaltLoopBB);
2095   MF->push_back(HaltLoopBB);
2096   HaltLoopBB->addSuccessor(HaltLoopBB);
2097 
2098   return ContBB;
2099 }
2100 
2101 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2102   switch (MI.getOpcode()) {
2103   default:
2104     if (MI.isMetaInstruction())
2105       return 0;
2106     return 1; // FIXME: Do wait states equal cycles?
2107 
2108   case AMDGPU::S_NOP:
2109     return MI.getOperand(0).getImm() + 1;
2110   // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2111   // hazard, even if one exists, won't really be visible. Should we handle it?
2112   }
2113 }
2114 
2115 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2116   MachineBasicBlock &MBB = *MI.getParent();
2117   DebugLoc DL = MBB.findDebugLoc(MI);
2118   switch (MI.getOpcode()) {
2119   default: return TargetInstrInfo::expandPostRAPseudo(MI);
2120   case AMDGPU::S_MOV_B64_term:
2121     // This is only a terminator to get the correct spill code placement during
2122     // register allocation.
2123     MI.setDesc(get(AMDGPU::S_MOV_B64));
2124     break;
2125 
2126   case AMDGPU::S_MOV_B32_term:
2127     // This is only a terminator to get the correct spill code placement during
2128     // register allocation.
2129     MI.setDesc(get(AMDGPU::S_MOV_B32));
2130     break;
2131 
2132   case AMDGPU::S_XOR_B64_term:
2133     // This is only a terminator to get the correct spill code placement during
2134     // register allocation.
2135     MI.setDesc(get(AMDGPU::S_XOR_B64));
2136     break;
2137 
2138   case AMDGPU::S_XOR_B32_term:
2139     // This is only a terminator to get the correct spill code placement during
2140     // register allocation.
2141     MI.setDesc(get(AMDGPU::S_XOR_B32));
2142     break;
2143   case AMDGPU::S_OR_B64_term:
2144     // This is only a terminator to get the correct spill code placement during
2145     // register allocation.
2146     MI.setDesc(get(AMDGPU::S_OR_B64));
2147     break;
2148   case AMDGPU::S_OR_B32_term:
2149     // This is only a terminator to get the correct spill code placement during
2150     // register allocation.
2151     MI.setDesc(get(AMDGPU::S_OR_B32));
2152     break;
2153 
2154   case AMDGPU::S_ANDN2_B64_term:
2155     // This is only a terminator to get the correct spill code placement during
2156     // register allocation.
2157     MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2158     break;
2159 
2160   case AMDGPU::S_ANDN2_B32_term:
2161     // This is only a terminator to get the correct spill code placement during
2162     // register allocation.
2163     MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2164     break;
2165 
2166   case AMDGPU::S_AND_B64_term:
2167     // This is only a terminator to get the correct spill code placement during
2168     // register allocation.
2169     MI.setDesc(get(AMDGPU::S_AND_B64));
2170     break;
2171 
2172   case AMDGPU::S_AND_B32_term:
2173     // This is only a terminator to get the correct spill code placement during
2174     // register allocation.
2175     MI.setDesc(get(AMDGPU::S_AND_B32));
2176     break;
2177 
2178   case AMDGPU::S_AND_SAVEEXEC_B64_term:
2179     // This is only a terminator to get the correct spill code placement during
2180     // register allocation.
2181     MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2182     break;
2183 
2184   case AMDGPU::S_AND_SAVEEXEC_B32_term:
2185     // This is only a terminator to get the correct spill code placement during
2186     // register allocation.
2187     MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2188     break;
2189 
2190   case AMDGPU::SI_SPILL_S32_TO_VGPR:
2191     MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2192     break;
2193 
2194   case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2195     MI.setDesc(get(AMDGPU::V_READLANE_B32));
2196     MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(),
2197                                                &AMDGPU::SReg_32_XM0RegClass);
2198     break;
2199   case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2200     Register Dst = MI.getOperand(0).getReg();
2201     bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2202     MI.setDesc(
2203         get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2204     break;
2205   }
2206   case AMDGPU::V_MOV_B64_PSEUDO: {
2207     Register Dst = MI.getOperand(0).getReg();
2208     Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2209     Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2210 
2211     const MachineOperand &SrcOp = MI.getOperand(1);
2212     // FIXME: Will this work for 64-bit floating point immediates?
2213     assert(!SrcOp.isFPImm());
2214     if (ST.hasMovB64()) {
2215       MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2216       if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2217           isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2218         break;
2219     }
2220     if (SrcOp.isImm()) {
2221       APInt Imm(64, SrcOp.getImm());
2222       APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2223       APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2224       if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2225         BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2226           .addImm(SISrcMods::OP_SEL_1)
2227           .addImm(Lo.getSExtValue())
2228           .addImm(SISrcMods::OP_SEL_1)
2229           .addImm(Lo.getSExtValue())
2230           .addImm(0)  // op_sel_lo
2231           .addImm(0)  // op_sel_hi
2232           .addImm(0)  // neg_lo
2233           .addImm(0)  // neg_hi
2234           .addImm(0); // clamp
2235       } else {
2236         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2237           .addImm(Lo.getSExtValue())
2238           .addReg(Dst, RegState::Implicit | RegState::Define);
2239         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2240           .addImm(Hi.getSExtValue())
2241           .addReg(Dst, RegState::Implicit | RegState::Define);
2242       }
2243     } else {
2244       assert(SrcOp.isReg());
2245       if (ST.hasPkMovB32() &&
2246           !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2247         BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2248           .addImm(SISrcMods::OP_SEL_1) // src0_mod
2249           .addReg(SrcOp.getReg())
2250           .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
2251           .addReg(SrcOp.getReg())
2252           .addImm(0)  // op_sel_lo
2253           .addImm(0)  // op_sel_hi
2254           .addImm(0)  // neg_lo
2255           .addImm(0)  // neg_hi
2256           .addImm(0); // clamp
2257       } else {
2258         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2259           .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2260           .addReg(Dst, RegState::Implicit | RegState::Define);
2261         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2262           .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2263           .addReg(Dst, RegState::Implicit | RegState::Define);
2264       }
2265     }
2266     MI.eraseFromParent();
2267     break;
2268   }
2269   case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2270     expandMovDPP64(MI);
2271     break;
2272   }
2273   case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2274     const MachineOperand &SrcOp = MI.getOperand(1);
2275     assert(!SrcOp.isFPImm());
2276 
2277     if (ST.has64BitLiterals()) {
2278       MI.setDesc(get(AMDGPU::S_MOV_B64));
2279       break;
2280     }
2281 
2282     APInt Imm(64, SrcOp.getImm());
2283     if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2284       MI.setDesc(get(AMDGPU::S_MOV_B64));
2285       break;
2286     }
2287 
2288     Register Dst = MI.getOperand(0).getReg();
2289     Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2290     Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2291 
2292     APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2293     APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2294     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2295       .addImm(Lo.getSExtValue())
2296       .addReg(Dst, RegState::Implicit | RegState::Define);
2297     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2298       .addImm(Hi.getSExtValue())
2299       .addReg(Dst, RegState::Implicit | RegState::Define);
2300     MI.eraseFromParent();
2301     break;
2302   }
2303   case AMDGPU::V_SET_INACTIVE_B32: {
2304     // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2305     Register DstReg = MI.getOperand(0).getReg();
2306     BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2307         .add(MI.getOperand(3))
2308         .add(MI.getOperand(4))
2309         .add(MI.getOperand(1))
2310         .add(MI.getOperand(2))
2311         .add(MI.getOperand(5));
2312     MI.eraseFromParent();
2313     break;
2314   }
2315   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2316   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2317   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2318   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2319   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2320   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2321   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2322   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2323   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2324   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2325   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2326   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2327   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2328   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2329   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2330   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2331   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2332   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2333   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2334   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2335   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2336   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2337   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2338   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2339   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2340   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2341   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2342   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2343   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2344     const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2345 
2346     unsigned Opc;
2347     if (RI.hasVGPRs(EltRC)) {
2348       Opc = AMDGPU::V_MOVRELD_B32_e32;
2349     } else {
2350       Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2351                                               : AMDGPU::S_MOVRELD_B32;
2352     }
2353 
2354     const MCInstrDesc &OpDesc = get(Opc);
2355     Register VecReg = MI.getOperand(0).getReg();
2356     bool IsUndef = MI.getOperand(1).isUndef();
2357     unsigned SubReg = MI.getOperand(3).getImm();
2358     assert(VecReg == MI.getOperand(1).getReg());
2359 
2360     MachineInstrBuilder MIB =
2361       BuildMI(MBB, MI, DL, OpDesc)
2362         .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2363         .add(MI.getOperand(2))
2364         .addReg(VecReg, RegState::ImplicitDefine)
2365         .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2366 
2367     const int ImpDefIdx =
2368         OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2369     const int ImpUseIdx = ImpDefIdx + 1;
2370     MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2371     MI.eraseFromParent();
2372     break;
2373   }
2374   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2375   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2376   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2377   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2378   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2379   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2380   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2381   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2382   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2383   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2384   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2385   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2386     assert(ST.useVGPRIndexMode());
2387     Register VecReg = MI.getOperand(0).getReg();
2388     bool IsUndef = MI.getOperand(1).isUndef();
2389     MachineOperand &Idx = MI.getOperand(3);
2390     Register SubReg = MI.getOperand(4).getImm();
2391 
2392     MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2393                               .add(Idx)
2394                               .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2395     SetOn->getOperand(3).setIsUndef();
2396 
2397     const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2398     MachineInstrBuilder MIB =
2399         BuildMI(MBB, MI, DL, OpDesc)
2400             .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2401             .add(MI.getOperand(2))
2402             .addReg(VecReg, RegState::ImplicitDefine)
2403             .addReg(VecReg,
2404                     RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2405 
2406     const int ImpDefIdx =
2407         OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2408     const int ImpUseIdx = ImpDefIdx + 1;
2409     MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2410 
2411     MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2412 
2413     finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2414 
2415     MI.eraseFromParent();
2416     break;
2417   }
2418   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2419   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2420   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2421   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2422   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2423   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2424   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2425   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2426   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2427   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2428   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2429   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2430     assert(ST.useVGPRIndexMode());
2431     Register Dst = MI.getOperand(0).getReg();
2432     Register VecReg = MI.getOperand(1).getReg();
2433     bool IsUndef = MI.getOperand(1).isUndef();
2434     Register Idx = MI.getOperand(2).getReg();
2435     Register SubReg = MI.getOperand(3).getImm();
2436 
2437     MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2438                               .addReg(Idx)
2439                               .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2440     SetOn->getOperand(3).setIsUndef();
2441 
2442     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2443         .addDef(Dst)
2444         .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2445         .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2446 
2447     MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2448 
2449     finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2450 
2451     MI.eraseFromParent();
2452     break;
2453   }
2454   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2455     MachineFunction &MF = *MBB.getParent();
2456     Register Reg = MI.getOperand(0).getReg();
2457     Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2458     Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2459     MachineOperand OpLo = MI.getOperand(1);
2460     MachineOperand OpHi = MI.getOperand(2);
2461 
2462     // Create a bundle so these instructions won't be re-ordered by the
2463     // post-RA scheduler.
2464     MIBundleBuilder Bundler(MBB, MI);
2465     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2466 
2467     // What we want here is an offset from the value returned by s_getpc (which
2468     // is the address of the s_add_u32 instruction) to the global variable, but
2469     // since the encoding of $symbol starts 4 bytes after the start of the
2470     // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2471     // small. This requires us to add 4 to the global variable offset in order
2472     // to compute the correct address. Similarly for the s_addc_u32 instruction,
2473     // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2474     // instruction.
2475 
2476     int64_t Adjust = 0;
2477     if (ST.hasGetPCZeroExtension()) {
2478       // Fix up hardware that does not sign-extend the 48-bit PC value by
2479       // inserting: s_sext_i32_i16 reghi, reghi
2480       Bundler.append(
2481           BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2482       Adjust += 4;
2483     }
2484 
2485     if (OpLo.isGlobal())
2486       OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2487     Bundler.append(
2488         BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2489 
2490     if (OpHi.isGlobal())
2491       OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2492     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2493                        .addReg(RegHi)
2494                        .add(OpHi));
2495 
2496     finalizeBundle(MBB, Bundler.begin());
2497 
2498     MI.eraseFromParent();
2499     break;
2500   }
2501   case AMDGPU::ENTER_STRICT_WWM: {
2502     // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2503     // Whole Wave Mode is entered.
2504     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2505                                  : AMDGPU::S_OR_SAVEEXEC_B64));
2506     break;
2507   }
2508   case AMDGPU::ENTER_STRICT_WQM: {
2509     // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2510     // STRICT_WQM is entered.
2511     const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2512     const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2513     const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2514     BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2515     BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2516 
2517     MI.eraseFromParent();
2518     break;
2519   }
2520   case AMDGPU::EXIT_STRICT_WWM:
2521   case AMDGPU::EXIT_STRICT_WQM: {
2522     // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2523     // WWM/STRICT_WQM is exited.
2524     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2525     break;
2526   }
2527   case AMDGPU::SI_RETURN: {
2528     const MachineFunction *MF = MBB.getParent();
2529     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2530     const SIRegisterInfo *TRI = ST.getRegisterInfo();
2531     // Hiding the return address use with SI_RETURN may lead to extra kills in
2532     // the function and missing live-ins. We are fine in practice because callee
2533     // saved register handling ensures the register value is restored before
2534     // RET, but we need the undef flag here to appease the MachineVerifier
2535     // liveness checks.
2536     MachineInstrBuilder MIB =
2537         BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2538             .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2539 
2540     MIB.copyImplicitOps(MI);
2541     MI.eraseFromParent();
2542     break;
2543   }
2544 
2545   case AMDGPU::S_MUL_U64_U32_PSEUDO:
2546   case AMDGPU::S_MUL_I64_I32_PSEUDO:
2547     MI.setDesc(get(AMDGPU::S_MUL_U64));
2548     break;
2549 
2550   case AMDGPU::S_GETPC_B64_pseudo:
2551     MI.setDesc(get(AMDGPU::S_GETPC_B64));
2552     if (ST.hasGetPCZeroExtension()) {
2553       Register Dst = MI.getOperand(0).getReg();
2554       Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2555       // Fix up hardware that does not sign-extend the 48-bit PC value by
2556       // inserting: s_sext_i32_i16 dsthi, dsthi
2557       BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2558               DstHi)
2559           .addReg(DstHi);
2560     }
2561     break;
2562   }
2563   return true;
2564 }
2565 
2566 void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2567                                 MachineBasicBlock::iterator I, Register DestReg,
2568                                 unsigned SubIdx, const MachineInstr &Orig,
2569                                 const TargetRegisterInfo &RI) const {
2570 
2571   // Try shrinking the instruction to remat only the part needed for the
2572   // current context.
2573   // TODO: Handle more cases.
2574   unsigned Opcode = Orig.getOpcode();
2575   switch (Opcode) {
2576   case AMDGPU::S_LOAD_DWORDX16_IMM:
2577   case AMDGPU::S_LOAD_DWORDX8_IMM: {
2578     if (SubIdx != 0)
2579       break;
2580 
2581     if (I == MBB.end())
2582       break;
2583 
2584     if (I->isBundled())
2585       break;
2586 
2587     // Look for a single use of the register that is also a subreg.
2588     Register RegToFind = Orig.getOperand(0).getReg();
2589     MachineOperand *UseMO = nullptr;
2590     for (auto &CandMO : I->operands()) {
2591       if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2592         continue;
2593       if (UseMO) {
2594         UseMO = nullptr;
2595         break;
2596       }
2597       UseMO = &CandMO;
2598     }
2599     if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2600       break;
2601 
2602     unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2603     unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2604 
2605     MachineFunction *MF = MBB.getParent();
2606     MachineRegisterInfo &MRI = MF->getRegInfo();
2607     assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2608 
2609     unsigned NewOpcode = -1;
2610     if (SubregSize == 256)
2611       NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2612     else if (SubregSize == 128)
2613       NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2614     else
2615       break;
2616 
2617     const MCInstrDesc &TID = get(NewOpcode);
2618     const TargetRegisterClass *NewRC =
2619         RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2620     MRI.setRegClass(DestReg, NewRC);
2621 
2622     UseMO->setReg(DestReg);
2623     UseMO->setSubReg(AMDGPU::NoSubRegister);
2624 
2625     // Use a smaller load with the desired size, possibly with updated offset.
2626     MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2627     MI->setDesc(TID);
2628     MI->getOperand(0).setReg(DestReg);
2629     MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2630     if (Offset) {
2631       MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2632       int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2633       OffsetMO->setImm(FinalOffset);
2634     }
2635     SmallVector<MachineMemOperand *> NewMMOs;
2636     for (const MachineMemOperand *MemOp : Orig.memoperands())
2637       NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2638                                                  SubregSize / 8));
2639     MI->setMemRefs(*MF, NewMMOs);
2640 
2641     MBB.insert(I, MI);
2642     return;
2643   }
2644 
2645   default:
2646     break;
2647   }
2648 
2649   TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2650 }
2651 
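// Expand V_MOV_B64_DPP_PSEUDO. When the subtarget has a 64-bit DPP mov and
// the DPP control value is legal for 64-bit DPP ALU operands, simply retarget
// the opcode; otherwise split the move into two 32-bit DPP movs, one per
// half, and recombine them with a REG_SEQUENCE for virtual destinations.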
2652 std::pair<MachineInstr*, MachineInstr*>
2653 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2654   assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2655 
2656   if (ST.hasMovB64() &&
2657       AMDGPU::isLegalDPALU_DPPControl(
2658         getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2659     MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2660     return std::pair(&MI, nullptr);
2661   }
2662 
2663   MachineBasicBlock &MBB = *MI.getParent();
2664   DebugLoc DL = MBB.findDebugLoc(MI);
2665   MachineFunction *MF = MBB.getParent();
2666   MachineRegisterInfo &MRI = MF->getRegInfo();
2667   Register Dst = MI.getOperand(0).getReg();
2668   unsigned Part = 0;
2669   MachineInstr *Split[2];
2670 
2671   for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2672     auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2673     if (Dst.isPhysical()) {
2674       MovDPP.addDef(RI.getSubReg(Dst, Sub));
2675     } else {
2676       assert(MRI.isSSA());
2677       auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2678       MovDPP.addDef(Tmp);
2679     }
2680 
2681     for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2682       const MachineOperand &SrcOp = MI.getOperand(I);
2683       assert(!SrcOp.isFPImm());
2684       if (SrcOp.isImm()) {
2685         APInt Imm(64, SrcOp.getImm());
2686         Imm.ashrInPlace(Part * 32);
2687         MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2688       } else {
2689         assert(SrcOp.isReg());
2690         Register Src = SrcOp.getReg();
2691         if (Src.isPhysical())
2692           MovDPP.addReg(RI.getSubReg(Src, Sub));
2693         else
2694           MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2695       }
2696     }
2697 
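         // Copy the remaining explicit operands, i.e. the DPP control operands
         // (dpp_ctrl, row_mask, bank_mask, bound_ctrl).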
2698     for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2699       MovDPP.addImm(MO.getImm());
2700 
2701     Split[Part] = MovDPP;
2702     ++Part;
2703   }
2704 
2705   if (Dst.isVirtual())
2706     BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2707       .addReg(Split[0]->getOperand(0).getReg())
2708       .addImm(AMDGPU::sub0)
2709       .addReg(Split[1]->getOperand(0).getReg())
2710       .addImm(AMDGPU::sub1);
2711 
2712   MI.eraseFromParent();
2713   return std::pair(Split[0], Split[1]);
2714 }
2715 
2716 std::optional<DestSourcePair>
2717 SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2718   if (MI.getOpcode() == AMDGPU::WWM_COPY)
2719     return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2720 
2721   return std::nullopt;
2722 }
2723 
2724 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2725                                       AMDGPU::OpName Src0OpName,
2726                                       MachineOperand &Src1,
2727                                       AMDGPU::OpName Src1OpName) const {
2728   MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2729   if (!Src0Mods)
2730     return false;
2731 
2732   MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2733   assert(Src1Mods &&
2734          "All commutable instructions have both src0 and src1 modifiers");
2735 
2736   int Src0ModsVal = Src0Mods->getImm();
2737   int Src1ModsVal = Src1Mods->getImm();
2738 
2739   Src1Mods->setImm(Src0ModsVal);
2740   Src0Mods->setImm(Src1ModsVal);
2741   return true;
2742 }
2743 
2744 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2745                                              MachineOperand &RegOp,
2746                                              MachineOperand &NonRegOp) {
2747   Register Reg = RegOp.getReg();
2748   unsigned SubReg = RegOp.getSubReg();
2749   bool IsKill = RegOp.isKill();
2750   bool IsDead = RegOp.isDead();
2751   bool IsUndef = RegOp.isUndef();
2752   bool IsDebug = RegOp.isDebug();
2753 
2754   if (NonRegOp.isImm())
2755     RegOp.ChangeToImmediate(NonRegOp.getImm());
2756   else if (NonRegOp.isFI())
2757     RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2758   else if (NonRegOp.isGlobal()) {
2759     RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2760                      NonRegOp.getTargetFlags());
2761   } else
2762     return nullptr;
2763 
2764   // Make sure we don't reinterpret a subreg index in the target flags.
2765   RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2766 
2767   NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2768   NonRegOp.setSubReg(SubReg);
2769 
2770   return &MI;
2771 }
2772 
2773 static MachineInstr *swapImmOperands(MachineInstr &MI,
2774                                      MachineOperand &NonRegOp1,
2775                                      MachineOperand &NonRegOp2) {
2776   unsigned TargetFlags = NonRegOp1.getTargetFlags();
2777   int64_t NonRegVal = NonRegOp1.getImm();
2778 
2779   NonRegOp1.setImm(NonRegOp2.getImm());
2780   NonRegOp2.setImm(NonRegVal);
2781   NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2782   NonRegOp2.setTargetFlags(TargetFlags);
2783   return &MI;
2784 }
2785 
2786 bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2787                                 const MachineOperand *MO0, unsigned OpIdx1,
2788                                 const MachineOperand *MO1) const {
2789   const MCInstrDesc &InstDesc = MI.getDesc();
2790   const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2791   const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2792   const TargetRegisterClass *DefinedRC1 =
2793       OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr;
2794   const TargetRegisterClass *DefinedRC0 =
2795       OpInfo0.RegClass != -1 ? RI.getRegClass(OpInfo0.RegClass) : nullptr;
2796 
2797   unsigned Opc = MI.getOpcode();
2798   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2799 
2800   // Swap doesn't breach constant bus or literal limits
2801   // It may move literal to position other than src0, this is not allowed
2802   // pre-gfx10 However, most test cases need literals in Src0 for VOP
2803   // FIXME: After gfx9, literal can be in place other than Src0
2804   if (isVALU(MI)) {
2805     if ((int)OpIdx0 == Src0Idx && !MO0->isReg() &&
2806         !isInlineConstant(*MO0, OpInfo1))
2807       return false;
2808     if ((int)OpIdx1 == Src0Idx && !MO1->isReg() &&
2809         !isInlineConstant(*MO1, OpInfo0))
2810       return false;
2811   }
2812 
2813   if ((int)OpIdx1 != Src0Idx && MO0->isReg()) {
2814     if (!DefinedRC1)
2815       return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2816     return isLegalRegOperand(MI, OpIdx1, *MO0);
2817   }
2818   if ((int)OpIdx0 != Src0Idx && MO1->isReg()) {
2819     if (!DefinedRC0)
2820       return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2821     return isLegalRegOperand(MI, OpIdx0, *MO1);
2822   }
2823 
2824   // No need to check 64-bit literals, since swapping does not bring any new
2825   // 64-bit literal into the current instruction that would need folding to 32 bits.
2826 
2827   return isImmOperandLegal(MI, OpIdx1, *MO0);
2828 }
2829 
2830 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2831                                                   unsigned Src0Idx,
2832                                                   unsigned Src1Idx) const {
2833   assert(!NewMI && "this should never be used");
2834 
2835   unsigned Opc = MI.getOpcode();
2836   int CommutedOpcode = commuteOpcode(Opc);
2837   if (CommutedOpcode == -1)
2838     return nullptr;
2839 
2840   if (Src0Idx > Src1Idx)
2841     std::swap(Src0Idx, Src1Idx);
2842 
2843   assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2844            static_cast<int>(Src0Idx) &&
2845          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2846            static_cast<int>(Src1Idx) &&
2847          "inconsistency with findCommutedOpIndices");
2848 
2849   MachineOperand &Src0 = MI.getOperand(Src0Idx);
2850   MachineOperand &Src1 = MI.getOperand(Src1Idx);
2851   if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) {
2852     return nullptr;
2853   }
2854   MachineInstr *CommutedMI = nullptr;
2855   if (Src0.isReg() && Src1.isReg()) {
2856     // Be sure to copy the source modifiers to the right place.
2857     CommutedMI =
2858         TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2859   } else if (Src0.isReg() && !Src1.isReg()) {
2860     CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2861   } else if (!Src0.isReg() && Src1.isReg()) {
2862     CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2863   } else if (Src0.isImm() && Src1.isImm()) {
2864     CommutedMI = swapImmOperands(MI, Src0, Src1);
2865   } else {
2866     // FIXME: Found two non registers to commute. This does happen.
2867     return nullptr;
2868   }
2869 
2870   if (CommutedMI) {
2871     swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2872                         Src1, AMDGPU::OpName::src1_modifiers);
2873 
2874     swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2875                         AMDGPU::OpName::src1_sel);
2876 
2877     CommutedMI->setDesc(get(CommutedOpcode));
2878   }
2879 
2880   return CommutedMI;
2881 }
2882 
2883 // This needs to be implemented because the source modifiers may be inserted
2884 // between the true commutable operands, and the base
2885 // TargetInstrInfo::commuteInstruction uses it.
2886 bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2887                                         unsigned &SrcOpIdx0,
2888                                         unsigned &SrcOpIdx1) const {
2889   return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2890 }
2891 
2892 bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2893                                         unsigned &SrcOpIdx0,
2894                                         unsigned &SrcOpIdx1) const {
2895   if (!Desc.isCommutable())
2896     return false;
2897 
2898   unsigned Opc = Desc.getOpcode();
2899   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2900   if (Src0Idx == -1)
2901     return false;
2902 
2903   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2904   if (Src1Idx == -1)
2905     return false;
2906 
2907   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2908 }
2909 
2910 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2911                                         int64_t BrOffset) const {
2912   // BranchRelaxation should never have to check s_setpc_b64 because its dest
2913   // block is unanalyzable.
2914   assert(BranchOp != AMDGPU::S_SETPC_B64);
2915 
2916   // Convert to dwords.
2917   BrOffset /= 4;
2918 
2919   // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2920   // from the next instruction.
2921   BrOffset -= 1;
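       // For example, a forward branch of 12 bytes is encoded as
       // SIMM16 = 12 / 4 - 1 = 2.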
2922 
2923   return isIntN(BranchOffsetBits, BrOffset);
2924 }
2925 
2926 MachineBasicBlock *
2927 SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2928   return MI.getOperand(0).getMBB();
2929 }
2930 
2931 bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
2932   for (const MachineInstr &MI : MBB->terminators()) {
2933     if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2934         MI.getOpcode() == AMDGPU::SI_LOOP)
2935       return true;
2936   }
2937   return false;
2938 }
2939 
2940 void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2941                                        MachineBasicBlock &DestBB,
2942                                        MachineBasicBlock &RestoreBB,
2943                                        const DebugLoc &DL, int64_t BrOffset,
2944                                        RegScavenger *RS) const {
2945   assert(RS && "RegScavenger required for long branching");
2946   assert(MBB.empty() &&
2947          "new block should be inserted for expanding unconditional branch");
2948   assert(MBB.pred_size() == 1);
2949   assert(RestoreBB.empty() &&
2950          "restore block should be inserted for restoring clobbered registers");
2951 
2952   MachineFunction *MF = MBB.getParent();
2953   MachineRegisterInfo &MRI = MF->getRegInfo();
2954   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2955 
2956   // FIXME: Virtual register workaround for RegScavenger not working with empty
2957   // blocks.
2958   Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2959 
2960   auto I = MBB.end();
2961 
2962   // Note: as this runs after the hazard recognizer, we need to apply some
2963   // hazard workarounds directly here.
2964   const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2965                                ST.hasVALUReadSGPRHazard();
2966   auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2967     if (FlushSGPRWrites)
2968       BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2969           .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2970   };
2971 
2972   // We need to compute the offset relative to the instruction immediately after
2973   // s_getpc_b64. Insert the PC arithmetic before the last terminator.
2974   MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2975   ApplyHazardWorkarounds();
2976 
2977   auto &MCCtx = MF->getContext();
2978   MCSymbol *PostGetPCLabel =
2979       MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2980   GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2981 
2982   MCSymbol *OffsetLo =
2983       MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2984   MCSymbol *OffsetHi =
2985       MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2986   BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2987       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2988       .addReg(PCReg, 0, AMDGPU::sub0)
2989       .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2990   BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2991       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2992       .addReg(PCReg, 0, AMDGPU::sub1)
2993       .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2994   ApplyHazardWorkarounds();
2995 
2996   // Insert the indirect branch after the other terminator.
2997   BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2998     .addReg(PCReg);
2999 
3000   // If a spill is needed for the pc register pair, we need to insert a spill
3001   // restore block right before the destination block, and insert a short branch
3002   // into the old destination block's fallthrough predecessor.
3003   // e.g.:
3004   //
3005   // s_cbranch_scc0 skip_long_branch:
3006   //
3007   // long_branch_bb:
3008   //   spill s[8:9]
3009   //   s_getpc_b64 s[8:9]
3010   //   s_add_u32 s8, s8, restore_bb
3011   //   s_addc_u32 s9, s9, 0
3012   //   s_setpc_b64 s[8:9]
3013   //
3014   // skip_long_branch:
3015   //   foo;
3016   //
3017   // .....
3018   //
3019   // dest_bb_fallthrough_predecessor:
3020   // bar;
3021   // s_branch dest_bb
3022   //
3023   // restore_bb:
3024   //  restore s[8:9]
3025   //  fallthrough dest_bb
3026   ///
3027   // dest_bb:
3028   //   buzz;
3029 
3030   Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3031   Register Scav;
3032 
3033   // If we've previously reserved a register for long branches,
3034   // avoid running the scavenger and just use it.
3035   if (LongBranchReservedReg) {
3036     RS->enterBasicBlock(MBB);
3037     Scav = LongBranchReservedReg;
3038   } else {
3039     RS->enterBasicBlockEnd(MBB);
3040     Scav = RS->scavengeRegisterBackwards(
3041         AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3042         /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3043   }
3044   if (Scav) {
3045     RS->setRegUsed(Scav);
3046     MRI.replaceRegWith(PCReg, Scav);
3047     MRI.clearVirtRegs();
3048   } else {
3049     // Since spilling an SGPR requires a VGPR, reuse the temporary VGPR's stack
3050     // slot for the SGPR spill.
3051     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3052     const SIRegisterInfo *TRI = ST.getRegisterInfo();
3053     TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3054     MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3055     MRI.clearVirtRegs();
3056   }
3057 
3058   MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3059   // Now the distance can be defined.
3060   auto *Offset = MCBinaryExpr::createSub(
3061       MCSymbolRefExpr::create(DestLabel, MCCtx),
3062       MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3063   // Add offset assignments.
3064   auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3065   OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3066   auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3067   OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3068 }
3069 
3070 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3071   switch (Cond) {
3072   case SIInstrInfo::SCC_TRUE:
3073     return AMDGPU::S_CBRANCH_SCC1;
3074   case SIInstrInfo::SCC_FALSE:
3075     return AMDGPU::S_CBRANCH_SCC0;
3076   case SIInstrInfo::VCCNZ:
3077     return AMDGPU::S_CBRANCH_VCCNZ;
3078   case SIInstrInfo::VCCZ:
3079     return AMDGPU::S_CBRANCH_VCCZ;
3080   case SIInstrInfo::EXECNZ:
3081     return AMDGPU::S_CBRANCH_EXECNZ;
3082   case SIInstrInfo::EXECZ:
3083     return AMDGPU::S_CBRANCH_EXECZ;
3084   default:
3085     llvm_unreachable("invalid branch predicate");
3086   }
3087 }
3088 
3089 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3090   switch (Opcode) {
3091   case AMDGPU::S_CBRANCH_SCC0:
3092     return SCC_FALSE;
3093   case AMDGPU::S_CBRANCH_SCC1:
3094     return SCC_TRUE;
3095   case AMDGPU::S_CBRANCH_VCCNZ:
3096     return VCCNZ;
3097   case AMDGPU::S_CBRANCH_VCCZ:
3098     return VCCZ;
3099   case AMDGPU::S_CBRANCH_EXECNZ:
3100     return EXECNZ;
3101   case AMDGPU::S_CBRANCH_EXECZ:
3102     return EXECZ;
3103   default:
3104     return INVALID_BR;
3105   }
3106 }
3107 
3108 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3109                                     MachineBasicBlock::iterator I,
3110                                     MachineBasicBlock *&TBB,
3111                                     MachineBasicBlock *&FBB,
3112                                     SmallVectorImpl<MachineOperand> &Cond,
3113                                     bool AllowModify) const {
3114   if (I->getOpcode() == AMDGPU::S_BRANCH) {
3115     // Unconditional Branch
3116     TBB = I->getOperand(0).getMBB();
3117     return false;
3118   }
3119 
3120   BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3121   if (Pred == INVALID_BR)
3122     return true;
3123 
3124   MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3125   Cond.push_back(MachineOperand::CreateImm(Pred));
3126   Cond.push_back(I->getOperand(1)); // Save the branch register.
3127 
3128   ++I;
3129 
3130   if (I == MBB.end()) {
3131     // Conditional branch followed by fall-through.
3132     TBB = CondBB;
3133     return false;
3134   }
3135 
3136   if (I->getOpcode() == AMDGPU::S_BRANCH) {
3137     TBB = CondBB;
3138     FBB = I->getOperand(0).getMBB();
3139     return false;
3140   }
3141 
3142   return true;
3143 }
3144 
3145 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3146                                 MachineBasicBlock *&FBB,
3147                                 SmallVectorImpl<MachineOperand> &Cond,
3148                                 bool AllowModify) const {
3149   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3150   auto E = MBB.end();
3151   if (I == E)
3152     return false;
3153 
3154   // Skip over instructions that are artificial terminators used for special
3155   // exec management.
3156   while (I != E && !I->isBranch() && !I->isReturn()) {
3157     switch (I->getOpcode()) {
3158     case AMDGPU::S_MOV_B64_term:
3159     case AMDGPU::S_XOR_B64_term:
3160     case AMDGPU::S_OR_B64_term:
3161     case AMDGPU::S_ANDN2_B64_term:
3162     case AMDGPU::S_AND_B64_term:
3163     case AMDGPU::S_AND_SAVEEXEC_B64_term:
3164     case AMDGPU::S_MOV_B32_term:
3165     case AMDGPU::S_XOR_B32_term:
3166     case AMDGPU::S_OR_B32_term:
3167     case AMDGPU::S_ANDN2_B32_term:
3168     case AMDGPU::S_AND_B32_term:
3169     case AMDGPU::S_AND_SAVEEXEC_B32_term:
3170       break;
3171     case AMDGPU::SI_IF:
3172     case AMDGPU::SI_ELSE:
3173     case AMDGPU::SI_KILL_I1_TERMINATOR:
3174     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3175       // FIXME: It's messy that these need to be considered here at all.
3176       return true;
3177     default:
3178       llvm_unreachable("unexpected non-branch terminator inst");
3179     }
3180 
3181     ++I;
3182   }
3183 
3184   if (I == E)
3185     return false;
3186 
3187   return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3188 }
3189 
3190 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3191                                    int *BytesRemoved) const {
3192   unsigned Count = 0;
3193   unsigned RemovedSize = 0;
3194   for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3195     // Skip over artificial terminators when removing instructions.
3196     if (MI.isBranch() || MI.isReturn()) {
3197       RemovedSize += getInstSizeInBytes(MI);
3198       MI.eraseFromParent();
3199       ++Count;
3200     }
3201   }
3202 
3203   if (BytesRemoved)
3204     *BytesRemoved = RemovedSize;
3205 
3206   return Count;
3207 }
3208 
3209 // Copy the flags onto the implicit condition register operand.
3210 static void preserveCondRegFlags(MachineOperand &CondReg,
3211                                  const MachineOperand &OrigCond) {
3212   CondReg.setIsUndef(OrigCond.isUndef());
3213   CondReg.setIsKill(OrigCond.isKill());
3214 }
3215 
3216 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3217                                    MachineBasicBlock *TBB,
3218                                    MachineBasicBlock *FBB,
3219                                    ArrayRef<MachineOperand> Cond,
3220                                    const DebugLoc &DL,
3221                                    int *BytesAdded) const {
3222   if (!FBB && Cond.empty()) {
3223     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3224       .addMBB(TBB);
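         // The size estimate is doubled on subtargets with the offset-0x3f hardware
         // bug, presumably to leave room for the workaround padding.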
3225     if (BytesAdded)
3226       *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3227     return 1;
3228   }
3229 
3230   assert(TBB && Cond[0].isImm());
3231 
3232   unsigned Opcode
3233     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3234 
3235   if (!FBB) {
3236     MachineInstr *CondBr =
3237       BuildMI(&MBB, DL, get(Opcode))
3238       .addMBB(TBB);
3239 
3240     // Copy the flags onto the implicit condition register operand.
3241     preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3242     fixImplicitOperands(*CondBr);
3243 
3244     if (BytesAdded)
3245       *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3246     return 1;
3247   }
3248 
3249   assert(TBB && FBB);
3250 
3251   MachineInstr *CondBr =
3252     BuildMI(&MBB, DL, get(Opcode))
3253     .addMBB(TBB);
3254   fixImplicitOperands(*CondBr);
3255   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3256     .addMBB(FBB);
3257 
3258   MachineOperand &CondReg = CondBr->getOperand(1);
3259   CondReg.setIsUndef(Cond[1].isUndef());
3260   CondReg.setIsKill(Cond[1].isKill());
3261 
3262   if (BytesAdded)
3263     *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3264 
3265   return 2;
3266 }
3267 
3268 bool SIInstrInfo::reverseBranchCondition(
3269   SmallVectorImpl<MachineOperand> &Cond) const {
3270   if (Cond.size() != 2) {
3271     return true;
3272   }
3273 
3274   if (Cond[0].isImm()) {
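         // BranchPredicate values are defined as +/- pairs, so negating the
         // immediate yields the inverse predicate.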
3275     Cond[0].setImm(-Cond[0].getImm());
3276     return false;
3277   }
3278 
3279   return true;
3280 }
3281 
3282 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3283                                   ArrayRef<MachineOperand> Cond,
3284                                   Register DstReg, Register TrueReg,
3285                                   Register FalseReg, int &CondCycles,
3286                                   int &TrueCycles, int &FalseCycles) const {
3287   switch (Cond[0].getImm()) {
3288   case VCCNZ:
3289   case VCCZ: {
3290     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3291     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3292     if (MRI.getRegClass(FalseReg) != RC)
3293       return false;
3294 
3295     int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3296     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3297 
3298     // Limit to equal cost for branch vs. N v_cndmask_b32s.
3299     return RI.hasVGPRs(RC) && NumInsts <= 6;
3300   }
3301   case SCC_TRUE:
3302   case SCC_FALSE: {
3303     // FIXME: We could insert for VGPRs if we could replace the original compare
3304     // with a vector one.
3305     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3306     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3307     if (MRI.getRegClass(FalseReg) != RC)
3308       return false;
3309 
3310     int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3311 
3312     // Widths that are a multiple of 64 bits can use s_cselect_b64.
3313     if (NumInsts % 2 == 0)
3314       NumInsts /= 2;
3315 
3316     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3317     return RI.isSGPRClass(RC);
3318   }
3319   default:
3320     return false;
3321   }
3322 }
3323 
3324 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3325                                MachineBasicBlock::iterator I, const DebugLoc &DL,
3326                                Register DstReg, ArrayRef<MachineOperand> Cond,
3327                                Register TrueReg, Register FalseReg) const {
3328   BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3329   if (Pred == VCCZ || Pred == SCC_FALSE) {
3330     Pred = static_cast<BranchPredicate>(-Pred);
3331     std::swap(TrueReg, FalseReg);
3332   }
3333 
3334   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3335   const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3336   unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3337 
3338   if (DstSize == 32) {
3339     MachineInstr *Select;
3340     if (Pred == SCC_TRUE) {
3341       Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3342         .addReg(TrueReg)
3343         .addReg(FalseReg);
3344     } else {
3345       // Instruction's operands are backwards from what is expected.
3346       Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3347         .addReg(FalseReg)
3348         .addReg(TrueReg);
3349     }
3350 
3351     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3352     return;
3353   }
3354 
3355   if (DstSize == 64 && Pred == SCC_TRUE) {
3356     MachineInstr *Select =
3357       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3358       .addReg(TrueReg)
3359       .addReg(FalseReg);
3360 
3361     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3362     return;
3363   }
3364 
3365   static const int16_t Sub0_15[] = {
3366     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3367     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3368     AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3369     AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3370   };
3371 
3372   static const int16_t Sub0_15_64[] = {
3373     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3374     AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3375     AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3376     AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3377   };
3378 
3379   unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3380   const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3381   const int16_t *SubIndices = Sub0_15;
3382   int NElts = DstSize / 32;
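       // By default emit one v_cndmask_b32 per 32-bit element and combine the
       // results with the REG_SEQUENCE built below.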
3383 
3384   // 64-bit select is only available for SALU.
3385   // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3386   if (Pred == SCC_TRUE) {
3387     if (NElts % 2) {
3388       SelOp = AMDGPU::S_CSELECT_B32;
3389       EltRC = &AMDGPU::SGPR_32RegClass;
3390     } else {
3391       SelOp = AMDGPU::S_CSELECT_B64;
3392       EltRC = &AMDGPU::SGPR_64RegClass;
3393       SubIndices = Sub0_15_64;
3394       NElts /= 2;
3395     }
3396   }
3397 
3398   MachineInstrBuilder MIB = BuildMI(
3399     MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3400 
3401   I = MIB->getIterator();
3402 
3403   SmallVector<Register, 8> Regs;
3404   for (int Idx = 0; Idx != NElts; ++Idx) {
3405     Register DstElt = MRI.createVirtualRegister(EltRC);
3406     Regs.push_back(DstElt);
3407 
3408     unsigned SubIdx = SubIndices[Idx];
3409 
3410     MachineInstr *Select;
3411     if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3412       Select =
3413         BuildMI(MBB, I, DL, get(SelOp), DstElt)
3414         .addReg(FalseReg, 0, SubIdx)
3415         .addReg(TrueReg, 0, SubIdx);
3416     } else {
3417       Select =
3418         BuildMI(MBB, I, DL, get(SelOp), DstElt)
3419         .addReg(TrueReg, 0, SubIdx)
3420         .addReg(FalseReg, 0, SubIdx);
3421     }
3422 
3423     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3424     fixImplicitOperands(*Select);
3425 
3426     MIB.addReg(DstElt)
3427        .addImm(SubIdx);
3428   }
3429 }
3430 
3431 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3432   switch (MI.getOpcode()) {
3433   case AMDGPU::V_MOV_B16_t16_e32:
3434   case AMDGPU::V_MOV_B16_t16_e64:
3435   case AMDGPU::V_MOV_B32_e32:
3436   case AMDGPU::V_MOV_B32_e64:
3437   case AMDGPU::V_MOV_B64_PSEUDO:
3438   case AMDGPU::V_MOV_B64_e32:
3439   case AMDGPU::V_MOV_B64_e64:
3440   case AMDGPU::S_MOV_B32:
3441   case AMDGPU::S_MOV_B64:
3442   case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3443   case AMDGPU::COPY:
3444   case AMDGPU::WWM_COPY:
3445   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3446   case AMDGPU::V_ACCVGPR_READ_B32_e64:
3447   case AMDGPU::V_ACCVGPR_MOV_B32:
3448   case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3449     return true;
3450   default:
3451     return false;
3452   }
3453 }
3454 
3455 static constexpr AMDGPU::OpName ModifierOpNames[] = {
3456     AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3457     AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3458     AMDGPU::OpName::omod,           AMDGPU::OpName::op_sel};
3459 
3460 void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3461   unsigned Opc = MI.getOpcode();
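       // Remove operands starting from the back so that each removal does not shift
       // the indices of the modifier operands still to be removed.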
3462   for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3463     int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3464     if (Idx >= 0)
3465       MI.removeOperand(Idx);
3466   }
3467 }
3468 
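     // For example, extracting sub1 from 0x0123456789ABCDEF yields 0x01234567, and
     // lo16 yields the sign-extended low half 0xFFFFFFFFFFFFCDEF.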
3469 std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3470                                                          unsigned SubRegIndex) {
3471   switch (SubRegIndex) {
3472   case AMDGPU::NoSubRegister:
3473     return Imm;
3474   case AMDGPU::sub0:
3475     return Lo_32(Imm);
3476   case AMDGPU::sub1:
3477     return Hi_32(Imm);
3478   case AMDGPU::lo16:
3479     return SignExtend64<16>(Imm);
3480   case AMDGPU::hi16:
3481     return SignExtend64<16>(Imm >> 16);
3482   case AMDGPU::sub1_lo16:
3483     return SignExtend64<16>(Imm >> 32);
3484   case AMDGPU::sub1_hi16:
3485     return SignExtend64<16>(Imm >> 48);
3486   default:
3487     return std::nullopt;
3488   }
3489 
3490   llvm_unreachable("covered subregister switch");
3491 }
3492 
3493 static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3494   switch (Opc) {
3495   case AMDGPU::V_MAC_F16_e32:
3496   case AMDGPU::V_MAC_F16_e64:
3497   case AMDGPU::V_MAD_F16_e64:
3498     return AMDGPU::V_MADAK_F16;
3499   case AMDGPU::V_MAC_F32_e32:
3500   case AMDGPU::V_MAC_F32_e64:
3501   case AMDGPU::V_MAD_F32_e64:
3502     return AMDGPU::V_MADAK_F32;
3503   case AMDGPU::V_FMAC_F32_e32:
3504   case AMDGPU::V_FMAC_F32_e64:
3505   case AMDGPU::V_FMA_F32_e64:
3506     return AMDGPU::V_FMAAK_F32;
3507   case AMDGPU::V_FMAC_F16_e32:
3508   case AMDGPU::V_FMAC_F16_e64:
3509   case AMDGPU::V_FMAC_F16_t16_e64:
3510   case AMDGPU::V_FMAC_F16_fake16_e64:
3511   case AMDGPU::V_FMA_F16_e64:
3512     return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3513                                         ? AMDGPU::V_FMAAK_F16_t16
3514                                         : AMDGPU::V_FMAAK_F16_fake16
3515                                   : AMDGPU::V_FMAAK_F16;
3516   case AMDGPU::V_FMAC_F64_e32:
3517   case AMDGPU::V_FMAC_F64_e64:
3518   case AMDGPU::V_FMA_F64_e64:
3519     return AMDGPU::V_FMAAK_F64;
3520   default:
3521     llvm_unreachable("invalid instruction");
3522   }
3523 }
3524 
3525 static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3526   switch (Opc) {
3527   case AMDGPU::V_MAC_F16_e32:
3528   case AMDGPU::V_MAC_F16_e64:
3529   case AMDGPU::V_MAD_F16_e64:
3530     return AMDGPU::V_MADMK_F16;
3531   case AMDGPU::V_MAC_F32_e32:
3532   case AMDGPU::V_MAC_F32_e64:
3533   case AMDGPU::V_MAD_F32_e64:
3534     return AMDGPU::V_MADMK_F32;
3535   case AMDGPU::V_FMAC_F32_e32:
3536   case AMDGPU::V_FMAC_F32_e64:
3537   case AMDGPU::V_FMA_F32_e64:
3538     return AMDGPU::V_FMAMK_F32;
3539   case AMDGPU::V_FMAC_F16_e32:
3540   case AMDGPU::V_FMAC_F16_e64:
3541   case AMDGPU::V_FMAC_F16_t16_e64:
3542   case AMDGPU::V_FMAC_F16_fake16_e64:
3543   case AMDGPU::V_FMA_F16_e64:
3544     return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3545                                         ? AMDGPU::V_FMAMK_F16_t16
3546                                         : AMDGPU::V_FMAMK_F16_fake16
3547                                   : AMDGPU::V_FMAMK_F16;
3548   case AMDGPU::V_FMAC_F64_e32:
3549   case AMDGPU::V_FMAC_F64_e64:
3550   case AMDGPU::V_FMA_F64_e64:
3551     return AMDGPU::V_FMAMK_F64;
3552   default:
3553     llvm_unreachable("invalid instruction");
3554   }
3555 }
3556 
3557 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3558                                 Register Reg, MachineRegisterInfo *MRI) const {
3559   if (!MRI->hasOneNonDBGUse(Reg))
3560     return false;
3561 
3562   int64_t Imm;
3563   if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3564     return false;
3565 
3566   assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3567 
3568   unsigned Opc = UseMI.getOpcode();
3569   if (Opc == AMDGPU::COPY) {
3570     assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3571 
3572     Register DstReg = UseMI.getOperand(0).getReg();
3573     unsigned OpSize = getOpSize(UseMI, 0);
3574     bool Is16Bit = OpSize == 2;
3575     bool Is64Bit = OpSize == 8;
3576     bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3577     unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3578                                            : AMDGPU::V_MOV_B32_e32
3579                                  : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3580                                            : AMDGPU::S_MOV_B32;
3581 
3582     std::optional<int64_t> SubRegImm =
3583         extractSubregFromImm(Imm, UseMI.getOperand(1).getSubReg());
3584 
3585     APInt Imm(Is64Bit ? 64 : 32, *SubRegImm,
3586               /*isSigned=*/true, /*implicitTrunc=*/true);
3587 
3588     if (RI.isAGPR(*MRI, DstReg)) {
3589       if (Is64Bit || !isInlineConstant(Imm))
3590         return false;
3591       NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3592     }
3593 
3594     if (Is16Bit) {
3595       if (isVGPRCopy)
3596         return false; // Do not clobber vgpr_hi16
3597 
3598       if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3599         return false;
3600 
3601       UseMI.getOperand(0).setSubReg(0);
3602       if (DstReg.isPhysical()) {
3603         DstReg = RI.get32BitRegister(DstReg);
3604         UseMI.getOperand(0).setReg(DstReg);
3605       }
3606       assert(UseMI.getOperand(1).getReg().isVirtual());
3607     }
3608 
3609     MachineFunction *MF = UseMI.getMF();
3610     const MCInstrDesc &NewMCID = get(NewOpc);
3611     const TargetRegisterClass *NewDefRC = getRegClass(NewMCID, 0, &RI, *MF);
3612 
3613     if (DstReg.isPhysical()) {
3614       if (!NewDefRC->contains(DstReg))
3615         return false;
3616     } else if (!MRI->constrainRegClass(DstReg, NewDefRC))
3617       return false;
3618 
3619     UseMI.setDesc(NewMCID);
3620     UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3621     UseMI.addImplicitDefUseOperands(*MF);
3622     return true;
3623   }
3624 
3625   if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3626       Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3627       Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3628       Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3629       Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3630       Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3631       Opc == AMDGPU::V_FMAC_F64_e64) {
3632     // Don't fold if we are using source or output modifiers. The new VOP2
3633     // instructions don't have them.
3634     if (hasAnyModifiersSet(UseMI))
3635       return false;
3636 
3637     // If this is a free constant, there's no reason to do this.
3638     // TODO: We could fold this here instead of letting SIFoldOperands do it
3639     // later.
3640     int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3641 
3642     // Any src operand can be used for the legality check.
3643     if (isInlineConstant(UseMI, Src0Idx, Imm))
3644       return false;
3645 
3646     MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3647 
3648     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3649     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3650 
3651     // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3652     if ((Src0->isReg() && Src0->getReg() == Reg) ||
3653         (Src1->isReg() && Src1->getReg() == Reg)) {
3654       MachineOperand *RegSrc =
3655           Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3656       if (!RegSrc->isReg())
3657         return false;
3658       if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3659           ST.getConstantBusLimit(Opc) < 2)
3660         return false;
3661 
3662       if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3663         return false;
3664 
3665       // If src2 is also a literal constant then we have to choose which one to
3666       // fold. In general it is better to choose madak so that the other literal
3667       // can be materialized in an sgpr instead of a vgpr:
3668       //   s_mov_b32 s0, literal
3669       //   v_madak_f32 v0, s0, v0, literal
3670       // Instead of:
3671       //   v_mov_b32 v1, literal
3672       //   v_madmk_f32 v0, v0, literal, v1
3673       MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3674       if (Def && Def->isMoveImmediate() &&
3675           !isInlineConstant(Def->getOperand(1)))
3676         return false;
3677 
3678       unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3679       if (pseudoToMCOpcode(NewOpc) == -1)
3680         return false;
3681 
3682       // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3683       // takes VGPR_32_Lo128 operands, so the rewrite would also require
3684       // restricting their register classes. For now just bail out.
3685       if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3686           NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3687         return false;
3688 
3689       const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3690           Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3691 
3692       // FIXME: This would be a lot easier if we could return a new instruction
3693       // instead of having to modify in place.
3694 
3695       Register SrcReg = RegSrc->getReg();
3696       unsigned SrcSubReg = RegSrc->getSubReg();
3697       Src0->setReg(SrcReg);
3698       Src0->setSubReg(SrcSubReg);
3699       Src0->setIsKill(RegSrc->isKill());
3700 
3701       if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3702           Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3703           Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3704           Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3705         UseMI.untieRegOperand(
3706             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3707 
3708       Src1->ChangeToImmediate(*SubRegImm);
3709 
3710       removeModOperands(UseMI);
3711       UseMI.setDesc(get(NewOpc));
3712 
3713       bool DeleteDef = MRI->use_nodbg_empty(Reg);
3714       if (DeleteDef)
3715         DefMI.eraseFromParent();
3716 
3717       return true;
3718     }
3719 
3720     // Added part is the constant: Use v_madak_{f16, f32}.
3721     if (Src2->isReg() && Src2->getReg() == Reg) {
3722       if (ST.getConstantBusLimit(Opc) < 2) {
3723         // Not allowed to use constant bus for another operand.
3724         // We can however allow an inline immediate as src0.
3725         bool Src0Inlined = false;
3726         if (Src0->isReg()) {
3727           // Try to inline constant if possible.
3728           // If the Def moves immediate and the use is single
3729           // We are saving VGPR here.
3730           MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3731           if (Def && Def->isMoveImmediate() &&
3732               isInlineConstant(Def->getOperand(1)) &&
3733               MRI->hasOneUse(Src0->getReg())) {
3734             Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3735             Src0Inlined = true;
3736           } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3737                      RI.isSGPRReg(*MRI, Src0->getReg())) {
3738             return false;
3739           }
3740           // VGPR is okay as Src0 - fallthrough
3741         }
3742 
3743         if (Src1->isReg() && !Src0Inlined) {
3744           // We still have one slot available for an inlinable constant - try to fill it.
3745           MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3746           if (Def && Def->isMoveImmediate() &&
3747               isInlineConstant(Def->getOperand(1)) &&
3748               MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3749             Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3750           else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3751             return false;
3752           // VGPR is okay as Src1 - fallthrough
3753         }
3754       }
3755 
3756       unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3757       if (pseudoToMCOpcode(NewOpc) == -1)
3758         return false;
3759 
3760       // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3761       // takes VGPR_32_Lo128 operands, so the rewrite would also require
3762       // restricting their register classes. For now just bail out.
3763       if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3764           NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3765         return false;
3766 
3767       // FIXME: This would be a lot easier if we could return a new instruction
3768       // instead of having to modify in place.
3769 
3770       if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3771           Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3772           Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3773           Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3774         UseMI.untieRegOperand(
3775             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3776 
3777       const std::optional<int64_t> SubRegImm =
3778           extractSubregFromImm(Imm, Src2->getSubReg());
3779 
3780       // ChangingToImmediate adds Src2 back to the instruction.
3781       Src2->ChangeToImmediate(*SubRegImm);
3782 
3783       // These come before src2.
3784       removeModOperands(UseMI);
3785       UseMI.setDesc(get(NewOpc));
3786       // UseMI may have been commuted, leaving an SGPR as src1. In that case an
3787       // inline constant together with an SGPR would be illegal, so re-legalize
3788       // the operands.
3789       legalizeOperands(UseMI);
3790 
3791       bool DeleteDef = MRI->use_nodbg_empty(Reg);
3792       if (DeleteDef)
3793         DefMI.eraseFromParent();
3794 
3795       return true;
3796     }
3797   }
3798 
3799   return false;
3800 }
3801 
3802 static bool
3803 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3804                            ArrayRef<const MachineOperand *> BaseOps2) {
3805   if (BaseOps1.size() != BaseOps2.size())
3806     return false;
3807   for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3808     if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3809       return false;
3810   }
3811   return true;
3812 }
3813 
3814 static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3815                                 LocationSize WidthB, int OffsetB) {
3816   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3817   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3818   LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
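         // The accesses are disjoint when the lower one ends at or before the higher
         // one begins, e.g. [0, 4) and [4, 8).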
3819   return LowWidth.hasValue() &&
3820          LowOffset + (int)LowWidth.getValue() <= HighOffset;
3821 }
3822 
3823 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3824                                                const MachineInstr &MIb) const {
3825   SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3826   int64_t Offset0, Offset1;
3827   LocationSize Dummy0 = LocationSize::precise(0);
3828   LocationSize Dummy1 = LocationSize::precise(0);
3829   bool Offset0IsScalable, Offset1IsScalable;
3830   if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3831                                      Dummy0, &RI) ||
3832       !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3833                                      Dummy1, &RI))
3834     return false;
3835 
3836   if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3837     return false;
3838 
3839   if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3840     // FIXME: Handle ds_read2 / ds_write2.
3841     return false;
3842   }
3843   LocationSize Width0 = MIa.memoperands().front()->getSize();
3844   LocationSize Width1 = MIb.memoperands().front()->getSize();
3845   return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3846 }
3847 
3848 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3849                                                   const MachineInstr &MIb) const {
3850   assert(MIa.mayLoadOrStore() &&
3851          "MIa must load from or modify a memory location");
3852   assert(MIb.mayLoadOrStore() &&
3853          "MIb must load from or modify a memory location");
3854 
3855   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3856     return false;
3857 
3858   // XXX - Can we relax this between address spaces?
3859   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3860     return false;
3861 
3862   if (isLDSDMA(MIa) || isLDSDMA(MIb))
3863     return false;
3864 
3865   // TODO: Should we check the address space from the MachineMemOperand? That
3866   // would allow us to distinguish objects we know don't alias based on the
3867   // underlying address space, even if it was lowered to a different one,
3868   // e.g. private accesses lowered to use MUBUF instructions on a scratch
3869   // buffer.
3870   if (isDS(MIa)) {
3871     if (isDS(MIb))
3872       return checkInstOffsetsDoNotOverlap(MIa, MIb);
3873 
3874     return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3875   }
3876 
3877   if (isMUBUF(MIa) || isMTBUF(MIa)) {
3878     if (isMUBUF(MIb) || isMTBUF(MIb))
3879       return checkInstOffsetsDoNotOverlap(MIa, MIb);
3880 
3881     if (isFLAT(MIb))
3882       return isFLATScratch(MIb);
3883 
3884     return !isSMRD(MIb);
3885   }
3886 
3887   if (isSMRD(MIa)) {
3888     if (isSMRD(MIb))
3889       return checkInstOffsetsDoNotOverlap(MIa, MIb);
3890 
3891     if (isFLAT(MIb))
3892       return isFLATScratch(MIb);
3893 
3894     return !isMUBUF(MIb) && !isMTBUF(MIb);
3895   }
3896 
3897   if (isFLAT(MIa)) {
3898     if (isFLAT(MIb)) {
3899       if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3900           (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3901         return true;
3902 
3903       return checkInstOffsetsDoNotOverlap(MIa, MIb);
3904     }
3905 
3906     return false;
3907   }
3908 
3909   return false;
3910 }
3911 
3912 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3913                            int64_t &Imm, MachineInstr **DefMI = nullptr) {
3914   if (Reg.isPhysical())
3915     return false;
3916   auto *Def = MRI.getUniqueVRegDef(Reg);
3917   if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3918     Imm = Def->getOperand(1).getImm();
3919     if (DefMI)
3920       *DefMI = Def;
3921     return true;
3922   }
3923   return false;
3924 }
3925 
3926 static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3927                            MachineInstr **DefMI = nullptr) {
3928   if (!MO->isReg())
3929     return false;
3930   const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3931   const MachineRegisterInfo &MRI = MF->getRegInfo();
3932   return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3933 }
3934 
3935 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3936                                 MachineInstr &NewMI) {
3937   if (LV) {
3938     unsigned NumOps = MI.getNumOperands();
3939     for (unsigned I = 1; I < NumOps; ++I) {
3940       MachineOperand &Op = MI.getOperand(I);
3941       if (Op.isReg() && Op.isKill())
3942         LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3943     }
3944   }
3945 }
3946 
3947 static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
3948   switch (Opc) {
3949   case AMDGPU::V_MAC_F16_e32:
3950   case AMDGPU::V_MAC_F16_e64:
3951     return AMDGPU::V_MAD_F16_e64;
3952   case AMDGPU::V_MAC_F32_e32:
3953   case AMDGPU::V_MAC_F32_e64:
3954     return AMDGPU::V_MAD_F32_e64;
3955   case AMDGPU::V_MAC_LEGACY_F32_e32:
3956   case AMDGPU::V_MAC_LEGACY_F32_e64:
3957     return AMDGPU::V_MAD_LEGACY_F32_e64;
3958   case AMDGPU::V_FMAC_LEGACY_F32_e32:
3959   case AMDGPU::V_FMAC_LEGACY_F32_e64:
3960     return AMDGPU::V_FMA_LEGACY_F32_e64;
3961   case AMDGPU::V_FMAC_F16_e32:
3962   case AMDGPU::V_FMAC_F16_e64:
3963   case AMDGPU::V_FMAC_F16_t16_e64:
3964   case AMDGPU::V_FMAC_F16_fake16_e64:
3965     return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3966                                         ? AMDGPU::V_FMA_F16_gfx9_t16_e64
3967                                         : AMDGPU::V_FMA_F16_gfx9_fake16_e64
3968                                   : AMDGPU::V_FMA_F16_gfx9_e64;
3969   case AMDGPU::V_FMAC_F32_e32:
3970   case AMDGPU::V_FMAC_F32_e64:
3971     return AMDGPU::V_FMA_F32_e64;
3972   case AMDGPU::V_FMAC_F64_e32:
3973   case AMDGPU::V_FMAC_F64_e64:
3974     return AMDGPU::V_FMA_F64_e64;
3975   default:
3976     llvm_unreachable("invalid instruction");
3977   }
3978 }
3979 
3980 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3981                                                  LiveVariables *LV,
3982                                                  LiveIntervals *LIS) const {
3983   MachineBasicBlock &MBB = *MI.getParent();
3984   unsigned Opc = MI.getOpcode();
3985 
3986   // Handle MFMA.
3987   int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3988   if (NewMFMAOpc != -1) {
3989     MachineInstrBuilder MIB =
3990         BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3991     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3992       MIB.add(MI.getOperand(I));
3993     updateLiveVariables(LV, MI, *MIB);
3994     if (LIS) {
3995       LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3996       // SlotIndex of defs needs to be updated when converting to early-clobber
3997       MachineOperand &Def = MIB->getOperand(0);
3998       if (Def.isEarlyClobber() && Def.isReg() &&
3999           LIS->hasInterval(Def.getReg())) {
4000         SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
4001         SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
4002         auto &LI = LIS->getInterval(Def.getReg());
4003         auto UpdateDefIndex = [&](LiveRange &LR) {
4004           auto *S = LR.find(OldIndex);
4005           if (S != LR.end() && S->start == OldIndex) {
4006             assert(S->valno && S->valno->def == OldIndex);
4007             S->start = NewIndex;
4008             S->valno->def = NewIndex;
4009           }
4010         };
4011         UpdateDefIndex(LI);
4012         for (auto &SR : LI.subranges())
4013           UpdateDefIndex(SR);
4014       }
4015     }
4016     return MIB;
4017   }
4018 
4019   if (SIInstrInfo::isWMMA(MI)) {
4020     unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4021     MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4022                                   .setMIFlags(MI.getFlags());
4023     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4024       MIB->addOperand(MI.getOperand(I));
4025 
4026     updateLiveVariables(LV, MI, *MIB);
4027     if (LIS)
4028       LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4029 
4030     return MIB;
4031   }
4032 
4033   assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4034          Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4035          "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4036          "present pre-RA");
4037 
4038   // Handle MAC/FMAC.
4039   bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4040   bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4041                   Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4042                   Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4043                   Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4044   bool Src0Literal = false;
4045 
4046   switch (Opc) {
4047   default:
4048     return nullptr;
4049   case AMDGPU::V_MAC_F16_e64:
4050   case AMDGPU::V_FMAC_F16_e64:
4051   case AMDGPU::V_FMAC_F16_t16_e64:
4052   case AMDGPU::V_FMAC_F16_fake16_e64:
4053   case AMDGPU::V_MAC_F32_e64:
4054   case AMDGPU::V_MAC_LEGACY_F32_e64:
4055   case AMDGPU::V_FMAC_F32_e64:
4056   case AMDGPU::V_FMAC_LEGACY_F32_e64:
4057   case AMDGPU::V_FMAC_F64_e64:
4058     break;
4059   case AMDGPU::V_MAC_F16_e32:
4060   case AMDGPU::V_FMAC_F16_e32:
4061   case AMDGPU::V_MAC_F32_e32:
4062   case AMDGPU::V_MAC_LEGACY_F32_e32:
4063   case AMDGPU::V_FMAC_F32_e32:
4064   case AMDGPU::V_FMAC_LEGACY_F32_e32:
4065   case AMDGPU::V_FMAC_F64_e32: {
4066     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4067                                              AMDGPU::OpName::src0);
4068     const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4069     if (!Src0->isReg() && !Src0->isImm())
4070       return nullptr;
4071 
4072     if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4073       Src0Literal = true;
4074 
4075     break;
4076   }
4077   }
4078 
4079   MachineInstrBuilder MIB;
4080   const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4081   const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4082   const MachineOperand *Src0Mods =
4083     getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4084   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4085   const MachineOperand *Src1Mods =
4086     getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4087   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4088   const MachineOperand *Src2Mods =
4089       getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4090   const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4091   const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4092   const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4093 
4094   if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4095       (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4096       // If we have an SGPR input, we will violate the constant bus restriction.
4097       (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4098        !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4099     MachineInstr *DefMI;
4100     const auto killDef = [&]() -> void {
4101       MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4102       // The only user is the instruction which will be killed.
4103       Register DefReg = DefMI->getOperand(0).getReg();
4104 
4105       if (MRI.hasOneNonDBGUse(DefReg)) {
4106         // We cannot just remove the DefMI here, calling pass will crash.
4107         DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
4108         DefMI->getOperand(0).setIsDead(true);
4109         for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
4110           DefMI->removeOperand(I);
4111         if (LV)
4112           LV->getVarInfo(DefReg).AliveBlocks.clear();
4113       }
4114 
4115       if (LIS) {
4116         LiveInterval &DefLI = LIS->getInterval(DefReg);
4117 
4118         // We cannot delete the original instruction here, so hack out the use
4119         // in the original instruction with a dummy register so we can use
4120         // shrinkToUses to deal with any multi-use edge cases. Other targets do
4121         // not have the complexity of deleting a use to consider here.
4122         Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4123         for (MachineOperand &MIOp : MI.uses()) {
4124           if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4125             MIOp.setIsUndef(true);
4126             MIOp.setReg(DummyReg);
4127           }
4128         }
4129 
4130         LIS->shrinkToUses(&DefLI);
4131       }
4132     };
4133 
4134     int64_t Imm;
4135     if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4136       unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4137       if (pseudoToMCOpcode(NewOpc) != -1) {
4138         MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4139                   .add(*Dst)
4140                   .add(*Src0)
4141                   .add(*Src1)
4142                   .addImm(Imm)
4143                   .setMIFlags(MI.getFlags());
4144         updateLiveVariables(LV, MI, *MIB);
4145         if (LIS)
4146           LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4147         killDef();
4148         return MIB;
4149       }
4150     }
4151     unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4152     if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4153       if (pseudoToMCOpcode(NewOpc) != -1) {
4154         MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4155                   .add(*Dst)
4156                   .add(*Src0)
4157                   .addImm(Imm)
4158                   .add(*Src2)
4159                   .setMIFlags(MI.getFlags());
4160         updateLiveVariables(LV, MI, *MIB);
4161 
4162         if (LIS)
4163           LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4164         killDef();
4165         return MIB;
4166       }
4167     }
4168     if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4169       if (Src0Literal) {
4170         Imm = Src0->getImm();
4171         DefMI = nullptr;
4172       }
4173       if (pseudoToMCOpcode(NewOpc) != -1 &&
4174           isOperandLegal(
4175               MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4176               Src1)) {
4177         MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4178                   .add(*Dst)
4179                   .add(*Src1)
4180                   .addImm(Imm)
4181                   .add(*Src2)
4182                   .setMIFlags(MI.getFlags());
4183         updateLiveVariables(LV, MI, *MIB);
4184 
4185         if (LIS)
4186           LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4187         if (DefMI)
4188           killDef();
4189         return MIB;
4190       }
4191     }
4192   }
4193 
4194   // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4195   // if VOP3 does not allow a literal operand.
4196   if (Src0Literal && !ST.hasVOP3Literal())
4197     return nullptr;
4198 
4199   unsigned NewOpc = getNewFMAInst(ST, Opc);
4200 
4201   if (pseudoToMCOpcode(NewOpc) == -1)
4202     return nullptr;
4203 
4204   MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4205             .add(*Dst)
4206             .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4207             .add(*Src0)
4208             .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4209             .add(*Src1)
4210             .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4211             .add(*Src2)
4212             .addImm(Clamp ? Clamp->getImm() : 0)
4213             .addImm(Omod ? Omod->getImm() : 0)
4214             .setMIFlags(MI.getFlags());
4215   if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4216     MIB.addImm(OpSel ? OpSel->getImm() : 0);
4217   updateLiveVariables(LV, MI, *MIB);
4218   if (LIS)
4219     LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4220   return MIB;
4221 }
4222 
4223 // It's not generally safe to move VALU instructions across these since it will
4224 // start using the register as a base index rather than directly.
4225 // XXX - Why isn't hasSideEffects sufficient for these?
4226 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4227   switch (MI.getOpcode()) {
4228   case AMDGPU::S_SET_GPR_IDX_ON:
4229   case AMDGPU::S_SET_GPR_IDX_MODE:
4230   case AMDGPU::S_SET_GPR_IDX_OFF:
4231     return true;
4232   default:
4233     return false;
4234   }
4235 }
4236 
4237 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4238                                        const MachineBasicBlock *MBB,
4239                                        const MachineFunction &MF) const {
4240   // Skipping the check for SP writes in the base implementation. It was
4241   // apparently added due to compile-time concerns.
4242   //
4243   // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4244   // but is probably avoidable.
4245 
4246   // Copied from base implementation.
4247   // Terminators and labels can't be scheduled around.
4248   if (MI.isTerminator() || MI.isPosition())
4249     return true;
4250 
4251   // INLINEASM_BR can jump to another block
4252   if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4253     return true;
4254 
4255   if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4256     return true;
4257 
4258   // Target-independent instructions do not have an implicit-use of EXEC, even
4259   // when they operate on VGPRs. Treating EXEC modifications as scheduling
4260   // boundaries prevents incorrect movements of such instructions.
4261   return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4262          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4263          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4264          MI.getOpcode() == AMDGPU::S_SETPRIO ||
4265          changesVGPRIndexingMode(MI);
4266 }
4267 
4268 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4269   return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4270          Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4271          Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4272 }
4273 
4274 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4275   // Skip the full operand and register alias search modifiesRegister
4276   // does. There's only a handful of instructions that touch this, it's only an
4277   // implicit def, and doesn't alias any other registers.
4278   return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4279 }
4280 
4281 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4282   unsigned Opcode = MI.getOpcode();
4283 
4284   if (MI.mayStore() && isSMRD(MI))
4285     return true; // scalar store or atomic
4286 
4287   // This will terminate the function when other lanes may need to continue.
4288   if (MI.isReturn())
4289     return true;
4290 
4291   // These instructions cause shader I/O that may cause hardware lockups
4292   // when executed with an empty EXEC mask.
4293   //
4294   // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4295   //       EXEC = 0, but checking for that case here seems not worth it
4296   //       given the typical code patterns.
4297   if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4298       isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4299       Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4300     return true;
4301 
4302   if (MI.isCall() || MI.isInlineAsm())
4303     return true; // conservative assumption
4304 
4305   // Assume that barrier interactions are only intended with active lanes.
4306   if (isBarrier(Opcode))
4307     return true;
4308 
4309   // A mode change is a scalar operation that influences vector instructions.
4310   if (modifiesModeRegister(MI))
4311     return true;
4312 
4313   // These are like SALU instructions in terms of effects, so it's questionable
4314   // whether we should return true for those.
4315   //
4316   // However, executing them with EXEC = 0 causes them to operate on undefined
4317   // data, which we avoid by returning true here.
4318   if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4319       Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4320       Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4321       Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4322     return true;
4323 
4324   return false;
4325 }
4326 
4327 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4328                               const MachineInstr &MI) const {
4329   if (MI.isMetaInstruction())
4330     return false;
4331 
4332   // This won't read exec if this is an SGPR->SGPR copy.
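  // A copy that writes a VGPR is lane-masked by EXEC, so only an SGPR
  // destination lets us skip the implicit EXEC read.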
4333   if (MI.isCopyLike()) {
4334     if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4335       return true;
4336 
4337     // Make sure this isn't copying exec as a normal operand
4338     return MI.readsRegister(AMDGPU::EXEC, &RI);
4339   }
4340 
4341   // Make a conservative assumption about the callee.
4342   if (MI.isCall())
4343     return true;
4344 
4345   // Be conservative with any unhandled generic opcodes.
4346   if (!isTargetSpecificOpcode(MI.getOpcode()))
4347     return true;
4348 
4349   return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4350 }
4351 
4352 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4353   switch (Imm.getBitWidth()) {
4354   case 1: // This likely will be a condition code mask.
4355     return true;
4356 
4357   case 32:
4358     return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4359                                         ST.hasInv2PiInlineImm());
4360   case 64:
4361     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4362                                         ST.hasInv2PiInlineImm());
4363   case 16:
4364     return ST.has16BitInsts() &&
4365            AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4366                                          ST.hasInv2PiInlineImm());
4367   default:
4368     llvm_unreachable("invalid bitwidth");
4369   }
4370 }
4371 
4372 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4373   APInt IntImm = Imm.bitcastToAPInt();
4374   int64_t IntImmVal = IntImm.getSExtValue();
4375   bool HasInv2Pi = ST.hasInv2PiInlineImm();
4376   switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4377   default:
4378     llvm_unreachable("invalid fltSemantics");
4379   case APFloatBase::S_IEEEsingle:
4380   case APFloatBase::S_IEEEdouble:
4381     return isInlineConstant(IntImm);
4382   case APFloatBase::S_BFloat:
4383     return ST.has16BitInsts() &&
4384            AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4385   case APFloatBase::S_IEEEhalf:
4386     return ST.has16BitInsts() &&
4387            AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4388   }
4389 }
4390 
4391 bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4392   // MachineOperand provides no way to tell the true operand size, since it only
4393   // records a 64-bit value. We need to know the size to determine if a 32-bit
4394   // floating point immediate bit pattern is legal for an integer immediate. It
4395   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4396   switch (OperandType) {
4397   case AMDGPU::OPERAND_REG_IMM_INT32:
4398   case AMDGPU::OPERAND_REG_IMM_FP32:
4399   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4400   case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4401   case AMDGPU::OPERAND_REG_IMM_V2FP32:
4402   case AMDGPU::OPERAND_REG_IMM_V2INT32:
4403   case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4404   case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4405   case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
4406     int32_t Trunc = static_cast<int32_t>(Imm);
4407     return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4408   }
4409   case AMDGPU::OPERAND_REG_IMM_INT64:
4410   case AMDGPU::OPERAND_REG_IMM_FP64:
4411   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4412   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4413   case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
4414     return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4415   case AMDGPU::OPERAND_REG_IMM_INT16:
4416   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4417     // We would expect inline immediates to not be concerned with an integer/fp
4418     // distinction. However, in the case of 16-bit integer operations, the
4419     // "floating point" values appear to not work. It seems to read the low 16 bits
4420     // of 32-bit immediates, which happens to always work for the integer
4421     // values.
4422     //
4423     // See llvm bugzilla 46302.
4424     //
4425     // TODO: Theoretically we could use op-sel to use the high bits of the
4426     // 32-bit FP values.
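    //
    // For example, 0x3C00 (the FP16 encoding of 1.0) falls outside the
    // inlinable integer range [-16, 64], so it is rejected here even though
    // 1.0 is an inline FP16 constant.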
4427     return AMDGPU::isInlinableIntLiteral(Imm);
4428   case AMDGPU::OPERAND_REG_IMM_V2INT16:
4429   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
4430     return AMDGPU::isInlinableLiteralV2I16(Imm);
4431   case AMDGPU::OPERAND_REG_IMM_V2FP16:
4432   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
4433     return AMDGPU::isInlinableLiteralV2F16(Imm);
4434   case AMDGPU::OPERAND_REG_IMM_V2BF16:
4435   case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
4436     return AMDGPU::isInlinableLiteralV2BF16(Imm);
4437   case AMDGPU::OPERAND_REG_IMM_FP16:
4438   case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
4439     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4440       // A few special case instructions have 16-bit operands on subtargets
4441       // where 16-bit instructions are not legal.
4442       // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4443       // constants in these cases
4444       int16_t Trunc = static_cast<int16_t>(Imm);
4445       return ST.has16BitInsts() &&
4446              AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4447     }
4448 
4449     return false;
4450   }
4451   case AMDGPU::OPERAND_REG_IMM_BF16:
4452   case AMDGPU::OPERAND_REG_INLINE_C_BF16: {
4453     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4454       int16_t Trunc = static_cast<int16_t>(Imm);
4455       return ST.has16BitInsts() &&
4456              AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4457     }
4458     return false;
4459   }
4460   case AMDGPU::OPERAND_KIMM32:
4461   case AMDGPU::OPERAND_KIMM16:
4462   case AMDGPU::OPERAND_KIMM64:
4463     return false;
4464   case AMDGPU::OPERAND_INPUT_MODS:
4465   case MCOI::OPERAND_IMMEDIATE:
4466     // Always embedded in the instruction for free.
4467     return true;
4468   case MCOI::OPERAND_UNKNOWN:
4469   case MCOI::OPERAND_REGISTER:
4470   case MCOI::OPERAND_PCREL:
4471   case MCOI::OPERAND_GENERIC_0:
4472   case MCOI::OPERAND_GENERIC_1:
4473   case MCOI::OPERAND_GENERIC_2:
4474   case MCOI::OPERAND_GENERIC_3:
4475   case MCOI::OPERAND_GENERIC_4:
4476   case MCOI::OPERAND_GENERIC_5:
4477     // Just ignore anything else.
4478     return true;
4479   default:
4480     llvm_unreachable("invalid operand type");
4481   }
4482 }
4483 
4484 static bool compareMachineOp(const MachineOperand &Op0,
4485                              const MachineOperand &Op1) {
4486   if (Op0.getType() != Op1.getType())
4487     return false;
4488 
4489   switch (Op0.getType()) {
4490   case MachineOperand::MO_Register:
4491     return Op0.getReg() == Op1.getReg();
4492   case MachineOperand::MO_Immediate:
4493     return Op0.getImm() == Op1.getImm();
4494   default:
4495     llvm_unreachable("Didn't expect to be comparing these operand types");
4496   }
4497 }
4498 
4499 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4500                                     const MachineOperand &MO) const {
4501   const MCInstrDesc &InstDesc = MI.getDesc();
4502   const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4503 
4504   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4505 
4506   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4507     return true;
4508 
4509   if (OpInfo.RegClass < 0)
4510     return false;
4511 
4512   if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
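    // On subtargets with the MFMA inline-literal bug, an inline constant is
    // not usable as the src2 (accumulator) operand of an MAI instruction, so
    // reject it there.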
4513     if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4514         OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4515                                                     AMDGPU::OpName::src2))
4516       return false;
4517     return RI.opCanUseInlineConstant(OpInfo.OperandType);
4518   }
4519 
4520   if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4521     return false;
4522 
4523   if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4524     return true;
4525 
4526   return ST.hasVOP3Literal();
4527 }
4528 
4529 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4530   // GFX90A does not have V_MUL_LEGACY_F32_e32.
4531   if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4532     return false;
4533 
4534   int Op32 = AMDGPU::getVOPe32(Opcode);
4535   if (Op32 == -1)
4536     return false;
4537 
4538   return pseudoToMCOpcode(Op32) != -1;
4539 }
4540 
4541 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4542   // The src0_modifiers operand is present on all instructions
4543   // that have modifiers.
4544 
4545   return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4546 }
4547 
4548 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4549                                   AMDGPU::OpName OpName) const {
4550   const MachineOperand *Mods = getNamedOperand(MI, OpName);
4551   return Mods && Mods->getImm();
4552 }
4553 
4554 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4555   return any_of(ModifierOpNames,
4556                 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4557 }
4558 
4559 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4560                             const MachineRegisterInfo &MRI) const {
4561   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4562   // Can't shrink an instruction with three operands, except for the cases below.
4563   if (Src2) {
4564     switch (MI.getOpcode()) {
4565       default: return false;
4566 
4567       case AMDGPU::V_ADDC_U32_e64:
4568       case AMDGPU::V_SUBB_U32_e64:
4569       case AMDGPU::V_SUBBREV_U32_e64: {
4570         const MachineOperand *Src1
4571           = getNamedOperand(MI, AMDGPU::OpName::src1);
4572         if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4573           return false;
4574         // Additional verification is needed for sdst/src2.
4575         return true;
4576       }
4577       case AMDGPU::V_MAC_F16_e64:
4578       case AMDGPU::V_MAC_F32_e64:
4579       case AMDGPU::V_MAC_LEGACY_F32_e64:
4580       case AMDGPU::V_FMAC_F16_e64:
4581       case AMDGPU::V_FMAC_F16_t16_e64:
4582       case AMDGPU::V_FMAC_F16_fake16_e64:
4583       case AMDGPU::V_FMAC_F32_e64:
4584       case AMDGPU::V_FMAC_F64_e64:
4585       case AMDGPU::V_FMAC_LEGACY_F32_e64:
4586         if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4587             hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4588           return false;
4589         break;
4590 
4591       case AMDGPU::V_CNDMASK_B32_e64:
4592         break;
4593     }
4594   }
4595 
4596   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4597   if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4598                hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4599     return false;
4600 
4601   // We don't need to check src0; all input types are legal, so just make sure
4602   // src0 isn't using any modifiers.
4603   if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4604     return false;
4605 
4606   // Can it be shrunk to a valid 32 bit opcode?
4607   if (!hasVALU32BitEncoding(MI.getOpcode()))
4608     return false;
4609 
4610   // Check output modifiers
4611   return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4612          !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4613          !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4614          // TODO: Can we avoid checking bound_ctrl/fi here?
4615          // They are only used by permlane*_swap special case.
4616          !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4617          !hasModifiersSet(MI, AMDGPU::OpName::fi);
4618 }
4619 
4620 // Set VCC operand with all flags from \p Orig, except for setting it as
4621 // implicit.
4622 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4623                                    const MachineOperand &Orig) {
4624 
4625   for (MachineOperand &Use : MI.implicit_operands()) {
4626     if (Use.isUse() &&
4627         (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4628       Use.setIsUndef(Orig.isUndef());
4629       Use.setIsKill(Orig.isKill());
4630       return;
4631     }
4632   }
4633 }
4634 
4635 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4636                                            unsigned Op32) const {
4637   MachineBasicBlock *MBB = MI.getParent();
4638 
4639   const MCInstrDesc &Op32Desc = get(Op32);
4640   MachineInstrBuilder Inst32 =
4641     BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4642     .setMIFlags(MI.getFlags());
4643 
4644   // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4645   // For VOPC instructions, this is replaced by an implicit def of vcc.
4646 
4647   // We assume the defs of the shrunk opcode are in the same order, and the
4648   // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4649   for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4650     Inst32.add(MI.getOperand(I));
4651 
4652   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4653 
4654   int Idx = MI.getNumExplicitDefs();
4655   for (const MachineOperand &Use : MI.explicit_uses()) {
4656     int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4657     if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4658       continue;
4659 
4660     if (&Use == Src2) {
4661       if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4662         // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4663         // replaced with an implicit read of vcc or vcc_lo. The implicit read
4664         // of vcc was already added during the initial BuildMI, but we
4665         // 1) may need to change vcc to vcc_lo to preserve the original register
4666         // 2) have to preserve the original flags.
4667         copyFlagsToImplicitVCC(*Inst32, *Src2);
4668         continue;
4669       }
4670     }
4671 
4672     Inst32.add(Use);
4673   }
4674 
4675   // FIXME: Losing implicit operands
4676   fixImplicitOperands(*Inst32);
4677   return Inst32;
4678 }
4679 
4680 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4681                                   const MachineOperand &MO,
4682                                   const MCOperandInfo &OpInfo) const {
4683   // Literal constants use the constant bus.
4684   if (!MO.isReg())
4685     return !isInlineConstant(MO, OpInfo);
4686 
4687   if (!MO.isUse())
4688     return false;
4689 
4690   if (MO.getReg().isVirtual())
4691     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4692 
4693   // Null is free
4694   if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4695     return false;
4696 
4697   // SGPRs use the constant bus
4698   if (MO.isImplicit()) {
4699     return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
4700            MO.getReg() == AMDGPU::VCC_LO;
4701   }
4702   return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4703          AMDGPU::SReg_64RegClass.contains(MO.getReg());
4704 }
4705 
4706 static Register findImplicitSGPRRead(const MachineInstr &MI) {
4707   for (const MachineOperand &MO : MI.implicit_operands()) {
4708     // We only care about reads.
4709     if (MO.isDef())
4710       continue;
4711 
4712     switch (MO.getReg()) {
4713     case AMDGPU::VCC:
4714     case AMDGPU::VCC_LO:
4715     case AMDGPU::VCC_HI:
4716     case AMDGPU::M0:
4717     case AMDGPU::FLAT_SCR:
4718       return MO.getReg();
4719 
4720     default:
4721       break;
4722     }
4723   }
4724 
4725   return Register();
4726 }
4727 
4728 static bool shouldReadExec(const MachineInstr &MI) {
4729   if (SIInstrInfo::isVALU(MI)) {
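    // Lane-access instructions (readlane/writelane and the SGPR-to-VGPR spill
    // pseudos) address an explicitly selected lane and execute regardless of
    // EXEC, so they are not treated as EXEC readers.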
4730     switch (MI.getOpcode()) {
4731     case AMDGPU::V_READLANE_B32:
4732     case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4733     case AMDGPU::V_WRITELANE_B32:
4734     case AMDGPU::SI_SPILL_S32_TO_VGPR:
4735       return false;
4736     }
4737 
4738     return true;
4739   }
4740 
4741   if (MI.isPreISelOpcode() ||
4742       SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4743       SIInstrInfo::isSALU(MI) ||
4744       SIInstrInfo::isSMRD(MI))
4745     return false;
4746 
4747   return true;
4748 }
4749 
4750 static bool isRegOrFI(const MachineOperand &MO) {
4751   return MO.isReg() || MO.isFI();
4752 }
4753 
4754 static bool isSubRegOf(const SIRegisterInfo &TRI,
4755                        const MachineOperand &SuperVec,
4756                        const MachineOperand &SubReg) {
4757   if (SubReg.getReg().isPhysical())
4758     return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4759 
4760   return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4761          SubReg.getReg() == SuperVec.getReg();
4762 }
4763 
4764 // Report an illegal copy from a vector register to an SGPR for the generic COPY opcode.
4765 bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4766                              const MachineRegisterInfo &MRI,
4767                              StringRef &ErrInfo) const {
4768   Register DstReg = MI.getOperand(0).getReg();
4769   Register SrcReg = MI.getOperand(1).getReg();
4770   // This is a check for copy from vector register to SGPR
4771   if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4772     ErrInfo = "illegal copy from vector register to SGPR";
4773     return false;
4774   }
4775   return true;
4776 }
4777 
4778 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4779                                     StringRef &ErrInfo) const {
4780   uint16_t Opcode = MI.getOpcode();
4781   const MachineFunction *MF = MI.getParent()->getParent();
4782   const MachineRegisterInfo &MRI = MF->getRegInfo();
4783 
4784   // FIXME: At this point the COPY verify is done only for non-ssa forms.
4785   // Find a better property to recognize the point where instruction selection
4786   // has just finished.
4787   // We can only enforce this check after SIFixSGPRCopies pass so that the
4788   // illegal copies are legalized and thereafter we don't expect a pass
4789   // inserting similar copies.
4790   if (!MRI.isSSA() && MI.isCopy())
4791     return verifyCopy(MI, MRI, ErrInfo);
4792 
4793   if (SIInstrInfo::isGenericOpcode(Opcode))
4794     return true;
4795 
4796   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4797   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4798   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4799   int Src3Idx = -1;
4800   if (Src0Idx == -1) {
4801     // VOPD V_DUAL_* instructions use different operand names.
4802     Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4803     Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4804     Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4805     Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4806   }
4807 
4808   // Make sure the number of operands is correct.
4809   const MCInstrDesc &Desc = get(Opcode);
4810   if (!Desc.isVariadic() &&
4811       Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4812     ErrInfo = "Instruction has wrong number of operands.";
4813     return false;
4814   }
4815 
4816   if (MI.isInlineAsm()) {
4817     // Verify register classes for inlineasm constraints.
4818     for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4819          I != E; ++I) {
4820       const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4821       if (!RC)
4822         continue;
4823 
4824       const MachineOperand &Op = MI.getOperand(I);
4825       if (!Op.isReg())
4826         continue;
4827 
4828       Register Reg = Op.getReg();
4829       if (!Reg.isVirtual() && !RC->contains(Reg)) {
4830         ErrInfo = "inlineasm operand has incorrect register class.";
4831         return false;
4832       }
4833     }
4834 
4835     return true;
4836   }
4837 
4838   if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4839     ErrInfo = "missing memory operand from image instruction.";
4840     return false;
4841   }
4842 
4843   // Make sure the register classes are correct.
4844   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4845     const MachineOperand &MO = MI.getOperand(i);
4846     if (MO.isFPImm()) {
4847       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4848                 "all fp values to integers.";
4849       return false;
4850     }
4851 
4852     int RegClass = Desc.operands()[i].RegClass;
4853 
4854     switch (Desc.operands()[i].OperandType) {
4855     case MCOI::OPERAND_REGISTER:
4856       if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4857         ErrInfo = "Illegal immediate value for operand.";
4858         return false;
4859       }
4860       break;
4861     case AMDGPU::OPERAND_REG_IMM_INT32:
4862     case AMDGPU::OPERAND_REG_IMM_FP32:
4863     case AMDGPU::OPERAND_REG_IMM_V2FP32:
4864       break;
4865     case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4866     case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4867     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4868     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4869     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4870     case AMDGPU::OPERAND_REG_INLINE_C_FP16:
4871     case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4872     case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4873     case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
4874       if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4875         ErrInfo = "Illegal immediate value for operand.";
4876         return false;
4877       }
4878       break;
4879     }
4880     case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
4881       if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4882         ErrInfo = "Expected inline constant for operand.";
4883         return false;
4884       }
4885       break;
4886     case MCOI::OPERAND_IMMEDIATE:
4887     case AMDGPU::OPERAND_KIMM32:
4888     case AMDGPU::OPERAND_KIMM64:
4889       // Check if this operand is an immediate.
4890       // FrameIndex operands will be replaced by immediates, so they are
4891       // allowed.
4892       if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4893         ErrInfo = "Expected immediate, but got non-immediate";
4894         return false;
4895       }
4896       [[fallthrough]];
4897     default:
4898       continue;
4899     }
4900 
4901     if (!MO.isReg())
4902       continue;
4903     Register Reg = MO.getReg();
4904     if (!Reg)
4905       continue;
4906 
4907     // FIXME: Ideally we would have separate instruction definitions with the
4908     // aligned register constraint.
4909     // FIXME: We do not verify inline asm operands, but custom inline asm
4910     // verification is broken anyway
4911     if (ST.needsAlignedVGPRs()) {
4912       const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4913       if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4914         if (const TargetRegisterClass *SubRC =
4915                 RI.getSubRegisterClass(RC, MO.getSubReg())) {
4916           RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4917           if (RC)
4918             RC = SubRC;
4919         }
4920       }
4921 
4922       // Check that this is the aligned version of the class.
4923       if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4924         ErrInfo = "Subtarget requires even aligned vector registers";
4925         return false;
4926       }
4927     }
4928 
4929     if (RegClass != -1) {
4930       if (Reg.isVirtual())
4931         continue;
4932 
4933       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4934       if (!RC->contains(Reg)) {
4935         ErrInfo = "Operand has incorrect register class.";
4936         return false;
4937       }
4938     }
4939   }
4940 
4941   // Verify SDWA
4942   if (isSDWA(MI)) {
4943     if (!ST.hasSDWA()) {
4944       ErrInfo = "SDWA is not supported on this target";
4945       return false;
4946     }
4947 
4948     for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
4949                     AMDGPU::OpName::dst_sel}) {
4950       const MachineOperand *MO = getNamedOperand(MI, Op);
4951       if (!MO)
4952         continue;
4953       int64_t Imm = MO->getImm();
4954       if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
4955         ErrInfo = "Invalid SDWA selection";
4956         return false;
4957       }
4958     }
4959 
4960     int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4961 
4962     for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4963       if (OpIdx == -1)
4964         continue;
4965       const MachineOperand &MO = MI.getOperand(OpIdx);
4966 
4967       if (!ST.hasSDWAScalar()) {
4968         // Only VGPRs on VI
4969         if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4970           ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4971           return false;
4972         }
4973       } else {
4974         // No immediates on GFX9
4975         if (!MO.isReg()) {
4976           ErrInfo =
4977             "Only reg allowed as operands in SDWA instructions on GFX9+";
4978           return false;
4979         }
4980       }
4981     }
4982 
4983     if (!ST.hasSDWAOmod()) {
4984       // No omod allowed on VI
4985       const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4986       if (OMod != nullptr &&
4987         (!OMod->isImm() || OMod->getImm() != 0)) {
4988         ErrInfo = "OMod not allowed in SDWA instructions on VI";
4989         return false;
4990       }
4991     }
4992 
4993     if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4994         Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4995         Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4996         Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4997       const MachineOperand *Src0ModsMO =
4998           getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4999       unsigned Mods = Src0ModsMO->getImm();
5000       if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5001           Mods & SISrcMods::SEXT) {
5002         ErrInfo = "sext, abs and neg are not allowed on this instruction";
5003         return false;
5004       }
5005     }
5006 
5007     uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5008     if (isVOPC(BasicOpcode)) {
5009       if (!ST.hasSDWASdst() && DstIdx != -1) {
5010         // Only vcc allowed as dst on VI for VOPC
5011         const MachineOperand &Dst = MI.getOperand(DstIdx);
5012         if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5013           ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5014           return false;
5015         }
5016       } else if (!ST.hasSDWAOutModsVOPC()) {
5017         // No clamp allowed on GFX9 for VOPC
5018         const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5019         if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5020           ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5021           return false;
5022         }
5023 
5024         // No omod allowed on GFX9 for VOPC
5025         const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5026         if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5027           ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5028           return false;
5029         }
5030       }
5031     }
5032 
5033     const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5034     if (DstUnused && DstUnused->isImm() &&
5035         DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5036       const MachineOperand &Dst = MI.getOperand(DstIdx);
5037       if (!Dst.isReg() || !Dst.isTied()) {
5038         ErrInfo = "Dst register should have tied register";
5039         return false;
5040       }
5041 
5042       const MachineOperand &TiedMO =
5043           MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5044       if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5045         ErrInfo =
5046             "Dst register should be tied to implicit use of preserved register";
5047         return false;
5048       }
5049       if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5050         ErrInfo = "Dst register should use same physical register as preserved";
5051         return false;
5052       }
5053     }
5054   }
5055 
5056   // Verify MIMG / VIMAGE / VSAMPLE
5057   if (isImage(Opcode) && !MI.mayStore()) {
5058     // Ensure that the return type used is large enough for all the options
5059     // being used. TFE/LWE require an extra result register.
5060     const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5061     if (DMask) {
5062       uint64_t DMaskImm = DMask->getImm();
5063       uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5064       const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5065       const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5066       const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5067 
5068       // Adjust for packed 16 bit values
5069       if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5070         RegCount = divideCeil(RegCount, 2);
5071 
5072       // Adjust if using LWE or TFE
5073       if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5074         RegCount += 1;
5075 
5076       const uint32_t DstIdx =
5077           AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5078       const MachineOperand &Dst = MI.getOperand(DstIdx);
5079       if (Dst.isReg()) {
5080         const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5081         uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5082         if (RegCount > DstSize) {
5083           ErrInfo = "Image instruction returns too many registers for dst "
5084                     "register class";
5085           return false;
5086         }
5087       }
5088     }
5089   }
5090 
5091   // Verify VOP*. Ignore multiple sgpr operands on writelane.
5092   if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5093     unsigned ConstantBusCount = 0;
5094     bool UsesLiteral = false;
5095     const MachineOperand *LiteralVal = nullptr;
5096 
5097     int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5098     if (ImmIdx != -1) {
5099       ++ConstantBusCount;
5100       UsesLiteral = true;
5101       LiteralVal = &MI.getOperand(ImmIdx);
5102     }
5103 
5104     SmallVector<Register, 2> SGPRsUsed;
5105     Register SGPRUsed;
5106 
5107     // Only look at the true operands. Only a real operand can use the constant
5108     // bus, and we don't want to check pseudo-operands like the source modifier
5109     // flags.
5110     for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5111       if (OpIdx == -1)
5112         continue;
5113       const MachineOperand &MO = MI.getOperand(OpIdx);
5114       if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5115         if (MO.isReg()) {
5116           SGPRUsed = MO.getReg();
5117           if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5118             ++ConstantBusCount;
5119             SGPRsUsed.push_back(SGPRUsed);
5120           }
5121         } else if (!MO.isFI()) { // Treat FI like a register.
5122           if (!UsesLiteral) {
5123             ++ConstantBusCount;
5124             UsesLiteral = true;
5125             LiteralVal = &MO;
5126           } else if (!MO.isIdenticalTo(*LiteralVal)) {
5127             assert(isVOP2(MI) || isVOP3(MI));
5128             ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5129             return false;
5130           }
5131         }
5132       }
5133     }
5134 
5135     SGPRUsed = findImplicitSGPRRead(MI);
5136     if (SGPRUsed) {
5137       // Implicit uses may safely overlap true operands
5138       if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5139             return !RI.regsOverlap(SGPRUsed, SGPR);
5140           })) {
5141         ++ConstantBusCount;
5142         SGPRsUsed.push_back(SGPRUsed);
5143       }
5144     }
5145 
5146     // v_writelane_b32 is an exception to the constant bus restriction:
5147     // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5148     if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5149         Opcode != AMDGPU::V_WRITELANE_B32) {
5150       ErrInfo = "VOP* instruction violates constant bus restriction";
5151       return false;
5152     }
5153 
5154     if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5155       ErrInfo = "VOP3 instruction uses literal";
5156       return false;
5157     }
5158   }
5159 
5160   // Special case for writelane - this can break the multiple constant bus rule,
5161   // but still can't use more than one SGPR register
5162   if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5163     unsigned SGPRCount = 0;
5164     Register SGPRUsed;
5165 
5166     for (int OpIdx : {Src0Idx, Src1Idx}) {
5167       if (OpIdx == -1)
5168         break;
5169 
5170       const MachineOperand &MO = MI.getOperand(OpIdx);
5171 
5172       if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5173         if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5174           if (MO.getReg() != SGPRUsed)
5175             ++SGPRCount;
5176           SGPRUsed = MO.getReg();
5177         }
5178       }
5179       if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5180         ErrInfo = "WRITELANE instruction violates constant bus restriction";
5181         return false;
5182       }
5183     }
5184   }
5185 
5186   // Verify misc. restrictions on specific instructions.
5187   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5188       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5189     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5190     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5191     const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5192     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5193       if (!compareMachineOp(Src0, Src1) &&
5194           !compareMachineOp(Src0, Src2)) {
5195         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5196         return false;
5197       }
5198     }
5199     if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5200          SISrcMods::ABS) ||
5201         (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5202          SISrcMods::ABS) ||
5203         (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5204          SISrcMods::ABS)) {
5205       ErrInfo = "ABS not allowed in VOP3B instructions";
5206       return false;
5207     }
5208   }
5209 
5210   if (isSOP2(MI) || isSOPC(MI)) {
5211     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5212     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5213 
5214     if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5215         !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5216         !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5217         !Src0.isIdenticalTo(Src1)) {
5218       ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5219       return false;
5220     }
5221   }
5222 
5223   if (isSOPK(MI)) {
5224     const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5225     if (Desc.isBranch()) {
5226       if (!Op->isMBB()) {
5227         ErrInfo = "invalid branch target for SOPK instruction";
5228         return false;
5229       }
5230     } else {
5231       uint64_t Imm = Op->getImm();
5232       if (sopkIsZext(Opcode)) {
5233         if (!isUInt<16>(Imm)) {
5234           ErrInfo = "invalid immediate for SOPK instruction";
5235           return false;
5236         }
5237       } else {
5238         if (!isInt<16>(Imm)) {
5239           ErrInfo = "invalid immediate for SOPK instruction";
5240           return false;
5241         }
5242       }
5243     }
5244   }
5245 
5246   if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5247       Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5248       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5249       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5250     const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5251                        Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5252 
5253     const unsigned StaticNumOps =
5254         Desc.getNumOperands() + Desc.implicit_uses().size();
5255     const unsigned NumImplicitOps = IsDst ? 2 : 1;
5256 
5257     // Allow additional implicit operands. This allows a fixup done by the post
5258     // RA scheduler where the main implicit operand is killed and implicit-defs
5259     // are added for sub-registers that remain live after this instruction.
5260     if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5261       ErrInfo = "missing implicit register operands";
5262       return false;
5263     }
5264 
5265     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5266     if (IsDst) {
5267       if (!Dst->isUse()) {
5268         ErrInfo = "v_movreld_b32 vdst should be a use operand";
5269         return false;
5270       }
5271 
5272       unsigned UseOpIdx;
5273       if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5274           UseOpIdx != StaticNumOps + 1) {
5275         ErrInfo = "movrel implicit operands should be tied";
5276         return false;
5277       }
5278     }
5279 
5280     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5281     const MachineOperand &ImpUse
5282       = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5283     if (!ImpUse.isReg() || !ImpUse.isUse() ||
5284         !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5285       ErrInfo = "src0 should be subreg of implicit vector use";
5286       return false;
5287     }
5288   }
5289 
5290   // Make sure we aren't losing exec uses in the td files. This mostly requires
5291   // being careful when using let Uses to try to add other use registers.
5292   if (shouldReadExec(MI)) {
5293     if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5294       ErrInfo = "VALU instruction does not implicitly read exec mask";
5295       return false;
5296     }
5297   }
5298 
5299   if (isSMRD(MI)) {
5300     if (MI.mayStore() &&
5301         ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5302       // The register offset form of scalar stores may only use m0 as the
5303       // soffset register.
5304       const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5305       if (Soff && Soff->getReg() != AMDGPU::M0) {
5306         ErrInfo = "scalar stores must use m0 as offset register";
5307         return false;
5308       }
5309     }
5310   }
5311 
5312   if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5313     const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5314     if (Offset->getImm() != 0) {
5315       ErrInfo = "subtarget does not support offsets in flat instructions";
5316       return false;
5317     }
5318   }
5319 
5320   if (isDS(MI) && !ST.hasGDS()) {
5321     const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5322     if (GDSOp && GDSOp->getImm() != 0) {
5323       ErrInfo = "GDS is not supported on this subtarget";
5324       return false;
5325     }
5326   }
5327 
5328   if (isImage(MI)) {
5329     const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5330     if (DimOp) {
5331       int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5332                                                  AMDGPU::OpName::vaddr0);
5333       AMDGPU::OpName RSrcOpName =
5334           isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5335       int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5336       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5337       const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5338           AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5339       const AMDGPU::MIMGDimInfo *Dim =
5340           AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5341 
5342       if (!Dim) {
5343         ErrInfo = "dim is out of range";
5344         return false;
5345       }
5346 
5347       bool IsA16 = false;
5348       if (ST.hasR128A16()) {
5349         const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5350         IsA16 = R128A16->getImm() != 0;
5351       } else if (ST.hasA16()) {
5352         const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5353         IsA16 = A16->getImm() != 0;
5354       }
5355 
5356       bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5357 
5358       unsigned AddrWords =
5359           AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5360 
5361       unsigned VAddrWords;
5362       if (IsNSA) {
5363         VAddrWords = RsrcIdx - VAddr0Idx;
5364         if (ST.hasPartialNSAEncoding() &&
5365             AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5366           unsigned LastVAddrIdx = RsrcIdx - 1;
5367           VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5368         }
5369       } else {
5370         VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5371         if (AddrWords > 12)
5372           AddrWords = 16;
5373       }
5374 
5375       if (VAddrWords != AddrWords) {
5376         LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5377                           << " but got " << VAddrWords << "\n");
5378         ErrInfo = "bad vaddr size";
5379         return false;
5380       }
5381     }
5382   }
5383 
5384   const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5385   if (DppCt) {
5386     using namespace AMDGPU::DPP;
5387 
5388     unsigned DC = DppCt->getImm();
5389     if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5390         DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5391         (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5392         (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5393         (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5394         (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5395         (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5396       ErrInfo = "Invalid dpp_ctrl value";
5397       return false;
5398     }
5399     if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5400         ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5401       ErrInfo = "Invalid dpp_ctrl value: "
5402                 "wavefront shifts are not supported on GFX10+";
5403       return false;
5404     }
5405     if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5406         ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5407       ErrInfo = "Invalid dpp_ctrl value: "
5408                 "broadcasts are not supported on GFX10+";
5409       return false;
5410     }
5411     if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5412         ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5413       if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5414           DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5415           !ST.hasGFX90AInsts()) {
5416         ErrInfo = "Invalid dpp_ctrl value: "
5417                   "row_newbroadcast/row_share is not supported before "
5418                   "GFX90A/GFX10";
5419         return false;
5420       }
5421       if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5422         ErrInfo = "Invalid dpp_ctrl value: "
5423                   "row_share and row_xmask are not supported before GFX10";
5424         return false;
5425       }
5426     }
5427 
5428     if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5429         !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) {
5430       ErrInfo = "Invalid dpp_ctrl value: "
5431                 "DP ALU dpp only supports row_newbcast";
5432       return false;
5433     }
5434   }
5435 
5436   if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5437     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5438     AMDGPU::OpName DataName =
5439         isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5440     const MachineOperand *Data = getNamedOperand(MI, DataName);
5441     const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5442     if (Data && !Data->isReg())
5443       Data = nullptr;
5444 
5445     if (ST.hasGFX90AInsts()) {
5446       if (Dst && Data &&
5447           (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5448         ErrInfo = "Invalid register class: "
5449                   "vdata and vdst should be both VGPR or AGPR";
5450         return false;
5451       }
5452       if (Data && Data2 &&
5453           (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5454         ErrInfo = "Invalid register class: "
5455                   "both data operands should be VGPR or AGPR";
5456         return false;
5457       }
5458     } else {
5459       if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5460           (Data && RI.isAGPR(MRI, Data->getReg())) ||
5461           (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5462         ErrInfo = "Invalid register class: "
5463                   "agpr loads and stores not supported on this GPU";
5464         return false;
5465       }
5466     }
5467   }
5468 
5469   if (ST.needsAlignedVGPRs()) {
5470     const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5471       const MachineOperand *Op = getNamedOperand(MI, OpName);
5472       if (!Op)
5473         return true;
5474       Register Reg = Op->getReg();
5475       if (Reg.isPhysical())
5476         return !(RI.getHWRegIndex(Reg) & 1);
5477       const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5478       return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5479              !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5480     };
5481 
5482     if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5483         Opcode == AMDGPU::DS_GWS_BARRIER) {
5484 
5485       if (!isAlignedReg(AMDGPU::OpName::data0)) {
5486         ErrInfo = "Subtarget requires even aligned vector registers "
5487                   "for DS_GWS instructions";
5488         return false;
5489       }
5490     }
5491 
5492     if (isMIMG(MI)) {
5493       if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5494         ErrInfo = "Subtarget requires even aligned vector registers "
5495                   "for vaddr operand of image instructions";
5496         return false;
5497       }
5498     }
5499   }
5500 
5501   if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5502     const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5503     if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5504       ErrInfo = "Invalid register class: "
5505                 "v_accvgpr_write with an SGPR is not supported on this GPU";
5506       return false;
5507     }
5508   }
5509 
5510   if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5511     const MachineOperand &SrcOp = MI.getOperand(1);
5512     if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5513       ErrInfo = "pseudo expects only physical SGPRs";
5514       return false;
5515     }
5516   }
5517 
5518   return true;
5519 }
5520 
5521 // It is more readable to list mapped opcodes on the same line.
5522 // clang-format off
5523 
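// Maps a scalar opcode to the opcode used when the instruction is moved off
// the scalar unit; returns AMDGPU::INSTRUCTION_LIST_END when there is no
// mapping.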
5524 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5525   switch (MI.getOpcode()) {
5526   default: return AMDGPU::INSTRUCTION_LIST_END;
5527   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5528   case AMDGPU::COPY: return AMDGPU::COPY;
5529   case AMDGPU::PHI: return AMDGPU::PHI;
5530   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5531   case AMDGPU::WQM: return AMDGPU::WQM;
5532   case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5533   case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5534   case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5535   case AMDGPU::S_MOV_B32: {
5536     const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5537     return MI.getOperand(1).isReg() ||
5538            RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5539            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5540   }
5541   case AMDGPU::S_ADD_I32:
5542     return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5543   case AMDGPU::S_ADDC_U32:
5544     return AMDGPU::V_ADDC_U32_e32;
5545   case AMDGPU::S_SUB_I32:
5546     return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5547     // FIXME: These are not consistently handled, and selected when the carry is
5548     // used.
5549   case AMDGPU::S_ADD_U32:
5550     return AMDGPU::V_ADD_CO_U32_e32;
5551   case AMDGPU::S_SUB_U32:
5552     return AMDGPU::V_SUB_CO_U32_e32;
5553   case AMDGPU::S_ADD_U64_PSEUDO:
5554     return AMDGPU::V_ADD_U64_PSEUDO;
5555   case AMDGPU::S_SUB_U64_PSEUDO:
5556     return AMDGPU::V_SUB_U64_PSEUDO;
5557   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5558   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5559   case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5560   case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5561   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5562   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5563   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5564   case AMDGPU::S_XNOR_B32:
5565     return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5566   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5567   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5568   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5569   case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5570   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5571   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5572   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5573   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5574   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5575   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5576   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5577   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5578   case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5579   case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5580   case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5581   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5582   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5583   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5584   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5585   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5586   case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5587   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5588   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5589   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5590   case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5591   case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5592   case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5593   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5594   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5595   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5596   case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5597   case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5598   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5599   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5600   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5601   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5602   case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5603   case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5604   case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5605   case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5606   case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5607   case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5608   case AMDGPU::S_CVT_F32_F16:
5609   case AMDGPU::S_CVT_HI_F32_F16:
5610     return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5611                                    : AMDGPU::V_CVT_F32_F16_fake16_e64;
5612   case AMDGPU::S_CVT_F16_F32:
5613     return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5614                                    : AMDGPU::V_CVT_F16_F32_fake16_e64;
5615   case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5616   case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5617   case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5618   case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5619   case AMDGPU::S_CEIL_F16:
5620     return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5621                                    : AMDGPU::V_CEIL_F16_fake16_e64;
5622   case AMDGPU::S_FLOOR_F16:
5623     return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5624                                    : AMDGPU::V_FLOOR_F16_fake16_e64;
5625   case AMDGPU::S_TRUNC_F16:
5626     return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5627                                    : AMDGPU::V_TRUNC_F16_fake16_e64;
5628   case AMDGPU::S_RNDNE_F16:
5629     return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5630                                    : AMDGPU::V_RNDNE_F16_fake16_e64;
5631   case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5632   case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5633   case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5634   case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5635   case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5636   case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5637   case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5638   case AMDGPU::S_ADD_F16:
5639     return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5640                                    : AMDGPU::V_ADD_F16_fake16_e64;
5641   case AMDGPU::S_SUB_F16:
5642     return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5643                                    : AMDGPU::V_SUB_F16_fake16_e64;
5644   case AMDGPU::S_MIN_F16:
5645     return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5646                                    : AMDGPU::V_MIN_F16_fake16_e64;
5647   case AMDGPU::S_MAX_F16:
5648     return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5649                                    : AMDGPU::V_MAX_F16_fake16_e64;
5650   case AMDGPU::S_MINIMUM_F16:
5651     return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5652                                    : AMDGPU::V_MINIMUM_F16_fake16_e64;
5653   case AMDGPU::S_MAXIMUM_F16:
5654     return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5655                                    : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5656   case AMDGPU::S_MUL_F16:
5657     return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5658                                    : AMDGPU::V_MUL_F16_fake16_e64;
5659   case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5660   case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5661   case AMDGPU::S_FMAC_F16:
5662     return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5663                                    : AMDGPU::V_FMAC_F16_fake16_e64;
5664   case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5665   case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5666   case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5667   case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5668   case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5669   case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5670   case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5671   case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5672   case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5673   case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5674   case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5675   case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5676   case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5677   case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5678   case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5679   case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5680   case AMDGPU::S_CMP_LT_F16:
5681     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5682                                    : AMDGPU::V_CMP_LT_F16_fake16_e64;
5683   case AMDGPU::S_CMP_EQ_F16:
5684     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5685                                    : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5686   case AMDGPU::S_CMP_LE_F16:
5687     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5688                                    : AMDGPU::V_CMP_LE_F16_fake16_e64;
5689   case AMDGPU::S_CMP_GT_F16:
5690     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5691                                    : AMDGPU::V_CMP_GT_F16_fake16_e64;
5692   case AMDGPU::S_CMP_LG_F16:
5693     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5694                                    : AMDGPU::V_CMP_LG_F16_fake16_e64;
5695   case AMDGPU::S_CMP_GE_F16:
5696     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5697                                    : AMDGPU::V_CMP_GE_F16_fake16_e64;
5698   case AMDGPU::S_CMP_O_F16:
5699     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5700                                    : AMDGPU::V_CMP_O_F16_fake16_e64;
5701   case AMDGPU::S_CMP_U_F16:
5702     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5703                                    : AMDGPU::V_CMP_U_F16_fake16_e64;
5704   case AMDGPU::S_CMP_NGE_F16:
5705     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5706                                    : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5707   case AMDGPU::S_CMP_NLG_F16:
5708     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5709                                    : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5710   case AMDGPU::S_CMP_NGT_F16:
5711     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5712                                    : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5713   case AMDGPU::S_CMP_NLE_F16:
5714     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5715                                    : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5716   case AMDGPU::S_CMP_NEQ_F16:
5717     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5718                                    : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5719   case AMDGPU::S_CMP_NLT_F16:
5720     return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5721                                    : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5722   case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5723   case AMDGPU::V_S_EXP_F16_e64:
5724     return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5725                                    : AMDGPU::V_EXP_F16_fake16_e64;
5726   case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5727   case AMDGPU::V_S_LOG_F16_e64:
5728     return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5729                                    : AMDGPU::V_LOG_F16_fake16_e64;
5730   case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5731   case AMDGPU::V_S_RCP_F16_e64:
5732     return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5733                                    : AMDGPU::V_RCP_F16_fake16_e64;
5734   case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5735   case AMDGPU::V_S_RSQ_F16_e64:
5736     return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5737                                    : AMDGPU::V_RSQ_F16_fake16_e64;
5738   case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5739   case AMDGPU::V_S_SQRT_F16_e64:
5740     return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5741                                    : AMDGPU::V_SQRT_F16_fake16_e64;
5742   }
5743   llvm_unreachable(
5744       "Unexpected scalar opcode without corresponding vector one!");
5745 }
5746 
5747 // clang-format on
5748 
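// Saves the current EXEC mask into \p Reg and then turns on all EXEC bits.
// When SCC is live, the single-instruction S_OR_SAVEEXEC form cannot be used
// because it clobbers SCC, so a pair of S_MOVs is emitted instead.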
5749 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5750                                         MachineBasicBlock &MBB,
5751                                         MachineBasicBlock::iterator MBBI,
5752                                         const DebugLoc &DL, Register Reg,
5753                                         bool IsSCCLive,
5754                                         SlotIndexes *Indexes) const {
5755   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5756   const SIInstrInfo *TII = ST.getInstrInfo();
5757   bool IsWave32 = ST.isWave32();
5758   if (IsSCCLive) {
5759     // Insert two move instructions, one to save the original value of EXEC and
5760     // the other to turn on all bits in EXEC. This is required because the
5761     // single-instruction alternative, S_OR_SAVEEXEC, clobbers SCC.
5762     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5763     MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5764     auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5765                            .addReg(Exec, RegState::Kill);
5766     auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5767     if (Indexes) {
5768       Indexes->insertMachineInstrInMaps(*StoreExecMI);
5769       Indexes->insertMachineInstrInMaps(*FlipExecMI);
5770     }
5771   } else {
5772     const unsigned OrSaveExec =
5773         IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5774     auto SaveExec =
5775         BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5776     SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5777     if (Indexes)
5778       Indexes->insertMachineInstrInMaps(*SaveExec);
5779   }
5780 }
5781 
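// Restores the EXEC mask from \p Reg, killing the register.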
5782 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5783                               MachineBasicBlock::iterator MBBI,
5784                               const DebugLoc &DL, Register Reg,
5785                               SlotIndexes *Indexes) const {
5786   unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5787   MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5788   auto ExecRestoreMI =
5789       BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5790   if (Indexes)
5791     Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5792 }
5793 
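// Narrows combined AV_* operand classes down to their VGPR-only counterparts
// for non-spill loads/stores and DS/MIMG instructions, then returns a class
// that satisfies the subtarget's VGPR alignment requirements.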
5794 static const TargetRegisterClass *
5795 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5796                           const MachineRegisterInfo &MRI,
5797                           const MCInstrDesc &TID, unsigned RCID,
5798                           bool IsAllocatable) {
5799   if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5800       (((TID.mayLoad() || TID.mayStore()) &&
5801         !(TID.TSFlags & SIInstrFlags::Spill)) ||
5802        (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
5803     switch (RCID) {
5804     case AMDGPU::AV_32RegClassID:
5805       RCID = AMDGPU::VGPR_32RegClassID;
5806       break;
5807     case AMDGPU::AV_64RegClassID:
5808       RCID = AMDGPU::VReg_64RegClassID;
5809       break;
5810     case AMDGPU::AV_96RegClassID:
5811       RCID = AMDGPU::VReg_96RegClassID;
5812       break;
5813     case AMDGPU::AV_128RegClassID:
5814       RCID = AMDGPU::VReg_128RegClassID;
5815       break;
5816     case AMDGPU::AV_160RegClassID:
5817       RCID = AMDGPU::VReg_160RegClassID;
5818       break;
5819     case AMDGPU::AV_512RegClassID:
5820       RCID = AMDGPU::VReg_512RegClassID;
5821       break;
5822     default:
5823       break;
5824     }
5825   }
5826 
5827   return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5828 }
5829 
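// Returns the register class demanded by operand \p OpNum of \p TID. For DS
// and FLAT instructions the class may be constrained so that vdst and vdata
// (or data0/data1) end up in the same register bank.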
5830 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5831     unsigned OpNum, const TargetRegisterInfo *TRI,
5832     const MachineFunction &MF)
5833   const {
5834   if (OpNum >= TID.getNumOperands())
5835     return nullptr;
5836   auto RegClass = TID.operands()[OpNum].RegClass;
5837   bool IsAllocatable = false;
5838   if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5839     // vdst and vdata should both be VGPR or AGPR; the same holds for DS
5840     // instructions with two data operands. Request a register class constrained
5841     // to VGPR only if both operands are present, as Machine Copy Propagation
5842     // (and possibly other passes) cannot check this constraint.
5843     //
5844     // The check is limited to FLAT and DS because atomics in non-flat encoding
5845     // have their vdst and vdata tied to be the same register.
5846     const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5847                                                    AMDGPU::OpName::vdst);
5848     const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5849         (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5850                                          : AMDGPU::OpName::vdata);
5851     if (DataIdx != -1) {
5852       IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5853                                            TID.Opcode, AMDGPU::OpName::data1);
5854     }
5855   }
5856   return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5857                                    IsAllocatable);
5858 }
5859 
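// Returns the register class actually used by operand \p OpNo of \p MI: the
// class recorded in MRI for variadic or unconstrained operands, otherwise the
// class from the instruction description, adjusted for the subtarget.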
5860 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
5861                                                       unsigned OpNo) const {
5862   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5863   const MCInstrDesc &Desc = get(MI.getOpcode());
5864   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5865       Desc.operands()[OpNo].RegClass == -1) {
5866     Register Reg = MI.getOperand(OpNo).getReg();
5867 
5868     if (Reg.isVirtual())
5869       return MRI.getRegClass(Reg);
5870     return RI.getPhysRegBaseClass(Reg);
5871   }
5872 
5873   unsigned RCID = Desc.operands()[OpNo].RegClass;
5874   return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5875 }
5876 
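// Legalizes operand \p OpIdx by copying or materializing it into a fresh
// virtual register of the equivalent VGPR class and rewriting the operand to
// use that register.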
5877 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
5878   MachineBasicBlock::iterator I = MI;
5879   MachineBasicBlock *MBB = MI.getParent();
5880   MachineOperand &MO = MI.getOperand(OpIdx);
5881   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5882   unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5883   const TargetRegisterClass *RC = RI.getRegClass(RCID);
5884   unsigned Size = RI.getRegSizeInBits(*RC);
5885   unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
5886                     : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
5887                                  : AMDGPU::V_MOV_B32_e32;
5888   if (MO.isReg())
5889     Opcode = AMDGPU::COPY;
5890   else if (RI.isSGPRClass(RC))
5891     Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5892 
5893   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5894   Register Reg = MRI.createVirtualRegister(VRC);
5895   DebugLoc DL = MBB->findDebugLoc(I);
5896   BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5897   MO.ChangeToRegister(Reg, false);
5898 }
5899 
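// Returns a register containing subregister \p SubIdx of \p SuperReg. For a
// physical super-register this is the subregister itself; for a virtual
// register a COPY into a new register of class \p SubRC is emitted.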
5900 unsigned SIInstrInfo::buildExtractSubReg(
5901     MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
5902     const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5903     unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5904   if (!SuperReg.getReg().isVirtual())
5905     return RI.getSubReg(SuperReg.getReg(), SubIdx);
5906 
5907   MachineBasicBlock *MBB = MI->getParent();
5908   const DebugLoc &DL = MI->getDebugLoc();
5909   Register SubReg = MRI.createVirtualRegister(SubRC);
5910 
5911   unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
5912   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5913       .addReg(SuperReg.getReg(), 0, NewSubIdx);
5914   return SubReg;
5915 }
5916 
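// Like buildExtractSubReg, but also handles immediate operands by splitting a
// 64-bit immediate into its low (sub0) or high (sub1) 32-bit half.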
5917 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
5918     MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
5919     const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5920     unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5921   if (Op.isImm()) {
5922     if (SubIdx == AMDGPU::sub0)
5923       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5924     if (SubIdx == AMDGPU::sub1)
5925       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5926 
5927     llvm_unreachable("Unhandled register index for immediate");
5928   }
5929 
5930   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5931                                        SubIdx, SubRC);
5932   return MachineOperand::CreateReg(SubReg, false);
5933 }
5934 
5935 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
5936 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5937   assert(Inst.getNumExplicitOperands() == 3);
5938   MachineOperand Op1 = Inst.getOperand(1);
5939   Inst.removeOperand(1);
5940   Inst.addOperand(Op1);
5941 }
5942 
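// Returns true if register operand \p MO is compatible with the register
// class expected by \p OpInfo, taking any subregister index into account.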
5943 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
5944                                     const MCOperandInfo &OpInfo,
5945                                     const MachineOperand &MO) const {
5946   if (!MO.isReg())
5947     return false;
5948 
5949   Register Reg = MO.getReg();
5950 
5951   const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5952   if (Reg.isPhysical())
5953     return DRC->contains(Reg);
5954 
5955   const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5956 
5957   if (MO.getSubReg()) {
5958     const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5959     const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5960     if (!SuperRC)
5961       return false;
5962 
5963     DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5964     if (!DRC)
5965       return false;
5966   }
5967   return RC->hasSuperClassEq(DRC);
5968 }
5969 
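// In addition to the register class check above, this overload enforces the
// AGPR rules: AGPRs require MAI support, memory/DS/MIMG operands may only use
// AGPRs where the subtarget allows it, vdst/vdata (and data0/data1) must be in
// the same bank, and V_ACCVGPR_WRITE_B32_e64 may not take an SGPR source
// before GFX90A.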
5970 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
5971                                     const MachineOperand &MO) const {
5972   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5973   const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
5974   unsigned Opc = MI.getOpcode();
5975 
5976   if (!isLegalRegOperand(MRI, OpInfo, MO))
5977     return false;
5978 
5979   // Check the Accumulate GPR (AGPR) operand constraints.
5980   bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
5981   if (IsAGPR && !ST.hasMAIInsts())
5982     return false;
5983   if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5984       (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5985     return false;
5986   // Atomics should have both vdst and vdata either vgpr or agpr.
5987   const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5988   const int DataIdx = AMDGPU::getNamedOperandIdx(
5989       Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5990   if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5991       MI.getOperand(DataIdx).isReg() &&
5992       RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5993     return false;
5994   if ((int)OpIdx == DataIdx) {
5995     if (VDstIdx != -1 &&
5996         RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5997       return false;
5998     // DS instructions with 2 data operands must also have both in the same bank (VGPR or AGPR).
5999     const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6000     if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6001         RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6002       return false;
6003   }
6004 
6005   // Check V_ACCVGPR_WRITE_B32_e64
6006   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6007       (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6008       RI.isSGPRReg(MRI, MO.getReg()))
6009     return false;
6010   return true;
6011 }
6012 
6013 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6014                                      const MCOperandInfo &OpInfo,
6015                                      const MachineOperand &MO) const {
6016   if (MO.isReg())
6017     return isLegalRegOperand(MRI, OpInfo, MO);
6018 
6019   // Handle non-register types that are treated like immediates.
6020   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6021   return true;
6022 }
6023 
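// Checks whether \p MO (or the existing operand at \p OpIdx) would be legal in
// that position: constant bus and literal limits for VALU instructions, the
// single-literal rule for SALU instructions, register class compatibility, and
// whether a 64-bit immediate can actually be encoded.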
6024 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6025                                  const MachineOperand *MO) const {
6026   const MachineFunction &MF = *MI.getParent()->getParent();
6027   const MachineRegisterInfo &MRI = MF.getRegInfo();
6028   const MCInstrDesc &InstDesc = MI.getDesc();
6029   const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6030   const TargetRegisterClass *DefinedRC =
6031       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
6032   if (!MO)
6033     MO = &MI.getOperand(OpIdx);
6034 
6035   const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6036 
6037   if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6038     const MachineOperand *UsedLiteral = nullptr;
6039 
6040     int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6041     int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6042 
6043     // TODO: Be more permissive with frame indexes.
6044     if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6045       if (!LiteralLimit--)
6046         return false;
6047 
6048       UsedLiteral = MO;
6049     }
6050 
6051     SmallDenseSet<RegSubRegPair> SGPRsUsed;
6052     if (MO->isReg())
6053       SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6054 
6055     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6056       if (i == OpIdx)
6057         continue;
6058       const MachineOperand &Op = MI.getOperand(i);
6059       if (Op.isReg()) {
6060         RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6061         if (!SGPRsUsed.count(SGPR) &&
6062             // FIXME: This can access off the end of the operands() array.
6063             usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
6064           if (--ConstantBusLimit <= 0)
6065             return false;
6066           SGPRsUsed.insert(SGPR);
6067         }
6068       } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
6069                  !isInlineConstant(Op, InstDesc.operands()[i])) {
6070         // The same literal may be used multiple times.
6071         if (!UsedLiteral)
6072           UsedLiteral = &Op;
6073         else if (UsedLiteral->isIdenticalTo(Op))
6074           continue;
6075 
6076         if (!LiteralLimit--)
6077           return false;
6078         if (--ConstantBusLimit <= 0)
6079           return false;
6080       }
6081     }
6082   } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6083     // There can be at most one literal operand, but it can be repeated.
6084     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6085       if (i == OpIdx)
6086         continue;
6087       const MachineOperand &Op = MI.getOperand(i);
6088       if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6089           !isInlineConstant(Op, InstDesc.operands()[i]) &&
6090           !Op.isIdenticalTo(*MO))
6091         return false;
6092 
6093       // Do not fold a frame index into an instruction that already has a frame
6094       // index. The frame index handling code doesn't handle fixing up operand
6095       // constraints if there are multiple indexes.
6096       if (Op.isFI() && MO->isFI())
6097         return false;
6098     }
6099   } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6100              isF16PseudoScalarTrans(MI.getOpcode())) {
6101     return false;
6102   }
6103 
6104   if (MO->isReg()) {
6105     if (!DefinedRC)
6106       return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6107     return isLegalRegOperand(MI, OpIdx, *MO);
6108   }
6109 
6110   if (MO->isImm()) {
6111     uint64_t Imm = MO->getImm();
6112     bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6113     bool Is64BitOp = Is64BitFPOp ||
6114                      OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6115                      OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6116                      OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6117     if (Is64BitOp &&
6118         !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6119       if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6120           (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6121         return false;
6122 
6123       // FIXME: We can use sign extended 64-bit literals, but only for signed
6124       //        operands. At the moment we do not know if an operand is signed.
6125       //        operands. At the moment we do not know if an operand is signed.
6126       //        Such an operand will be encoded as its low 32 bits and then either
6127       //        correctly sign extended or incorrectly zero extended by HW.
6128       //        If 64-bit literals are supported and the literal will be encoded
6129       if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6130           (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6131         return false;
6132     }
6133   }
6134 
6135   // Handle non-register types that are treated like immediates.
6136   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6137 
6138   if (!DefinedRC) {
6139     // This operand expects an immediate.
6140     return true;
6141   }
6142 
6143   return isImmOperandLegal(MI, OpIdx, *MO);
6144 }
6145 
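// Legalizes the operands of a VOP2 instruction: constrains src0/src1 to what
// the encoding allows (constant bus limits, no AGPRs), handles the
// V_WRITELANE/V_READLANE and V_FMAC special cases, and prefers commuting over
// inserting moves when that alone makes src1 legal.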
6146 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6147                                        MachineInstr &MI) const {
6148   unsigned Opc = MI.getOpcode();
6149   const MCInstrDesc &InstrDesc = get(Opc);
6150 
6151   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6152   MachineOperand &Src0 = MI.getOperand(Src0Idx);
6153 
6154   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6155   MachineOperand &Src1 = MI.getOperand(Src1Idx);
6156 
6157   // If there is an implicit SGPR use such as the VCC use of v_addc_u32/v_subb_u32,
6158   // we may only have one constant bus use in total before GFX10.
6159   bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6160   if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6161       RI.isSGPRReg(MRI, Src0.getReg()))
6162     legalizeOpWithMove(MI, Src0Idx);
6163 
6164   // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6165   // both the value to write (src0) and lane select (src1).  Fix up non-SGPR
6166   // src0/src1 with V_READFIRSTLANE.
6167   if (Opc == AMDGPU::V_WRITELANE_B32) {
6168     const DebugLoc &DL = MI.getDebugLoc();
6169     if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6170       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6171       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6172           .add(Src0);
6173       Src0.ChangeToRegister(Reg, false);
6174     }
6175     if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6176       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6177       const DebugLoc &DL = MI.getDebugLoc();
6178       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6179           .add(Src1);
6180       Src1.ChangeToRegister(Reg, false);
6181     }
6182     return;
6183   }
6184 
6185   // No VOP2 instructions support AGPRs.
6186   if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
6187     legalizeOpWithMove(MI, Src0Idx);
6188 
6189   if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
6190     legalizeOpWithMove(MI, Src1Idx);
6191 
6192   // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6193   if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6194     int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6195     if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6196       legalizeOpWithMove(MI, Src2Idx);
6197   }
6198 
6199   // VOP2 src0 instructions support all operand types, so we don't need to check
6200   // their legality. If src1 is already legal, we don't need to do anything.
6201   if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6202     return;
6203 
6204   // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6205   // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6206   // select is uniform.
6207   if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6208       RI.isVGPR(MRI, Src1.getReg())) {
6209     Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6210     const DebugLoc &DL = MI.getDebugLoc();
6211     BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6212         .add(Src1);
6213     Src1.ChangeToRegister(Reg, false);
6214     return;
6215   }
6216 
6217   // We do not use commuteInstruction here because it is too aggressive and will
6218   // commute if it is possible. We only want to commute here if it improves
6219   // legality. This can be called a fairly large number of times so don't waste
6220   // compile time pointlessly swapping and checking legality again.
6221   if (HasImplicitSGPR || !MI.isCommutable()) {
6222     legalizeOpWithMove(MI, Src1Idx);
6223     return;
6224   }
6225 
6226   // If src0 can be used as src1, commuting will make the operands legal.
6227   // Otherwise we have to give up and insert a move.
6228   //
6229   // TODO: Other immediate-like operand kinds could be commuted if there was a
6230   // MachineOperand::ChangeTo* for them.
6231   if ((!Src1.isImm() && !Src1.isReg()) ||
6232       !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6233     legalizeOpWithMove(MI, Src1Idx);
6234     return;
6235   }
6236 
6237   int CommutedOpc = commuteOpcode(MI);
6238   if (CommutedOpc == -1) {
6239     legalizeOpWithMove(MI, Src1Idx);
6240     return;
6241   }
6242 
6243   MI.setDesc(get(CommutedOpc));
6244 
6245   Register Src0Reg = Src0.getReg();
6246   unsigned Src0SubReg = Src0.getSubReg();
6247   bool Src0Kill = Src0.isKill();
6248 
6249   if (Src1.isImm())
6250     Src0.ChangeToImmediate(Src1.getImm());
6251   else if (Src1.isReg()) {
6252     Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6253     Src0.setSubReg(Src1.getSubReg());
6254   } else
6255     llvm_unreachable("Should only have register or immediate operands");
6256 
6257   Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6258   Src1.setSubReg(Src0SubReg);
6259   fixImplicitOperands(MI);
6260 }
6261 
6262 // Legalize VOP3 operands. All operand types are supported for any operand,
6263 // but only one literal constant is allowed, and only starting from GFX10.
6264 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6265                                        MachineInstr &MI) const {
6266   unsigned Opc = MI.getOpcode();
6267 
6268   int VOP3Idx[3] = {
6269     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6270     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6271     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6272   };
6273 
6274   if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6275       Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
6276     // src1 and src2 must be scalar
6277     MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6278     MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6279     const DebugLoc &DL = MI.getDebugLoc();
6280     if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6281       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6282       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6283         .add(Src1);
6284       Src1.ChangeToRegister(Reg, false);
6285     }
6286     if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6287       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6288       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6289         .add(Src2);
6290       Src2.ChangeToRegister(Reg, false);
6291     }
6292   }
6293 
6294   // Find the one SGPR operand we are allowed to use.
6295   int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6296   int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6297   SmallDenseSet<unsigned> SGPRsUsed;
6298   Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6299   if (SGPRReg) {
6300     SGPRsUsed.insert(SGPRReg);
6301     --ConstantBusLimit;
6302   }
6303 
6304   for (int Idx : VOP3Idx) {
6305     if (Idx == -1)
6306       break;
6307     MachineOperand &MO = MI.getOperand(Idx);
6308 
6309     if (!MO.isReg()) {
6310       if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6311         continue;
6312 
6313       if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6314         --LiteralLimit;
6315         --ConstantBusLimit;
6316         continue;
6317       }
6318 
6319       --LiteralLimit;
6320       --ConstantBusLimit;
6321       legalizeOpWithMove(MI, Idx);
6322       continue;
6323     }
6324 
6325     if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6326         !isOperandLegal(MI, Idx, &MO)) {
6327       legalizeOpWithMove(MI, Idx);
6328       continue;
6329     }
6330 
6331     if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6332       continue; // VGPRs are legal
6333 
6334     // We can use one SGPR in each VOP3 instruction prior to GFX10
6335     // and two starting from GFX10.
6336     if (SGPRsUsed.count(MO.getReg()))
6337       continue;
6338     if (ConstantBusLimit > 0) {
6339       SGPRsUsed.insert(MO.getReg());
6340       --ConstantBusLimit;
6341       continue;
6342     }
6343 
6344     // If we make it this far, then the operand is not legal and we must
6345     // legalize it.
6346     legalizeOpWithMove(MI, Idx);
6347   }
6348 
6349   // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6350   if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6351       !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6352     legalizeOpWithMove(MI, VOP3Idx[2]);
6353 }
6354 
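// Copies the value in VGPR (or AGPR) \p SrcReg into a new SGPR virtual
// register by emitting one V_READFIRSTLANE_B32 per 32-bit piece and combining
// the pieces with a REG_SEQUENCE when the value is wider than 32 bits.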
6355 Register SIInstrInfo::readlaneVGPRToSGPR(
6356     Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6357     const TargetRegisterClass *DstRC /*=nullptr*/) const {
6358   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6359   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6360   if (DstRC)
6361     SRC = RI.getCommonSubClass(SRC, DstRC);
6362 
6363   Register DstReg = MRI.createVirtualRegister(SRC);
6364   unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6365 
6366   if (RI.hasAGPRs(VRC)) {
6367     VRC = RI.getEquivalentVGPRClass(VRC);
6368     Register NewSrcReg = MRI.createVirtualRegister(VRC);
6369     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6370             get(TargetOpcode::COPY), NewSrcReg)
6371         .addReg(SrcReg);
6372     SrcReg = NewSrcReg;
6373   }
6374 
6375   if (SubRegs == 1) {
6376     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6377             get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6378         .addReg(SrcReg);
6379     return DstReg;
6380   }
6381 
6382   SmallVector<Register, 8> SRegs;
6383   for (unsigned i = 0; i < SubRegs; ++i) {
6384     Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6385     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6386             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6387         .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6388     SRegs.push_back(SGPR);
6389   }
6390 
6391   MachineInstrBuilder MIB =
6392       BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6393               get(AMDGPU::REG_SEQUENCE), DstReg);
6394   for (unsigned i = 0; i < SubRegs; ++i) {
6395     MIB.addReg(SRegs[i]);
6396     MIB.addImm(RI.getSubRegFromChannel(i));
6397   }
6398   return DstReg;
6399 }
6400 
6401 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6402                                        MachineInstr &MI) const {
6403 
6404   // If the pointer is stored in VGPRs, then we need to move it to
6405   // SGPRs using v_readfirstlane.  This is safe because we only select
6406   // loads with uniform pointers for SMRD instructions, so we know the
6407   // pointer value is uniform.
6408   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6409   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6410     Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6411     SBase->setReg(SGPR);
6412   }
6413   MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6414   if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6415     Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6416     SOff->setReg(SGPR);
6417   }
6418 }
6419 
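// If the saddr operand of a segment-specific FLAT instruction holds a VGPR,
// try to rewrite the instruction to its vaddr-only (global) or SV
// (flat-scratch) form, moving the pointer into the vaddr slot. Returns true
// on success.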
6420 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6421   unsigned Opc = Inst.getOpcode();
6422   int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6423   if (OldSAddrIdx < 0)
6424     return false;
6425 
6426   assert(isSegmentSpecificFLAT(Inst));
6427 
6428   int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6429   if (NewOpc < 0)
6430     NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6431   if (NewOpc < 0)
6432     return false;
6433 
6434   MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6435   MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6436   if (RI.isSGPRReg(MRI, SAddr.getReg()))
6437     return false;
6438 
6439   int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6440   if (NewVAddrIdx < 0)
6441     return false;
6442 
6443   int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6444 
6445   // Check vaddr; it must be zero or absent.
6446   MachineInstr *VAddrDef = nullptr;
6447   if (OldVAddrIdx >= 0) {
6448     MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6449     VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6450     if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6451         !VAddrDef->getOperand(1).isImm() ||
6452         VAddrDef->getOperand(1).getImm() != 0)
6453       return false;
6454   }
6455 
6456   const MCInstrDesc &NewDesc = get(NewOpc);
6457   Inst.setDesc(NewDesc);
6458 
6459   // Callers expect the iterator to be valid after this call, so modify the
6460   // instruction in place.
6461   if (OldVAddrIdx == NewVAddrIdx) {
6462     MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6463     // Clear use list from the old vaddr holding a zero register.
6464     MRI.removeRegOperandFromUseList(&NewVAddr);
6465     MRI.moveOperands(&NewVAddr, &SAddr, 1);
6466     Inst.removeOperand(OldSAddrIdx);
6467     // Update the use list with the pointer we have just moved from the vaddr to
6468     // the saddr position. Otherwise the new vaddr will be missing from the use list.
6469     MRI.removeRegOperandFromUseList(&NewVAddr);
6470     MRI.addRegOperandToUseList(&NewVAddr);
6471   } else {
6472     assert(OldSAddrIdx == NewVAddrIdx);
6473 
6474     if (OldVAddrIdx >= 0) {
6475       int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6476                                                  AMDGPU::OpName::vdst_in);
6477 
6478       // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6479       // it asserts. Untie the operands for now and retie them afterwards.
6480       if (NewVDstIn != -1) {
6481         int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6482         Inst.untieRegOperand(OldVDstIn);
6483       }
6484 
6485       Inst.removeOperand(OldVAddrIdx);
6486 
6487       if (NewVDstIn != -1) {
6488         int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6489         Inst.tieOperands(NewVDst, NewVDstIn);
6490       }
6491     }
6492   }
6493 
6494   if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6495     VAddrDef->eraseFromParent();
6496 
6497   return true;
6498 }
6499 
6500 // FIXME: Remove this when SelectionDAG is obsoleted.
6501 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6502                                        MachineInstr &MI) const {
6503   if (!isSegmentSpecificFLAT(MI))
6504     return;
6505 
6506   // Fix up SGPR operands held in VGPRs. We only select these when DAG divergence
6507   // analysis thinks they are uniform, so a readfirstlane should be valid.
6508   MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6509   if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6510     return;
6511 
6512   if (moveFlatAddrToVGPR(MI))
6513     return;
6514 
6515   const TargetRegisterClass *DeclaredRC = getRegClass(
6516       MI.getDesc(), SAddr->getOperandNo(), &RI, *MI.getParent()->getParent());
6517 
6518   Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6519   SAddr->setReg(ToSGPR);
6520 }
6521 
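// Rewrites \p Op to use a fresh virtual register of class \p DstRC, inserting
// a COPY at \p I. Immediate-defined sources may be folded into the copy, and
// an implicit EXEC use is added to non-SGPR copies whose source does not trace
// back to an IMPLICIT_DEF.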
6522 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6523                                          MachineBasicBlock::iterator I,
6524                                          const TargetRegisterClass *DstRC,
6525                                          MachineOperand &Op,
6526                                          MachineRegisterInfo &MRI,
6527                                          const DebugLoc &DL) const {
6528   Register OpReg = Op.getReg();
6529   unsigned OpSubReg = Op.getSubReg();
6530 
6531   const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6532       RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6533 
6534   // Check if operand is already the correct register class.
6535   if (DstRC == OpRC)
6536     return;
6537 
6538   Register DstReg = MRI.createVirtualRegister(DstRC);
6539   auto Copy =
6540       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6541   Op.setReg(DstReg);
6542 
6543   MachineInstr *Def = MRI.getVRegDef(OpReg);
6544   if (!Def)
6545     return;
6546 
6547   // Try to eliminate the copy if it is copying an immediate value.
6548   if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6549     foldImmediate(*Copy, *Def, OpReg, &MRI);
6550 
6551   bool ImpDef = Def->isImplicitDef();
6552   while (!ImpDef && Def && Def->isCopy()) {
6553     if (Def->getOperand(1).getReg().isPhysical())
6554       break;
6555     Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6556     ImpDef = Def && Def->isImplicitDef();
6557   }
6558   if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6559       !ImpDef)
6560     Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6561 }
6562 
6563 // Emit the actual waterfall loop, executing the wrapped instruction for each
6564 // unique value of \p ScalarOps across all lanes. In the best case we execute 1
6565 // iteration, in the worst case we execute 64 (once per lane).
6566 static void
6567 emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
6568                               MachineRegisterInfo &MRI,
6569                               MachineBasicBlock &LoopBB,
6570                               MachineBasicBlock &BodyBB,
6571                               const DebugLoc &DL,
6572                               ArrayRef<MachineOperand *> ScalarOps) {
6573   MachineFunction &MF = *LoopBB.getParent();
6574   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6575   const SIRegisterInfo *TRI = ST.getRegisterInfo();
6576   unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6577   unsigned SaveExecOpc =
6578       ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6579   unsigned XorTermOpc =
6580       ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6581   unsigned AndOpc =
6582       ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6583   const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6584 
6585   MachineBasicBlock::iterator I = LoopBB.begin();
6586   Register CondReg;
6587 
6588   for (MachineOperand *ScalarOp : ScalarOps) {
6589     unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6590     unsigned NumSubRegs = RegSize / 32;
6591     Register VScalarOp = ScalarOp->getReg();
6592 
6593     if (NumSubRegs == 1) {
6594       Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6595 
6596       BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6597           .addReg(VScalarOp);
6598 
6599       Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6600 
6601       BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6602           .addReg(CurReg)
6603           .addReg(VScalarOp);
6604 
6605       // Combine the comparison results with AND.
6606       if (!CondReg) // First.
6607         CondReg = NewCondReg;
6608       else { // If not the first, we create an AND.
6609         Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6610         BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6611             .addReg(CondReg)
6612             .addReg(NewCondReg);
6613         CondReg = AndReg;
6614       }
6615 
6616       // Update ScalarOp operand to use the SGPR ScalarOp.
6617       ScalarOp->setReg(CurReg);
6618       ScalarOp->setIsKill();
6619     } else {
6620       SmallVector<Register, 8> ReadlanePieces;
6621       unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6622       assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6623              "Unhandled register size");
6624 
6625       for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6626         Register CurRegLo =
6627             MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6628         Register CurRegHi =
6629             MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6630 
6631         // Read the next variant <- also loop target.
6632         BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6633             .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6634 
6635         // Read the next variant <- also loop target.
6636         BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6637             .addReg(VScalarOp, VScalarOpUndef,
6638                     TRI->getSubRegFromChannel(Idx + 1));
6639 
6640         ReadlanePieces.push_back(CurRegLo);
6641         ReadlanePieces.push_back(CurRegHi);
6642 
6643         // Comparison is to be done as 64-bit.
6644         Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6645         BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6646             .addReg(CurRegLo)
6647             .addImm(AMDGPU::sub0)
6648             .addReg(CurRegHi)
6649             .addImm(AMDGPU::sub1);
6650 
6651         Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6652         auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6653                            NewCondReg)
6654                        .addReg(CurReg);
6655         if (NumSubRegs <= 2)
6656           Cmp.addReg(VScalarOp);
6657         else
6658           Cmp.addReg(VScalarOp, VScalarOpUndef,
6659                      TRI->getSubRegFromChannel(Idx, 2));
6660 
6661         // Combine the comparison results with AND.
6662         if (!CondReg) // First.
6663           CondReg = NewCondReg;
6664         else { // If not the first, we create an AND.
6665           Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6666           BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6667               .addReg(CondReg)
6668               .addReg(NewCondReg);
6669           CondReg = AndReg;
6670         }
6671       } // End for loop.
6672 
6673       const auto *SScalarOpRC =
6674           TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6675       Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6676 
6677       // Build scalar ScalarOp.
6678       auto Merge =
6679           BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6680       unsigned Channel = 0;
6681       for (Register Piece : ReadlanePieces) {
6682         Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6683       }
6684 
6685       // Update ScalarOp operand to use the SGPR ScalarOp.
6686       ScalarOp->setReg(SScalarOp);
6687       ScalarOp->setIsKill();
6688     }
6689   }
6690 
6691   Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6692   MRI.setSimpleHint(SaveExec, CondReg);
6693 
6694   // Update EXEC to matching lanes, saving original to SaveExec.
6695   BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6696       .addReg(CondReg, RegState::Kill);
6697 
6698   // The original instruction is here; we insert the terminators after it.
6699   I = BodyBB.end();
6700 
6701   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6702   BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6703       .addReg(Exec)
6704       .addReg(SaveExec);
6705 
6706   BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6707 }
6708 
6709 // Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6710 // with SGPRs by iterating over all unique values across all lanes.
6711 // Returns the loop basic block that now contains \p MI.
6712 static MachineBasicBlock *
6713 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6714                                ArrayRef<MachineOperand *> ScalarOps,
6715                                MachineDominatorTree *MDT,
6716                                MachineBasicBlock::iterator Begin = nullptr,
6717                                MachineBasicBlock::iterator End = nullptr) {
6718   MachineBasicBlock &MBB = *MI.getParent();
6719   MachineFunction &MF = *MBB.getParent();
6720   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6721   const SIRegisterInfo *TRI = ST.getRegisterInfo();
6722   MachineRegisterInfo &MRI = MF.getRegInfo();
6723   if (!Begin.isValid())
6724     Begin = &MI;
6725   if (!End.isValid()) {
6726     End = &MI;
6727     ++End;
6728   }
6729   const DebugLoc &DL = MI.getDebugLoc();
6730   unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6731   unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6732   const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6733 
6734   // Save SCC. Waterfall Loop may overwrite SCC.
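  // SCC cannot be copied directly, so it is materialized into an SGPR with
  // S_CSELECT_B32 (1 if SCC is set, 0 otherwise) and re-established after the
  // loop with an S_CMP_LG_U32 against 0.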
6735   Register SaveSCCReg;
6736 
6737   // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6738   // rather than doing an unbounded liveness scan here.
6739   bool SCCNotDead =
6740       MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6741                                   std::numeric_limits<unsigned>::max()) !=
6742       MachineBasicBlock::LQR_Dead;
6743   if (SCCNotDead) {
6744     SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6745     BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6746         .addImm(1)
6747         .addImm(0);
6748   }
6749 
6750   Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6751 
6752   // Save the EXEC mask
6753   BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6754 
6755   // Killed uses in the instruction we are waterfalling around will be
6756   // incorrect due to the added control-flow.
6757   MachineBasicBlock::iterator AfterMI = MI;
6758   ++AfterMI;
6759   for (auto I = Begin; I != AfterMI; I++) {
6760     for (auto &MO : I->all_uses())
6761       MRI.clearKillFlags(MO.getReg());
6762   }
6763 
6764   // To insert the loop we need to split the block. Move everything after this
6765   // point to a new block, and insert a new empty block between the two.
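  // The resulting CFG, with LoopBB looping via BodyBB until every unique
  // scalar value has been processed:
  //
  //   MBB -> LoopBB -> BodyBB -> RemainderBB
  //             ^          |
  //             +----------+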
6766   MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6767   MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6768   MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6769   MachineFunction::iterator MBBI(MBB);
6770   ++MBBI;
6771 
6772   MF.insert(MBBI, LoopBB);
6773   MF.insert(MBBI, BodyBB);
6774   MF.insert(MBBI, RemainderBB);
6775 
6776   LoopBB->addSuccessor(BodyBB);
6777   BodyBB->addSuccessor(LoopBB);
6778   BodyBB->addSuccessor(RemainderBB);
6779 
6780   // Move the range from Begin through MI into BodyBB, and the remainder of
6781   // the block into RemainderBB.
6782   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6783   RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6784   BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6785 
6786   MBB.addSuccessor(LoopBB);
6787 
6788   // Update dominators. We know that MBB immediately dominates LoopBB, that
6789   // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6790   // RemainderBB. RemainderBB immediately dominates all of the successors
6791   // transferred to it from MBB that MBB used to properly dominate.
6792   if (MDT) {
6793     MDT->addNewBlock(LoopBB, &MBB);
6794     MDT->addNewBlock(BodyBB, LoopBB);
6795     MDT->addNewBlock(RemainderBB, BodyBB);
6796     for (auto &Succ : RemainderBB->successors()) {
6797       if (MDT->properlyDominates(&MBB, Succ)) {
6798         MDT->changeImmediateDominator(Succ, RemainderBB);
6799       }
6800     }
6801   }
6802 
6803   emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
6804 
6805   MachineBasicBlock::iterator First = RemainderBB->begin();
6806   // Restore SCC
6807   if (SCCNotDead) {
6808     BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6809         .addReg(SaveSCCReg, RegState::Kill)
6810         .addImm(0);
6811   }
6812 
6813   // Restore the EXEC mask
6814   BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6815   return BodyBB;
6816 }
6817 
6818 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
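// The replacement descriptor is { base = 0 (64 bits), RSRC_DATA_FORMAT[31:0],
// RSRC_DATA_FORMAT[63:32] }, i.e. a null base pointer combined with the
// default buffer resource data format.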
6819 static std::tuple<unsigned, unsigned>
6820 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
6821   MachineBasicBlock &MBB = *MI.getParent();
6822   MachineFunction &MF = *MBB.getParent();
6823   MachineRegisterInfo &MRI = MF.getRegInfo();
6824 
6825   // Extract the ptr from the resource descriptor.
6826   unsigned RsrcPtr =
6827       TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6828                              AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6829 
6830   // Create an empty resource descriptor
6831   Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6832   Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6833   Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6834   Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6835   uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6836 
6837   // Zero64 = 0
6838   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6839       .addImm(0);
6840 
6841   // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6842   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6843       .addImm(Lo_32(RsrcDataFormat));
6844 
6845   // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6846   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6847       .addImm(Hi_32(RsrcDataFormat));
6848 
6849   // NewSRsrc = {Zero64, SRsrcFormat}
6850   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6851       .addReg(Zero64)
6852       .addImm(AMDGPU::sub0_sub1)
6853       .addReg(SRsrcFormatLo)
6854       .addImm(AMDGPU::sub2)
6855       .addReg(SRsrcFormatHi)
6856       .addImm(AMDGPU::sub3);
6857 
6858   return std::tuple(RsrcPtr, NewSRsrc);
6859 }
6860 
6861 MachineBasicBlock *
6862 SIInstrInfo::legalizeOperands(MachineInstr &MI,
6863                               MachineDominatorTree *MDT) const {
6864   MachineFunction &MF = *MI.getParent()->getParent();
6865   MachineRegisterInfo &MRI = MF.getRegInfo();
6866   MachineBasicBlock *CreatedBB = nullptr;
6867 
6868   // Legalize VOP2
6869   if (isVOP2(MI) || isVOPC(MI)) {
6870     legalizeOperandsVOP2(MRI, MI);
6871     return CreatedBB;
6872   }
6873 
6874   // Legalize VOP3
6875   if (isVOP3(MI)) {
6876     legalizeOperandsVOP3(MRI, MI);
6877     return CreatedBB;
6878   }
6879 
6880   // Legalize SMRD
6881   if (isSMRD(MI)) {
6882     legalizeOperandsSMRD(MRI, MI);
6883     return CreatedBB;
6884   }
6885 
6886   // Legalize FLAT
6887   if (isFLAT(MI)) {
6888     legalizeOperandsFLAT(MRI, MI);
6889     return CreatedBB;
6890   }
6891 
6892   // Legalize REG_SEQUENCE and PHI
6893   // The register class of the operands much be the same type as the register
6894   // class of the output.
6895   if (MI.getOpcode() == AMDGPU::PHI) {
6896     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6897     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6898       if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6899         continue;
6900       const TargetRegisterClass *OpRC =
6901           MRI.getRegClass(MI.getOperand(i).getReg());
6902       if (RI.hasVectorRegisters(OpRC)) {
6903         VRC = OpRC;
6904       } else {
6905         SRC = OpRC;
6906       }
6907     }
6908 
6909     // If any of the operands are VGPR registers, then they all must be
6910     // VGPRs; otherwise we will create illegal VGPR->SGPR copies when
6911     // legalizing them.
6912     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6913       if (!VRC) {
6914         assert(SRC);
6915         if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6916           VRC = &AMDGPU::VReg_1RegClass;
6917         } else
6918           VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6919                     ? RI.getEquivalentAGPRClass(SRC)
6920                     : RI.getEquivalentVGPRClass(SRC);
6921       } else {
6922         VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6923                   ? RI.getEquivalentAGPRClass(VRC)
6924                   : RI.getEquivalentVGPRClass(VRC);
6925       }
6926       RC = VRC;
6927     } else {
6928       RC = SRC;
6929     }
6930 
6931     // Update all the operands so they have the same type.
6932     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6933       MachineOperand &Op = MI.getOperand(I);
6934       if (!Op.isReg() || !Op.getReg().isVirtual())
6935         continue;
6936 
6937       // MI is a PHI instruction.
6938       MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6939       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
6940 
6941       // Avoid creating no-op copies with the same src and dst reg class.  These
6942       // confuse some of the machine passes.
6943       legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6944     }
6945   }
6946 
6947   // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6948   // VGPR dest type and SGPR sources, insert copies so all operands are
6949   // VGPRs. This seems to help operand folding / the register coalescer.
6950   if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6951     MachineBasicBlock *MBB = MI.getParent();
6952     const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6953     if (RI.hasVGPRs(DstRC)) {
6954       // Update all the operands so they are VGPR register classes. These may
6955       // not be the same register class because REG_SEQUENCE supports mixing
6956       // subregister index types e.g. sub0_sub1 + sub2 + sub3
6957       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6958         MachineOperand &Op = MI.getOperand(I);
6959         if (!Op.isReg() || !Op.getReg().isVirtual())
6960           continue;
6961 
6962         const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6963         const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6964         if (VRC == OpRC)
6965           continue;
6966 
6967         legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6968         Op.setIsKill();
6969       }
6970     }
6971 
6972     return CreatedBB;
6973   }
6974 
6975   // Legalize INSERT_SUBREG
6976   // src0 must have the same register class as dst
6977   if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6978     Register Dst = MI.getOperand(0).getReg();
6979     Register Src0 = MI.getOperand(1).getReg();
6980     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6981     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6982     if (DstRC != Src0RC) {
6983       MachineBasicBlock *MBB = MI.getParent();
6984       MachineOperand &Op = MI.getOperand(1);
6985       legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6986     }
6987     return CreatedBB;
6988   }
6989 
6990   // Legalize SI_INIT_M0
6991   if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6992     MachineOperand &Src = MI.getOperand(0);
6993     if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6994       Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6995     return CreatedBB;
6996   }
6997 
6998   // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6999   if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7000       MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7001       MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7002       MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7003       MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7004       MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7005       MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7006     MachineOperand &Src = MI.getOperand(1);
7007     if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7008       Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7009     return CreatedBB;
7010   }
7011 
7012   // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7013   //
7014   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7015   // scratch memory access. In both cases, the legalization never involves
7016   // conversion to the addr64 form.
7017   if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7018                       (isMUBUF(MI) || isMTBUF(MI)))) {
7019     AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7020                                     ? AMDGPU::OpName::rsrc
7021                                     : AMDGPU::OpName::srsrc;
7022     MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7023     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7024       CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7025 
7026     AMDGPU::OpName SampOpName =
7027         isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7028     MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7029     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7030       CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7031 
7032     return CreatedBB;
7033   }
7034 
7035   // Legalize SI_CALL
7036   if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7037     MachineOperand *Dest = &MI.getOperand(0);
7038     if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7039       // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, along
7040       // with the following copies, into the loop block; copies from and to
7041       // physical registers also need to be moved there.
7042       unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7043       unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7044 
7045       // Also move the copies to physical registers into the loop block
7046       MachineBasicBlock &MBB = *MI.getParent();
7047       MachineBasicBlock::iterator Start(&MI);
7048       while (Start->getOpcode() != FrameSetupOpcode)
7049         --Start;
7050       MachineBasicBlock::iterator End(&MI);
7051       while (End->getOpcode() != FrameDestroyOpcode)
7052         ++End;
7053       // Also include following copies of the return value
7054       ++End;
7055       while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7056              MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7057         ++End;
7058       CreatedBB =
7059           loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7060     }
7061   }
7062 
7063   // Legalize s_sleep_var.
7064   if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7065     const DebugLoc &DL = MI.getDebugLoc();
7066     Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7067     int Src0Idx =
7068         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7069     MachineOperand &Src0 = MI.getOperand(Src0Idx);
7070     BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7071         .add(Src0);
7072     Src0.ChangeToRegister(Reg, false);
7073     return nullptr;
7074   }
7075 
7076   // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7077   // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7078   if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7079       MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7080       MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7081       MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7082     for (MachineOperand &Src : MI.explicit_operands()) {
7083       if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7084         Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7085     }
7086     return CreatedBB;
7087   }
7088 
7089   // Legalize MUBUF instructions.
7090   bool isSoffsetLegal = true;
7091   int SoffsetIdx =
7092       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7093   if (SoffsetIdx != -1) {
7094     MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7095     if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7096         !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7097       isSoffsetLegal = false;
7098     }
7099   }
7100 
7101   bool isRsrcLegal = true;
7102   int RsrcIdx =
7103       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7104   if (RsrcIdx != -1) {
7105     MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7106     if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7107       isRsrcLegal = false;
7108   }
7109 
7110   // The operands are legal.
7111   if (isRsrcLegal && isSoffsetLegal)
7112     return CreatedBB;
7113 
7114   if (!isRsrcLegal) {
7115     // Legalize a VGPR Rsrc
7116     //
7117     // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7118     // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7119     // a zero-value SRsrc.
7120     //
7121     // If the instruction is _OFFSET (both idxen and offen disabled), and we
7122     // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7123     // above.
7124     //
7125     // Otherwise we are on non-ADDR64 hardware, and/or we have
7126     // idxen/offen/bothen and we fall back to a waterfall loop.
7127 
7128     MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7129     MachineBasicBlock &MBB = *MI.getParent();
7130 
7131     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7132     if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7133       // This is already an ADDR64 instruction so we need to add the pointer
7134       // extracted from the resource descriptor to the current value of VAddr.
7135       Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7136       Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7137       Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7138 
7139       const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7140       Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7141       Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7142 
7143       unsigned RsrcPtr, NewSRsrc;
7144       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7145 
7146       // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7147       const DebugLoc &DL = MI.getDebugLoc();
7148       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7149         .addDef(CondReg0)
7150         .addReg(RsrcPtr, 0, AMDGPU::sub0)
7151         .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7152         .addImm(0);
7153 
7154       // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7155       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7156         .addDef(CondReg1, RegState::Dead)
7157         .addReg(RsrcPtr, 0, AMDGPU::sub1)
7158         .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7159         .addReg(CondReg0, RegState::Kill)
7160         .addImm(0);
7161 
7162       // NewVaddr = {NewVaddrHi, NewVaddrLo}
7163       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7164           .addReg(NewVAddrLo)
7165           .addImm(AMDGPU::sub0)
7166           .addReg(NewVAddrHi)
7167           .addImm(AMDGPU::sub1);
7168 
7169       VAddr->setReg(NewVAddr);
7170       Rsrc->setReg(NewSRsrc);
7171     } else if (!VAddr && ST.hasAddr64()) {
7172       // This instruction is the _OFFSET variant, so we need to convert it to
7173       // ADDR64.
7174       assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7175              "FIXME: Need to emit flat atomics here");
7176 
7177       unsigned RsrcPtr, NewSRsrc;
7178       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7179 
7180       Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7181       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7182       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7183       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7184       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7185 
7186       // Atomics with return have an additional tied operand and are
7187       // missing some of the special bits.
7188       MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7189       MachineInstr *Addr64;
7190 
7191       if (!VDataIn) {
7192         // Regular buffer load / store.
7193         MachineInstrBuilder MIB =
7194             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7195                 .add(*VData)
7196                 .addReg(NewVAddr)
7197                 .addReg(NewSRsrc)
7198                 .add(*SOffset)
7199                 .add(*Offset);
7200 
7201         if (const MachineOperand *CPol =
7202                 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7203           MIB.addImm(CPol->getImm());
7204         }
7205 
7206         if (const MachineOperand *TFE =
7207                 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7208           MIB.addImm(TFE->getImm());
7209         }
7210 
7211         MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7212 
7213         MIB.cloneMemRefs(MI);
7214         Addr64 = MIB;
7215       } else {
7216         // Atomics with return.
7217         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7218                      .add(*VData)
7219                      .add(*VDataIn)
7220                      .addReg(NewVAddr)
7221                      .addReg(NewSRsrc)
7222                      .add(*SOffset)
7223                      .add(*Offset)
7224                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7225                      .cloneMemRefs(MI);
7226       }
7227 
7228       MI.removeFromParent();
7229 
7230       // NewVaddr = {NewVaddrHi, NewVaddrLo}
7231       BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7232               NewVAddr)
7233           .addReg(RsrcPtr, 0, AMDGPU::sub0)
7234           .addImm(AMDGPU::sub0)
7235           .addReg(RsrcPtr, 0, AMDGPU::sub1)
7236           .addImm(AMDGPU::sub1);
7237     } else {
7238       // Legalize a VGPR Rsrc and soffset together.
7239       if (!isSoffsetLegal) {
7240         MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7241         CreatedBB =
7242             loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7243         return CreatedBB;
7244       }
7245       CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7246       return CreatedBB;
7247     }
7248   }
7249 
7250   // Legalize a VGPR soffset.
7251   if (!isSoffsetLegal) {
7252     MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7253     CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7254     return CreatedBB;
7255   }
7256   return CreatedBB;
7257 }
7258 
7259 void SIInstrWorklist::insert(MachineInstr *MI) {
7260   InstrList.insert(MI);
7261   // Add MBUF instructions to the deferred list.
7262   int RsrcIdx =
7263       AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7264   if (RsrcIdx != -1) {
7265     DeferredList.insert(MI);
7266   }
7267 }
7268 
7269 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7270   return DeferredList.contains(MI);
7271 }
7272 
7273 // Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7274 // lowering (changing sgpr to vgpr).
7275 // This is mainly caused by 16-bit SALU and 16-bit VALU instructions using
7276 // registers of different sizes, so operand sizes need to be legalized during
7277 // the vgpr lowering chain. This can be removed once sgpr16 is in place.
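// For illustration: a 16-bit VGPR feeding an operand that expects a 32-bit
// VGPR is wrapped in a REG_SEQUENCE whose hi16 half is an IMPLICIT_DEF; going
// the other way, the operand simply takes the lo16 subregister of the 32-bit
// value.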
7278 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7279                                           MachineRegisterInfo &MRI) const {
7280   if (!ST.useRealTrue16Insts())
7281     return;
7282 
7283   unsigned Opcode = MI.getOpcode();
7284   MachineBasicBlock *MBB = MI.getParent();
7285   // Legalize operands and check for size mismatch
7286   if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7287       OpIdx >= get(Opcode).getNumOperands() ||
7288       get(Opcode).operands()[OpIdx].RegClass == -1)
7289     return;
7290 
7291   MachineOperand &Op = MI.getOperand(OpIdx);
7292   if (!Op.isReg() || !Op.getReg().isVirtual())
7293     return;
7294 
7295   const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7296   if (!RI.isVGPRClass(CurrRC))
7297     return;
7298 
7299   unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
7300   const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7301   if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7302     Op.setSubReg(AMDGPU::lo16);
7303   } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7304     const DebugLoc &DL = MI.getDebugLoc();
7305     Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7306     Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7307     BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7308     BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7309         .addReg(Op.getReg())
7310         .addImm(AMDGPU::lo16)
7311         .addReg(Undef)
7312         .addImm(AMDGPU::hi16);
7313     Op.setReg(NewDstReg);
7314   }
7315 }
7316 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7317                                           MachineRegisterInfo &MRI) const {
7318   for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7319     legalizeOperandsVALUt16(MI, OpIdx, MRI);
7320 }
7321 
7322 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7323                              MachineDominatorTree *MDT) const {
7324 
7325   while (!Worklist.empty()) {
7326     MachineInstr &Inst = *Worklist.top();
7327     Worklist.erase_top();
7328     // Skip MachineInstr in the deferred list.
7329     if (Worklist.isDeferred(&Inst))
7330       continue;
7331     moveToVALUImpl(Worklist, MDT, Inst);
7332   }
7333 
7334   // The deferred list of instructions is processed once all the
7335   // MachineInstrs in the worklist have been handled.
7336   for (MachineInstr *Inst : Worklist.getDeferredList()) {
7337     moveToVALUImpl(Worklist, MDT, *Inst);
7338     assert(Worklist.empty() &&
7339            "Deferred MachineInstr are not supposed to re-populate worklist");
7340   }
7341 }
7342 
7343 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7344                                  MachineDominatorTree *MDT,
7345                                  MachineInstr &Inst) const {
7346 
7347   MachineBasicBlock *MBB = Inst.getParent();
7348   if (!MBB)
7349     return;
7350   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7351   unsigned Opcode = Inst.getOpcode();
7352   unsigned NewOpcode = getVALUOp(Inst);
7353   // Handle some special cases
7354   switch (Opcode) {
7355   default:
7356     break;
7357   case AMDGPU::S_ADD_I32:
7358   case AMDGPU::S_SUB_I32: {
7359     // FIXME: The u32 versions currently selected use the carry.
7360     bool Changed;
7361     MachineBasicBlock *CreatedBBTmp = nullptr;
7362     std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7363     if (Changed)
7364       return;
7365 
7366     // Default handling
7367     break;
7368   }
7369 
7370   case AMDGPU::S_MUL_U64:
7371     // Split s_mul_u64 into 32-bit vector multiplications.
7372     splitScalarSMulU64(Worklist, Inst, MDT);
7373     Inst.eraseFromParent();
7374     return;
7375 
7376   case AMDGPU::S_MUL_U64_U32_PSEUDO:
7377   case AMDGPU::S_MUL_I64_I32_PSEUDO:
7378     // This is a special case of s_mul_u64 where all the operands are either
7379     // zero extended or sign extended.
7380     splitScalarSMulPseudo(Worklist, Inst, MDT);
7381     Inst.eraseFromParent();
7382     return;
7383 
7384   case AMDGPU::S_AND_B64:
7385     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7386     Inst.eraseFromParent();
7387     return;
7388 
7389   case AMDGPU::S_OR_B64:
7390     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7391     Inst.eraseFromParent();
7392     return;
7393 
7394   case AMDGPU::S_XOR_B64:
7395     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7396     Inst.eraseFromParent();
7397     return;
7398 
7399   case AMDGPU::S_NAND_B64:
7400     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7401     Inst.eraseFromParent();
7402     return;
7403 
7404   case AMDGPU::S_NOR_B64:
7405     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7406     Inst.eraseFromParent();
7407     return;
7408 
7409   case AMDGPU::S_XNOR_B64:
7410     if (ST.hasDLInsts())
7411       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7412     else
7413       splitScalar64BitXnor(Worklist, Inst, MDT);
7414     Inst.eraseFromParent();
7415     return;
7416 
7417   case AMDGPU::S_ANDN2_B64:
7418     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7419     Inst.eraseFromParent();
7420     return;
7421 
7422   case AMDGPU::S_ORN2_B64:
7423     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7424     Inst.eraseFromParent();
7425     return;
7426 
7427   case AMDGPU::S_BREV_B64:
7428     splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7429     Inst.eraseFromParent();
7430     return;
7431 
7432   case AMDGPU::S_NOT_B64:
7433     splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7434     Inst.eraseFromParent();
7435     return;
7436 
7437   case AMDGPU::S_BCNT1_I32_B64:
7438     splitScalar64BitBCNT(Worklist, Inst);
7439     Inst.eraseFromParent();
7440     return;
7441 
7442   case AMDGPU::S_BFE_I64:
7443     splitScalar64BitBFE(Worklist, Inst);
7444     Inst.eraseFromParent();
7445     return;
7446 
7447   case AMDGPU::S_FLBIT_I32_B64:
7448     splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7449     Inst.eraseFromParent();
7450     return;
7451   case AMDGPU::S_FF1_I32_B64:
7452     splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7453     Inst.eraseFromParent();
7454     return;
7455 
7456   case AMDGPU::S_LSHL_B32:
7457     if (ST.hasOnlyRevVALUShifts()) {
7458       NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7459       swapOperands(Inst);
7460     }
7461     break;
7462   case AMDGPU::S_ASHR_I32:
7463     if (ST.hasOnlyRevVALUShifts()) {
7464       NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7465       swapOperands(Inst);
7466     }
7467     break;
7468   case AMDGPU::S_LSHR_B32:
7469     if (ST.hasOnlyRevVALUShifts()) {
7470       NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7471       swapOperands(Inst);
7472     }
7473     break;
7474   case AMDGPU::S_LSHL_B64:
7475     if (ST.hasOnlyRevVALUShifts()) {
7476       NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7477                       ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7478                       : AMDGPU::V_LSHLREV_B64_e64;
7479       swapOperands(Inst);
7480     }
7481     break;
7482   case AMDGPU::S_ASHR_I64:
7483     if (ST.hasOnlyRevVALUShifts()) {
7484       NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7485       swapOperands(Inst);
7486     }
7487     break;
7488   case AMDGPU::S_LSHR_B64:
7489     if (ST.hasOnlyRevVALUShifts()) {
7490       NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7491       swapOperands(Inst);
7492     }
7493     break;
7494 
7495   case AMDGPU::S_ABS_I32:
7496     lowerScalarAbs(Worklist, Inst);
7497     Inst.eraseFromParent();
7498     return;
7499 
7500   case AMDGPU::S_CBRANCH_SCC0:
7501   case AMDGPU::S_CBRANCH_SCC1: {
7502     // Clear unused bits of vcc
7503     Register CondReg = Inst.getOperand(1).getReg();
7504     bool IsSCC = CondReg == AMDGPU::SCC;
7505     Register VCC = RI.getVCC();
7506     Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7507     unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7508     BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7509         .addReg(EXEC)
7510         .addReg(IsSCC ? VCC : CondReg);
7511     Inst.removeOperand(1);
7512   } break;
7513 
7514   case AMDGPU::S_BFE_U64:
7515   case AMDGPU::S_BFM_B64:
7516     llvm_unreachable("Moving this op to VALU not implemented");
7517 
7518   case AMDGPU::S_PACK_LL_B32_B16:
7519   case AMDGPU::S_PACK_LH_B32_B16:
7520   case AMDGPU::S_PACK_HL_B32_B16:
7521   case AMDGPU::S_PACK_HH_B32_B16:
7522     movePackToVALU(Worklist, MRI, Inst);
7523     Inst.eraseFromParent();
7524     return;
7525 
7526   case AMDGPU::S_XNOR_B32:
7527     lowerScalarXnor(Worklist, Inst);
7528     Inst.eraseFromParent();
7529     return;
7530 
7531   case AMDGPU::S_NAND_B32:
7532     splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7533     Inst.eraseFromParent();
7534     return;
7535 
7536   case AMDGPU::S_NOR_B32:
7537     splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7538     Inst.eraseFromParent();
7539     return;
7540 
7541   case AMDGPU::S_ANDN2_B32:
7542     splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7543     Inst.eraseFromParent();
7544     return;
7545 
7546   case AMDGPU::S_ORN2_B32:
7547     splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7548     Inst.eraseFromParent();
7549     return;
7550 
7551   // TODO: remove as soon as everything is ready
7552   // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7553   // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7554   // can only be selected from the uniform SDNode.
7555   case AMDGPU::S_ADD_CO_PSEUDO:
7556   case AMDGPU::S_SUB_CO_PSEUDO: {
7557     unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7558                        ? AMDGPU::V_ADDC_U32_e64
7559                        : AMDGPU::V_SUBB_U32_e64;
7560     const auto *CarryRC = RI.getWaveMaskRegClass();
7561 
7562     Register CarryInReg = Inst.getOperand(4).getReg();
7563     if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7564       Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7565       BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7566           .addReg(CarryInReg);
7567     }
7568 
7569     Register CarryOutReg = Inst.getOperand(1).getReg();
7570 
7571     Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7572         MRI.getRegClass(Inst.getOperand(0).getReg())));
7573     MachineInstr *CarryOp =
7574         BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7575             .addReg(CarryOutReg, RegState::Define)
7576             .add(Inst.getOperand(2))
7577             .add(Inst.getOperand(3))
7578             .addReg(CarryInReg)
7579             .addImm(0);
7580     legalizeOperands(*CarryOp);
7581     MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7582     addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7583     Inst.eraseFromParent();
7584   }
7585     return;
7586   case AMDGPU::S_UADDO_PSEUDO:
7587   case AMDGPU::S_USUBO_PSEUDO: {
7588     const DebugLoc &DL = Inst.getDebugLoc();
7589     MachineOperand &Dest0 = Inst.getOperand(0);
7590     MachineOperand &Dest1 = Inst.getOperand(1);
7591     MachineOperand &Src0 = Inst.getOperand(2);
7592     MachineOperand &Src1 = Inst.getOperand(3);
7593 
7594     unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7595                        ? AMDGPU::V_ADD_CO_U32_e64
7596                        : AMDGPU::V_SUB_CO_U32_e64;
7597     const TargetRegisterClass *NewRC =
7598         RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7599     Register DestReg = MRI.createVirtualRegister(NewRC);
7600     MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7601                                  .addReg(Dest1.getReg(), RegState::Define)
7602                                  .add(Src0)
7603                                  .add(Src1)
7604                                  .addImm(0); // clamp bit
7605 
7606     legalizeOperands(*NewInstr, MDT);
7607     MRI.replaceRegWith(Dest0.getReg(), DestReg);
7608     addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7609                                  Worklist);
7610     Inst.eraseFromParent();
7611   }
7612     return;
7613 
7614   case AMDGPU::S_CSELECT_B32:
7615   case AMDGPU::S_CSELECT_B64:
7616     lowerSelect(Worklist, Inst, MDT);
7617     Inst.eraseFromParent();
7618     return;
7619   case AMDGPU::S_CMP_EQ_I32:
7620   case AMDGPU::S_CMP_LG_I32:
7621   case AMDGPU::S_CMP_GT_I32:
7622   case AMDGPU::S_CMP_GE_I32:
7623   case AMDGPU::S_CMP_LT_I32:
7624   case AMDGPU::S_CMP_LE_I32:
7625   case AMDGPU::S_CMP_EQ_U32:
7626   case AMDGPU::S_CMP_LG_U32:
7627   case AMDGPU::S_CMP_GT_U32:
7628   case AMDGPU::S_CMP_GE_U32:
7629   case AMDGPU::S_CMP_LT_U32:
7630   case AMDGPU::S_CMP_LE_U32:
7631   case AMDGPU::S_CMP_EQ_U64:
7632   case AMDGPU::S_CMP_LG_U64:
7633   case AMDGPU::S_CMP_LT_F32:
7634   case AMDGPU::S_CMP_EQ_F32:
7635   case AMDGPU::S_CMP_LE_F32:
7636   case AMDGPU::S_CMP_GT_F32:
7637   case AMDGPU::S_CMP_LG_F32:
7638   case AMDGPU::S_CMP_GE_F32:
7639   case AMDGPU::S_CMP_O_F32:
7640   case AMDGPU::S_CMP_U_F32:
7641   case AMDGPU::S_CMP_NGE_F32:
7642   case AMDGPU::S_CMP_NLG_F32:
7643   case AMDGPU::S_CMP_NGT_F32:
7644   case AMDGPU::S_CMP_NLE_F32:
7645   case AMDGPU::S_CMP_NEQ_F32:
7646   case AMDGPU::S_CMP_NLT_F32: {
7647     Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7648     auto NewInstr =
7649         BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7650             .setMIFlags(Inst.getFlags());
7651     if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7652         0) {
7653       NewInstr
7654           .addImm(0)               // src0_modifiers
7655           .add(Inst.getOperand(0)) // src0
7656           .addImm(0)               // src1_modifiers
7657           .add(Inst.getOperand(1)) // src1
7658           .addImm(0);              // clamp
7659     } else {
7660       NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7661     }
7662     legalizeOperands(*NewInstr, MDT);
7663     int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7664     MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7665     addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7666     Inst.eraseFromParent();
7667     return;
7668   }
7669   case AMDGPU::S_CMP_LT_F16:
7670   case AMDGPU::S_CMP_EQ_F16:
7671   case AMDGPU::S_CMP_LE_F16:
7672   case AMDGPU::S_CMP_GT_F16:
7673   case AMDGPU::S_CMP_LG_F16:
7674   case AMDGPU::S_CMP_GE_F16:
7675   case AMDGPU::S_CMP_O_F16:
7676   case AMDGPU::S_CMP_U_F16:
7677   case AMDGPU::S_CMP_NGE_F16:
7678   case AMDGPU::S_CMP_NLG_F16:
7679   case AMDGPU::S_CMP_NGT_F16:
7680   case AMDGPU::S_CMP_NLE_F16:
7681   case AMDGPU::S_CMP_NEQ_F16:
7682   case AMDGPU::S_CMP_NLT_F16: {
7683     Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7684     auto NewInstr =
7685         BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7686         .setMIFlags(Inst.getFlags());
7687     if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7688       NewInstr
7689           .addImm(0)               // src0_modifiers
7690           .add(Inst.getOperand(0)) // src0
7691           .addImm(0)               // src1_modifiers
7692           .add(Inst.getOperand(1)) // src1
7693           .addImm(0);              // clamp
7694       if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7695         NewInstr.addImm(0); // op_sel0
7696     } else {
7697       NewInstr
7698           .add(Inst.getOperand(0))
7699           .add(Inst.getOperand(1));
7700     }
7701     legalizeOperandsVALUt16(*NewInstr, MRI);
7702     legalizeOperands(*NewInstr, MDT);
7703     int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7704     MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7705     addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7706     Inst.eraseFromParent();
7707     return;
7708   }
7709   case AMDGPU::S_CVT_HI_F32_F16: {
7710     const DebugLoc &DL = Inst.getDebugLoc();
7711     Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7712     Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7713     if (ST.useRealTrue16Insts()) {
7714       BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7715           .add(Inst.getOperand(1));
7716       BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7717           .addImm(0) // src0_modifiers
7718           .addReg(TmpReg, 0, AMDGPU::hi16)
7719           .addImm(0)  // clamp
7720           .addImm(0)  // omod
7721           .addImm(0); // op_sel0
7722     } else {
7723       BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7724           .addImm(16)
7725           .add(Inst.getOperand(1));
7726       BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7727           .addImm(0) // src0_modifiers
7728           .addReg(TmpReg)
7729           .addImm(0)  // clamp
7730           .addImm(0); // omod
7731     }
7732 
7733     MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7734     addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7735     Inst.eraseFromParent();
7736     return;
7737   }
7738   case AMDGPU::S_MINIMUM_F32:
7739   case AMDGPU::S_MAXIMUM_F32: {
7740     const DebugLoc &DL = Inst.getDebugLoc();
7741     Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7742     MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7743                                  .addImm(0) // src0_modifiers
7744                                  .add(Inst.getOperand(1))
7745                                  .addImm(0) // src1_modifiers
7746                                  .add(Inst.getOperand(2))
7747                                  .addImm(0)  // clamp
7748                                  .addImm(0); // omod
7749     MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7750 
7751     legalizeOperands(*NewInstr, MDT);
7752     addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7753     Inst.eraseFromParent();
7754     return;
7755   }
7756   case AMDGPU::S_MINIMUM_F16:
7757   case AMDGPU::S_MAXIMUM_F16: {
7758     const DebugLoc &DL = Inst.getDebugLoc();
7759     Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
7760                                                     ? &AMDGPU::VGPR_16RegClass
7761                                                     : &AMDGPU::VGPR_32RegClass);
7762     MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7763                                  .addImm(0) // src0_modifiers
7764                                  .add(Inst.getOperand(1))
7765                                  .addImm(0) // src1_modifiers
7766                                  .add(Inst.getOperand(2))
7767                                  .addImm(0)  // clamp
7768                                  .addImm(0)  // omod
7769                                  .addImm(0); // opsel0
7770     MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7771     legalizeOperandsVALUt16(*NewInstr, MRI);
7772     legalizeOperands(*NewInstr, MDT);
7773     addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7774     Inst.eraseFromParent();
7775     return;
7776   }
7777   case AMDGPU::V_S_EXP_F16_e64:
7778   case AMDGPU::V_S_LOG_F16_e64:
7779   case AMDGPU::V_S_RCP_F16_e64:
7780   case AMDGPU::V_S_RSQ_F16_e64:
7781   case AMDGPU::V_S_SQRT_F16_e64: {
7782     const DebugLoc &DL = Inst.getDebugLoc();
7783     Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
7784                                                     ? &AMDGPU::VGPR_16RegClass
7785                                                     : &AMDGPU::VGPR_32RegClass);
7786     auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7787                         .add(Inst.getOperand(1)) // src0_modifiers
7788                         .add(Inst.getOperand(2))
7789                         .add(Inst.getOperand(3)) // clamp
7790                         .add(Inst.getOperand(4)) // omod
7791                         .setMIFlags(Inst.getFlags());
7792     if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7793       NewInstr.addImm(0); // opsel0
7794     MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7795     legalizeOperandsVALUt16(*NewInstr, MRI);
7796     legalizeOperands(*NewInstr, MDT);
7797     addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7798     Inst.eraseFromParent();
7799     return;
7800   }
7801   }
7802 
7803   if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7804     // We cannot move this instruction to the VALU, so we should try to
7805     // legalize its operands instead.
7806     legalizeOperands(Inst, MDT);
7807     return;
7808   }
7809   // Handle converting generic instructions like COPY-to-SGPR into
7810   // COPY-to-VGPR.
7811   if (NewOpcode == Opcode) {
7812     Register DstReg = Inst.getOperand(0).getReg();
7813     const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7814 
7815     // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7816     // hope for the best.
7817     if (Inst.isCopy() && DstReg.isPhysical() &&
7818         RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7819       // TODO: Only works for 32 bit registers.
7820       if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
7821         BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7822                 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
7823             .add(Inst.getOperand(1));
7824       } else {
7825         Register NewDst =
7826             MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7827         BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7828                 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7829             .add(Inst.getOperand(1));
7830         BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7831                 DstReg)
7832             .addReg(NewDst);
7833       }
7834       Inst.eraseFromParent();
7835       return;
7836     }
7837 
7838     if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7839         NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7840       // Instead of creating a copy where src and dst are the same register
7841       // class, we just replace all uses of dst with src.  These kinds of
7842       // copies interfere with the heuristics MachineSink uses to decide
7843       // whether or not to split a critical edge, since the pass assumes
7844       // that copies will end up as machine instructions and not be
7845       // eliminated.
7846       addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7847       Register NewDstReg = Inst.getOperand(1).getReg();
7848       MRI.replaceRegWith(DstReg, NewDstReg);
7849       MRI.clearKillFlags(NewDstReg);
7850       Inst.getOperand(0).setReg(DstReg);
7851       // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7852       // these are deleted later, but at -O0 it would leave a suspicious
7853       // looking illegal copy of an undef register.
7854       for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7855         Inst.removeOperand(I);
7856       Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7857       // Legalize t16 operands since replaceRegWith is called after addUsersToVALU
7858       for (MachineOperand &MO :
7859            make_early_inc_range(MRI.use_operands(NewDstReg))) {
7860         legalizeOperandsVALUt16(*MO.getParent(), MRI);
7861       }
7862       return;
7863     }
7864 
7865     // If this is a v2s copy between a 16-bit and a 32-bit register, replace
7866     // the vgpr copy with a reg_sequence / extract_subreg.
7867     // This can be removed once we have sgpr16 in place.
7868     if (ST.useRealTrue16Insts() && Inst.isCopy() &&
7869         Inst.getOperand(1).getReg().isVirtual() &&
7870         RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7871       const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
7872       if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
7873         Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7874         Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7875         BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7876                 get(AMDGPU::IMPLICIT_DEF), Undef);
7877         BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7878                 get(AMDGPU::REG_SEQUENCE), NewDstReg)
7879             .addReg(Inst.getOperand(1).getReg())
7880             .addImm(AMDGPU::lo16)
7881             .addReg(Undef)
7882             .addImm(AMDGPU::hi16);
7883         Inst.eraseFromParent();
7884         MRI.replaceRegWith(DstReg, NewDstReg);
7885         addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7886         return;
7887       } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
7888                                              AMDGPU::lo16)) {
7889         Inst.getOperand(1).setSubReg(AMDGPU::lo16);
7890         Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7891         MRI.replaceRegWith(DstReg, NewDstReg);
7892         addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7893         return;
7894       }
7895     }
7896 
7897     Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7898     MRI.replaceRegWith(DstReg, NewDstReg);
7899     legalizeOperands(Inst, MDT);
7900     addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7901     return;
7902   }
7903 
7904   // Use the new VALU Opcode.
7905   auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7906                       .setMIFlags(Inst.getFlags());
7907   if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7908     // Intersperse VOP3 modifiers among the SALU operands.
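    // For a typical two-source op the resulting operand order is:
    //   dst, src0_modifiers, src0, src1_modifiers, src1, clamp, omod[, op_sel]
    // with any modifier/clamp/omod/op_sel operands the new opcode defines
    // filled in as 0.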
7909     NewInstr->addOperand(Inst.getOperand(0));
7910     if (AMDGPU::getNamedOperandIdx(NewOpcode,
7911                                    AMDGPU::OpName::src0_modifiers) >= 0)
7912       NewInstr.addImm(0);
7913     if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7914       MachineOperand Src = Inst.getOperand(1);
7915       NewInstr->addOperand(Src);
7916     }
7917 
7918     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7919       // We are converting these to a BFE, so we need to add the missing
7920       // operands for the size and offset.
7921       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7922       NewInstr.addImm(0);
7923       NewInstr.addImm(Size);
7924     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7925       // The VALU version adds the second operand to the result, so insert an
7926       // extra 0 operand.
7927       NewInstr.addImm(0);
7928     } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7929       const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7930       // If we need to move this to VGPRs, we need to unpack the second
7931       // operand back into the 2 separate ones for bit offset and width.
7932       assert(OffsetWidthOp.isImm() &&
7933              "Scalar BFE is only implemented for constant width and offset");
7934       uint32_t Imm = OffsetWidthOp.getImm();
7935 
7936       uint32_t Offset = Imm & 0x3f;               // Extract bits [5:0].
7937       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
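      // e.g. Imm = 0x00100008 unpacks to Offset = 8 and BitWidth = 16.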
7938       NewInstr.addImm(Offset);
7939       NewInstr.addImm(BitWidth);
7940     } else {
7941       if (AMDGPU::getNamedOperandIdx(NewOpcode,
7942                                      AMDGPU::OpName::src1_modifiers) >= 0)
7943         NewInstr.addImm(0);
7944       if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7945         NewInstr->addOperand(Inst.getOperand(2));
7946       if (AMDGPU::getNamedOperandIdx(NewOpcode,
7947                                      AMDGPU::OpName::src2_modifiers) >= 0)
7948         NewInstr.addImm(0);
7949       if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7950         NewInstr->addOperand(Inst.getOperand(3));
7951       if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7952         NewInstr.addImm(0);
7953       if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7954         NewInstr.addImm(0);
7955       if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7956         NewInstr.addImm(0);
7957     }
7958   } else {
7959     // Just copy the SALU operands.
7960     for (const MachineOperand &Op : Inst.explicit_operands())
7961       NewInstr->addOperand(Op);
7962   }
7963 
7964   // Remove any references to SCC. Vector instructions can't read from it, and
7965   // we're just about to add the implicit use/defs of VCC, and we don't want
7966   // both.
7967   for (MachineOperand &Op : Inst.implicit_operands()) {
7968     if (Op.getReg() == AMDGPU::SCC) {
7969       // Only propagate through live-def of SCC.
7970       if (Op.isDef() && !Op.isDead())
7971         addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7972       if (Op.isUse())
7973         addSCCDefsToVALUWorklist(NewInstr, Worklist);
7974     }
7975   }
7976   Inst.eraseFromParent();
7977   Register NewDstReg;
7978   if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7979     Register DstReg = NewInstr->getOperand(0).getReg();
7980     assert(DstReg.isVirtual());
7981     // Update the destination register class.
7982     const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7983     assert(NewDstRC);
7984     NewDstReg = MRI.createVirtualRegister(NewDstRC);
7985     MRI.replaceRegWith(DstReg, NewDstReg);
7986   }
7987   fixImplicitOperands(*NewInstr);
7988 
7989   legalizeOperandsVALUt16(*NewInstr, MRI);
7990 
7991   // Legalize the operands
7992   legalizeOperands(*NewInstr, MDT);
7993   if (NewDstReg)
7994     addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7995 }
7996 
7997 // Add/sub require special handling to deal with carry outs.
7998 std::pair<bool, MachineBasicBlock *>
7999 SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8000                               MachineDominatorTree *MDT) const {
8001   if (ST.hasAddNoCarry()) {
8002     // Assume there is no user of scc since we don't select this in that case.
8003     // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8004     // is used.
8005 
8006     MachineBasicBlock &MBB = *Inst.getParent();
8007     MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8008 
8009     Register OldDstReg = Inst.getOperand(0).getReg();
8010     Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8011 
8012     unsigned Opc = Inst.getOpcode();
8013     assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8014 
8015     unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8016       AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8017 
8018     assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8019     Inst.removeOperand(3);
8020 
8021     Inst.setDesc(get(NewOpc));
8022     Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8023     Inst.addImplicitDefUseOperands(*MBB.getParent());
8024     MRI.replaceRegWith(OldDstReg, ResultReg);
8025     MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8026 
8027     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8028     return std::pair(true, NewBB);
8029   }
8030 
8031   return std::pair(false, nullptr);
8032 }
8033 
8034 void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8035                               MachineDominatorTree *MDT) const {
8036 
8037   MachineBasicBlock &MBB = *Inst.getParent();
8038   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8039   MachineBasicBlock::iterator MII = Inst;
8040   DebugLoc DL = Inst.getDebugLoc();
8041 
8042   MachineOperand &Dest = Inst.getOperand(0);
8043   MachineOperand &Src0 = Inst.getOperand(1);
8044   MachineOperand &Src1 = Inst.getOperand(2);
8045   MachineOperand &Cond = Inst.getOperand(3);
8046 
8047   Register CondReg = Cond.getReg();
8048   bool IsSCC = (CondReg == AMDGPU::SCC);
8049 
8050   // If this is a trivial select where the condition is effectively not SCC
8051   // (CondReg is a source of copy to SCC), then the select is semantically
8052   // equivalent to copying CondReg. Hence, there is no need to create
8053   // a V_CNDMASK; we can just use CondReg and bail out.
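  // e.g. dst = S_CSELECT -1, 0 with such a condition register folds to
  // dst = COPY CondReg.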
8054   if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8055       (Src1.getImm() == 0)) {
8056     MRI.replaceRegWith(Dest.getReg(), CondReg);
8057     return;
8058   }
8059 
8060   Register NewCondReg = CondReg;
8061   if (IsSCC) {
8062     const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8063     NewCondReg = MRI.createVirtualRegister(TC);
8064 
8065     // Now look for the closest SCC def; if it is a copy, replace CondReg
8066     // with the COPY's source register.
8067     bool CopyFound = false;
8068     for (MachineInstr &CandI :
8069          make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8070                     Inst.getParent()->rend())) {
8071       if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8072           -1) {
8073         if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8074           BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8075               .addReg(CandI.getOperand(1).getReg());
8076           CopyFound = true;
8077         }
8078         break;
8079       }
8080     }
8081     if (!CopyFound) {
8082       // SCC def is not a copy
8083       // Insert a trivial select instead of creating a copy, because a copy from
8084       // SCC would semantically mean just copying a single bit, but we may need
8085       // the result to be a vector condition mask that needs preserving.
8086       unsigned Opcode =
8087           ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8088       auto NewSelect =
8089           BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8090       NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8091     }
8092   }
8093 
8094   Register NewDestReg = MRI.createVirtualRegister(
8095       RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8096   MachineInstr *NewInst;
8097   if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8098     NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8099                   .addImm(0)
8100                   .add(Src1) // False
8101                   .addImm(0)
8102                   .add(Src0) // True
8103                   .addReg(NewCondReg);
8104   } else {
8105     NewInst =
8106         BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8107             .add(Src1) // False
8108             .add(Src0) // True
8109             .addReg(NewCondReg);
8110   }
8111   MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8112   legalizeOperands(*NewInst, MDT);
8113   addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8114 }
8115 
8116 void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8117                                  MachineInstr &Inst) const {
8118   MachineBasicBlock &MBB = *Inst.getParent();
8119   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8120   MachineBasicBlock::iterator MII = Inst;
8121   DebugLoc DL = Inst.getDebugLoc();
8122 
8123   MachineOperand &Dest = Inst.getOperand(0);
8124   MachineOperand &Src = Inst.getOperand(1);
8125   Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8126   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8127 
8128   unsigned SubOp = ST.hasAddNoCarry() ?
8129     AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8130 
8131   BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
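  // Compute |x| as max(x, 0 - x) using VALU instructions.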
8132     .addImm(0)
8133     .addReg(Src.getReg());
8134 
8135   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8136     .addReg(Src.getReg())
8137     .addReg(TmpReg);
8138 
8139   MRI.replaceRegWith(Dest.getReg(), ResultReg);
8140   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8141 }
8142 
8143 void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8144                                   MachineInstr &Inst) const {
8145   MachineBasicBlock &MBB = *Inst.getParent();
8146   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8147   MachineBasicBlock::iterator MII = Inst;
8148   const DebugLoc &DL = Inst.getDebugLoc();
8149 
8150   MachineOperand &Dest = Inst.getOperand(0);
8151   MachineOperand &Src0 = Inst.getOperand(1);
8152   MachineOperand &Src1 = Inst.getOperand(2);
8153 
8154   if (ST.hasDLInsts()) {
8155     Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8156     legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8157     legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8158 
8159     BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8160       .add(Src0)
8161       .add(Src1);
8162 
8163     MRI.replaceRegWith(Dest.getReg(), NewDest);
8164     addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8165   } else {
8166     // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8167     // invert either source and then perform the XOR. If either source is a
8168     // scalar register, then we can leave the inversion on the scalar unit to
8169     // achieve a better distribution of scalar and vector instructions.
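    // e.g. with 4-bit values x = 0b1010, y = 0b0110: ~(x ^ y) = ~0b1100 = 0b0011,
    // which matches (~x) ^ y = 0b0101 ^ 0b0110 = 0b0011.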
8170     bool Src0IsSGPR = Src0.isReg() &&
8171                       RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8172     bool Src1IsSGPR = Src1.isReg() &&
8173                       RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8174     MachineInstr *Xor;
8175     Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8176     Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8177 
8178     // Build a pair of scalar instructions and add them to the work list.
8179     // The next iteration over the work list will lower these to the vector
8180     // unit as necessary.
8181     if (Src0IsSGPR) {
8182       BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8183       Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8184       .addReg(Temp)
8185       .add(Src1);
8186     } else if (Src1IsSGPR) {
8187       BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8188       Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8189       .add(Src0)
8190       .addReg(Temp);
8191     } else {
8192       Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8193         .add(Src0)
8194         .add(Src1);
8195       MachineInstr *Not =
8196           BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8197       Worklist.insert(Not);
8198     }
8199 
8200     MRI.replaceRegWith(Dest.getReg(), NewDest);
8201 
8202     Worklist.insert(Xor);
8203 
8204     addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8205   }
8206 }
8207 
8208 void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8209                                       MachineInstr &Inst,
8210                                       unsigned Opcode) const {
8211   MachineBasicBlock &MBB = *Inst.getParent();
8212   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8213   MachineBasicBlock::iterator MII = Inst;
8214   const DebugLoc &DL = Inst.getDebugLoc();
8215 
8216   MachineOperand &Dest = Inst.getOperand(0);
8217   MachineOperand &Src0 = Inst.getOperand(1);
8218   MachineOperand &Src1 = Inst.getOperand(2);
8219 
8220   Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8221   Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8222 
8223   MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8224     .add(Src0)
8225     .add(Src1);
8226 
8227   MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8228     .addReg(Interm);
8229 
8230   Worklist.insert(&Op);
8231   Worklist.insert(&Not);
8232 
8233   MRI.replaceRegWith(Dest.getReg(), NewDest);
8234   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8235 }
8236 
8237 void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8238                                      MachineInstr &Inst,
8239                                      unsigned Opcode) const {
8240   MachineBasicBlock &MBB = *Inst.getParent();
8241   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8242   MachineBasicBlock::iterator MII = Inst;
8243   const DebugLoc &DL = Inst.getDebugLoc();
8244 
8245   MachineOperand &Dest = Inst.getOperand(0);
8246   MachineOperand &Src0 = Inst.getOperand(1);
8247   MachineOperand &Src1 = Inst.getOperand(2);
8248 
8249   Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8250   Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8251 
8252   MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8253     .add(Src1);
8254 
8255   MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8256     .add(Src0)
8257     .addReg(Interm);
8258 
8259   Worklist.insert(&Not);
8260   Worklist.insert(&Op);
8261 
8262   MRI.replaceRegWith(Dest.getReg(), NewDest);
8263   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8264 }
8265 
8266 void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8267                                           MachineInstr &Inst, unsigned Opcode,
8268                                           bool Swap) const {
8269   MachineBasicBlock &MBB = *Inst.getParent();
8270   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8271 
8272   MachineOperand &Dest = Inst.getOperand(0);
8273   MachineOperand &Src0 = Inst.getOperand(1);
8274   DebugLoc DL = Inst.getDebugLoc();
8275 
8276   MachineBasicBlock::iterator MII = Inst;
8277 
8278   const MCInstrDesc &InstDesc = get(Opcode);
8279   const TargetRegisterClass *Src0RC = Src0.isReg() ?
8280     MRI.getRegClass(Src0.getReg()) :
8281     &AMDGPU::SGPR_32RegClass;
8282 
8283   const TargetRegisterClass *Src0SubRC =
8284       RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8285 
8286   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8287                                                        AMDGPU::sub0, Src0SubRC);
8288 
8289   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8290   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8291   const TargetRegisterClass *NewDestSubRC =
8292       RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8293 
8294   Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8295   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8296 
8297   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8298                                                        AMDGPU::sub1, Src0SubRC);
8299 
8300   Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8301   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8302 
8303   if (Swap)
8304     std::swap(DestSub0, DestSub1);
8305 
8306   Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8307   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8308     .addReg(DestSub0)
8309     .addImm(AMDGPU::sub0)
8310     .addReg(DestSub1)
8311     .addImm(AMDGPU::sub1);
8312 
8313   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8314 
8315   Worklist.insert(&LoHalf);
8316   Worklist.insert(&HiHalf);
8317 
8318   // We don't need to legalizeOperands here because for a single operand, src0
8319   // will support any kind of input.
8320 
8321   // Move all users of this moved value.
8322   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8323 }
8324 
8325 // There is no vector equivalent of s_mul_u64. For this reason, we need to
8326 // split the s_mul_u64 into 32-bit vector multiplications.
8327 void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8328                                      MachineInstr &Inst,
8329                                      MachineDominatorTree *MDT) const {
8330   MachineBasicBlock &MBB = *Inst.getParent();
8331   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8332 
8333   Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8334   Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8335   Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8336 
8337   MachineOperand &Dest = Inst.getOperand(0);
8338   MachineOperand &Src0 = Inst.getOperand(1);
8339   MachineOperand &Src1 = Inst.getOperand(2);
8340   const DebugLoc &DL = Inst.getDebugLoc();
8341   MachineBasicBlock::iterator MII = Inst;
8342 
8343   const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8344   const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8345   const TargetRegisterClass *Src0SubRC =
8346       RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8347   if (RI.isSGPRClass(Src0SubRC))
8348     Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8349   const TargetRegisterClass *Src1SubRC =
8350       RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8351   if (RI.isSGPRClass(Src1SubRC))
8352     Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8353 
8354   // First, we extract the low 32-bit and high 32-bit values from each of the
8355   // operands.
8356   MachineOperand Op0L =
8357       buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8358   MachineOperand Op1L =
8359       buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8360   MachineOperand Op0H =
8361       buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8362   MachineOperand Op1H =
8363       buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8364 
8365   // The multiplication is done as follows:
8366   //
8367   //                            Op1H  Op1L
8368   //                          * Op0H  Op0L
8369   //                       --------------------
8370   //                       Op1H*Op0L  Op1L*Op0L
8371   //          + Op1H*Op0H  Op1L*Op0H
8372   // -----------------------------------------
8373   // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
8374   //
8375   //  We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8376   //  value and that would overflow.
8377   //  The low 32-bit value is Op1L*Op0L.
8378   //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
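  //  In C terms, the VALU sequence built below computes (sketch):
  //    Lo32 = (uint32_t)(Op1L * Op0L);
  //    Hi32 = Op1H * Op0L + Op1L * Op0H + mulhi_u32(Op1L, Op0L);
  //  where mulhi_u32 denotes the upper 32 bits of the 64-bit product.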
8379 
8380   Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8381   MachineInstr *Op1L_Op0H =
8382       BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8383           .add(Op1L)
8384           .add(Op0H);
8385 
8386   Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8387   MachineInstr *Op1H_Op0L =
8388       BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8389           .add(Op1H)
8390           .add(Op0L);
8391 
8392   Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8393   MachineInstr *Carry =
8394       BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8395           .add(Op1L)
8396           .add(Op0L);
8397 
8398   MachineInstr *LoHalf =
8399       BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8400           .add(Op1L)
8401           .add(Op0L);
8402 
8403   Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8404   MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8405                           .addReg(Op1L_Op0H_Reg)
8406                           .addReg(Op1H_Op0L_Reg);
8407 
8408   MachineInstr *HiHalf =
8409       BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8410           .addReg(AddReg)
8411           .addReg(CarryReg);
8412 
8413   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8414       .addReg(DestSub0)
8415       .addImm(AMDGPU::sub0)
8416       .addReg(DestSub1)
8417       .addImm(AMDGPU::sub1);
8418 
8419   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8420 
8421   // Try to legalize the operands in case we need to swap the order to keep it
8422   // valid.
8423   legalizeOperands(*Op1L_Op0H, MDT);
8424   legalizeOperands(*Op1H_Op0L, MDT);
8425   legalizeOperands(*Carry, MDT);
8426   legalizeOperands(*LoHalf, MDT);
8427   legalizeOperands(*Add, MDT);
8428   legalizeOperands(*HiHalf, MDT);
8429 
8430   // Move all users of this moved value.
8431   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8432 }
8433 
8434 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8435 // multiplications.
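// Only the low halves of the operands matter here: the 64-bit product of the
// extended 32-bit values is {mul_hi(lo0, lo1), mul_lo(lo0, lo1)}, using the
// unsigned or signed high-half multiply for the U32/I32 pseudo respectively.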
8436 void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8437                                         MachineInstr &Inst,
8438                                         MachineDominatorTree *MDT) const {
8439   MachineBasicBlock &MBB = *Inst.getParent();
8440   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8441 
8442   Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8443   Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8444   Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8445 
8446   MachineOperand &Dest = Inst.getOperand(0);
8447   MachineOperand &Src0 = Inst.getOperand(1);
8448   MachineOperand &Src1 = Inst.getOperand(2);
8449   const DebugLoc &DL = Inst.getDebugLoc();
8450   MachineBasicBlock::iterator MII = Inst;
8451 
8452   const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8453   const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8454   const TargetRegisterClass *Src0SubRC =
8455       RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8456   if (RI.isSGPRClass(Src0SubRC))
8457     Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8458   const TargetRegisterClass *Src1SubRC =
8459       RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8460   if (RI.isSGPRClass(Src1SubRC))
8461     Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8462 
8463   // First, we extract the low 32-bit and high 32-bit values from each of the
8464   // operands.
8465   MachineOperand Op0L =
8466       buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8467   MachineOperand Op1L =
8468       buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8469 
8470   unsigned Opc = Inst.getOpcode();
8471   unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8472                         ? AMDGPU::V_MUL_HI_U32_e64
8473                         : AMDGPU::V_MUL_HI_I32_e64;
8474   MachineInstr *HiHalf =
8475       BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8476 
8477   MachineInstr *LoHalf =
8478       BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8479           .add(Op1L)
8480           .add(Op0L);
8481 
8482   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8483       .addReg(DestSub0)
8484       .addImm(AMDGPU::sub0)
8485       .addReg(DestSub1)
8486       .addImm(AMDGPU::sub1);
8487 
8488   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8489 
8490   // Try to legalize the operands in case we need to swap the order to keep it
8491   // valid.
8492   legalizeOperands(*HiHalf, MDT);
8493   legalizeOperands(*LoHalf, MDT);
8494 
8495   // Move all users of this moved value.
8496   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8497 }
8498 
8499 void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8500                                            MachineInstr &Inst, unsigned Opcode,
8501                                            MachineDominatorTree *MDT) const {
8502   MachineBasicBlock &MBB = *Inst.getParent();
8503   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8504 
8505   MachineOperand &Dest = Inst.getOperand(0);
8506   MachineOperand &Src0 = Inst.getOperand(1);
8507   MachineOperand &Src1 = Inst.getOperand(2);
8508   DebugLoc DL = Inst.getDebugLoc();
8509 
8510   MachineBasicBlock::iterator MII = Inst;
8511 
8512   const MCInstrDesc &InstDesc = get(Opcode);
8513   const TargetRegisterClass *Src0RC = Src0.isReg() ?
8514     MRI.getRegClass(Src0.getReg()) :
8515     &AMDGPU::SGPR_32RegClass;
8516 
8517   const TargetRegisterClass *Src0SubRC =
8518       RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8519   const TargetRegisterClass *Src1RC = Src1.isReg() ?
8520     MRI.getRegClass(Src1.getReg()) :
8521     &AMDGPU::SGPR_32RegClass;
8522 
8523   const TargetRegisterClass *Src1SubRC =
8524       RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8525 
8526   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8527                                                        AMDGPU::sub0, Src0SubRC);
8528   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8529                                                        AMDGPU::sub0, Src1SubRC);
8530   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8531                                                        AMDGPU::sub1, Src0SubRC);
8532   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8533                                                        AMDGPU::sub1, Src1SubRC);
8534 
8535   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8536   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8537   const TargetRegisterClass *NewDestSubRC =
8538       RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8539 
8540   Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8541   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8542                               .add(SrcReg0Sub0)
8543                               .add(SrcReg1Sub0);
8544 
8545   Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8546   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8547                               .add(SrcReg0Sub1)
8548                               .add(SrcReg1Sub1);
8549 
8550   Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8551   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8552     .addReg(DestSub0)
8553     .addImm(AMDGPU::sub0)
8554     .addReg(DestSub1)
8555     .addImm(AMDGPU::sub1);
8556 
8557   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8558 
8559   Worklist.insert(&LoHalf);
8560   Worklist.insert(&HiHalf);
8561 
8562   // Move all users of this moved value.
8563   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8564 }
8565 
8566 void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8567                                        MachineInstr &Inst,
8568                                        MachineDominatorTree *MDT) const {
8569   MachineBasicBlock &MBB = *Inst.getParent();
8570   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8571 
8572   MachineOperand &Dest = Inst.getOperand(0);
8573   MachineOperand &Src0 = Inst.getOperand(1);
8574   MachineOperand &Src1 = Inst.getOperand(2);
8575   const DebugLoc &DL = Inst.getDebugLoc();
8576 
8577   MachineBasicBlock::iterator MII = Inst;
8578 
8579   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8580 
8581   Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8582 
8583   MachineOperand* Op0;
8584   MachineOperand* Op1;
8585 
8586   if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8587     Op0 = &Src0;
8588     Op1 = &Src1;
8589   } else {
8590     Op0 = &Src1;
8591     Op1 = &Src0;
8592   }
8593 
8594   BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8595     .add(*Op0);
8596 
8597   Register NewDest = MRI.createVirtualRegister(DestRC);
8598 
8599   MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8600     .addReg(Interm)
8601     .add(*Op1);
8602 
8603   MRI.replaceRegWith(Dest.getReg(), NewDest);
8604 
8605   Worklist.insert(&Xor);
8606 }
8607 
8608 void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8609                                        MachineInstr &Inst) const {
8610   MachineBasicBlock &MBB = *Inst.getParent();
8611   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8612 
8613   MachineBasicBlock::iterator MII = Inst;
8614   const DebugLoc &DL = Inst.getDebugLoc();
8615 
8616   MachineOperand &Dest = Inst.getOperand(0);
8617   MachineOperand &Src = Inst.getOperand(1);
8618 
8619   const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8620   const TargetRegisterClass *SrcRC = Src.isReg() ?
8621     MRI.getRegClass(Src.getReg()) :
8622     &AMDGPU::SGPR_32RegClass;
8623 
8624   Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8625   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8626 
8627   const TargetRegisterClass *SrcSubRC =
8628       RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8629 
8630   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8631                                                       AMDGPU::sub0, SrcSubRC);
8632   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8633                                                       AMDGPU::sub1, SrcSubRC);
8634 
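  // V_BCNT_U32_B32 adds its second operand to the popcount of its first, so
  // chaining the two halves accumulates the full 64-bit population count.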
8635   BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8636 
8637   BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8638 
8639   MRI.replaceRegWith(Dest.getReg(), ResultReg);
8640 
8641   // We don't need to legalize operands here. src0 for either instruction can be
8642   // an SGPR, and the second input is unused or determined here.
8643   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8644 }
8645 
8646 void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8647                                       MachineInstr &Inst) const {
8648   MachineBasicBlock &MBB = *Inst.getParent();
8649   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8650   MachineBasicBlock::iterator MII = Inst;
8651   const DebugLoc &DL = Inst.getDebugLoc();
8652 
8653   MachineOperand &Dest = Inst.getOperand(0);
8654   uint32_t Imm = Inst.getOperand(2).getImm();
8655   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8656   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8657 
8658   (void) Offset;
8659 
8660   // Only sext_inreg cases handled.
8661   assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8662          Offset == 0 && "Not implemented");
8663 
8664   if (BitWidth < 32) {
8665     Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8666     Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8667     Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8668 
8669     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8670         .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8671         .addImm(0)
8672         .addImm(BitWidth);
8673 
8674     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8675       .addImm(31)
8676       .addReg(MidRegLo);
8677 
8678     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8679       .addReg(MidRegLo)
8680       .addImm(AMDGPU::sub0)
8681       .addReg(MidRegHi)
8682       .addImm(AMDGPU::sub1);
8683 
8684     MRI.replaceRegWith(Dest.getReg(), ResultReg);
8685     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8686     return;
8687   }
8688 
8689   MachineOperand &Src = Inst.getOperand(1);
8690   Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8691   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8692 
8693   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8694     .addImm(31)
8695     .addReg(Src.getReg(), 0, AMDGPU::sub0);
8696 
8697   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8698     .addReg(Src.getReg(), 0, AMDGPU::sub0)
8699     .addImm(AMDGPU::sub0)
8700     .addReg(TmpReg)
8701     .addImm(AMDGPU::sub1);
8702 
8703   MRI.replaceRegWith(Dest.getReg(), ResultReg);
8704   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8705 }
8706 
8707 void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8708                                           MachineInstr &Inst, unsigned Opcode,
8709                                           MachineDominatorTree *MDT) const {
8710   //  (S_FLBIT_I32_B64 hi:lo) ->
8711   //    (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8712   //  (S_FF1_I32_B64 hi:lo) ->
8713   //    (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
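  //  e.g. for hi:lo = 0x00000000:00000001, V_FFBH_U32(hi) is 0xffffffff (no bit
  //  set), uaddsat(V_FFBH_U32(lo) = 31, 32) = 63, and the umin yields 63.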
8714 
8715   MachineBasicBlock &MBB = *Inst.getParent();
8716   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8717   MachineBasicBlock::iterator MII = Inst;
8718   const DebugLoc &DL = Inst.getDebugLoc();
8719 
8720   MachineOperand &Dest = Inst.getOperand(0);
8721   MachineOperand &Src = Inst.getOperand(1);
8722 
8723   const MCInstrDesc &InstDesc = get(Opcode);
8724 
8725   bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8726   unsigned OpcodeAdd =
8727       ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8728 
8729   const TargetRegisterClass *SrcRC =
8730       Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8731   const TargetRegisterClass *SrcSubRC =
8732       RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8733 
8734   MachineOperand SrcRegSub0 =
8735       buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8736   MachineOperand SrcRegSub1 =
8737       buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8738 
8739   Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8740   Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8741   Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8742   Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8743 
8744   BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8745 
8746   BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8747 
8748   BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8749       .addReg(IsCtlz ? MidReg1 : MidReg2)
8750       .addImm(32)
8751       .addImm(1); // enable clamp
8752 
8753   BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8754       .addReg(MidReg3)
8755       .addReg(IsCtlz ? MidReg2 : MidReg1);
8756 
8757   MRI.replaceRegWith(Dest.getReg(), MidReg4);
8758 
8759   addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8760 }
8761 
8762 void SIInstrInfo::addUsersToMoveToVALUWorklist(
8763     Register DstReg, MachineRegisterInfo &MRI,
8764     SIInstrWorklist &Worklist) const {
8765   for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
8766     MachineInstr &UseMI = *MO.getParent();
8767 
8768     unsigned OpNo = 0;
8769 
8770     switch (UseMI.getOpcode()) {
8771     case AMDGPU::COPY:
8772     case AMDGPU::WQM:
8773     case AMDGPU::SOFT_WQM:
8774     case AMDGPU::STRICT_WWM:
8775     case AMDGPU::STRICT_WQM:
8776     case AMDGPU::REG_SEQUENCE:
8777     case AMDGPU::PHI:
8778     case AMDGPU::INSERT_SUBREG:
8779       break;
8780     default:
8781       OpNo = MO.getOperandNo();
8782       break;
8783     }
8784 
8785     if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo)))
8786       Worklist.insert(&UseMI);
8787     else
8788       // Legalization could change user list.
8789       legalizeOperandsVALUt16(UseMI, OpNo, MRI);
8790   }
8791 }
8792 
8793 void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8794                                  MachineRegisterInfo &MRI,
8795                                  MachineInstr &Inst) const {
8796   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8797   MachineBasicBlock *MBB = Inst.getParent();
8798   MachineOperand &Src0 = Inst.getOperand(1);
8799   MachineOperand &Src1 = Inst.getOperand(2);
8800   const DebugLoc &DL = Inst.getDebugLoc();
8801 
8802   switch (Inst.getOpcode()) {
8803   case AMDGPU::S_PACK_LL_B32_B16: {
8804     Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8805     Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8806 
8807     // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8808     // 0.
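    // The packed result is (Src1 << 16) | (Src0 & 0xffff).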
8809     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8810       .addImm(0xffff);
8811 
8812     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8813       .addReg(ImmReg, RegState::Kill)
8814       .add(Src0);
8815 
8816     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8817       .add(Src1)
8818       .addImm(16)
8819       .addReg(TmpReg, RegState::Kill);
8820     break;
8821   }
8822   case AMDGPU::S_PACK_LH_B32_B16: {
8823     Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8824     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8825       .addImm(0xffff);
8826     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8827       .addReg(ImmReg, RegState::Kill)
8828       .add(Src0)
8829       .add(Src1);
8830     break;
8831   }
8832   case AMDGPU::S_PACK_HL_B32_B16: {
8833     Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8834     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8835         .addImm(16)
8836         .add(Src0);
8837     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8838         .add(Src1)
8839         .addImm(16)
8840         .addReg(TmpReg, RegState::Kill);
8841     break;
8842   }
8843   case AMDGPU::S_PACK_HH_B32_B16: {
8844     Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8845     Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8846     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8847       .addImm(16)
8848       .add(Src0);
8849     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8850       .addImm(0xffff0000);
8851     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8852       .add(Src1)
8853       .addReg(ImmReg, RegState::Kill)
8854       .addReg(TmpReg, RegState::Kill);
8855     break;
8856   }
8857   default:
8858     llvm_unreachable("unhandled s_pack_* instruction");
8859   }
8860 
8861   MachineOperand &Dest = Inst.getOperand(0);
8862   MRI.replaceRegWith(Dest.getReg(), ResultReg);
8863   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8864 }
8865 
8866 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8867                                                MachineInstr &SCCDefInst,
8868                                                SIInstrWorklist &Worklist,
8869                                                Register NewCond) const {
8870 
8871   // Ensure that def inst defines SCC, which is still live.
8872   assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8873          !Op.isDead() && Op.getParent() == &SCCDefInst);
8874   SmallVector<MachineInstr *, 4> CopyToDelete;
8875   // This assumes that all the users of SCC are in the same block
8876   // as the SCC def.
8877   for (MachineInstr &MI : // Skip the def inst itself.
8878        make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8879                   SCCDefInst.getParent()->end())) {
8880     // Check if SCC is used first.
8881     int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8882     if (SCCIdx != -1) {
8883       if (MI.isCopy()) {
8884         MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8885         Register DestReg = MI.getOperand(0).getReg();
8886 
8887         MRI.replaceRegWith(DestReg, NewCond);
8888         CopyToDelete.push_back(&MI);
8889       } else {
8890 
8891         if (NewCond.isValid())
8892           MI.getOperand(SCCIdx).setReg(NewCond);
8893 
8894         Worklist.insert(&MI);
8895       }
8896     }
8897     // Exit if we find another SCC def.
8898     if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8899       break;
8900   }
8901   for (auto &Copy : CopyToDelete)
8902     Copy->eraseFromParent();
8903 }
8904 
8905 // Instructions that use SCC may be converted to VALU instructions. When that
8906 // happens, the SCC register is changed to VCC_LO. The instruction that defines
8907 // SCC must be changed to an instruction that defines VCC. This function makes
8908 // sure that the instruction that defines SCC is added to the moveToVALU
8909 // worklist.
8910 void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8911                                            SIInstrWorklist &Worklist) const {
8912   // Look for a preceding instruction that either defines VCC or SCC. If VCC
8913   // then there is nothing to do because the defining instruction has been
8914   // converted to a VALU already. If SCC then that instruction needs to be
8915   // converted to a VALU.
8916   for (MachineInstr &MI :
8917        make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8918                   SCCUseInst->getParent()->rend())) {
8919     if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8920       break;
8921     if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8922       Worklist.insert(&MI);
8923       break;
8924     }
8925   }
8926 }
8927 
8928 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8929   const MachineInstr &Inst) const {
8930   const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8931 
8932   switch (Inst.getOpcode()) {
8933   // For target instructions, getOpRegClass just returns the virtual register
8934   // class associated with the operand, so we need to find an equivalent VGPR
8935   // register class in order to move the instruction to the VALU.
8936   case AMDGPU::COPY:
8937   case AMDGPU::PHI:
8938   case AMDGPU::REG_SEQUENCE:
8939   case AMDGPU::INSERT_SUBREG:
8940   case AMDGPU::WQM:
8941   case AMDGPU::SOFT_WQM:
8942   case AMDGPU::STRICT_WWM:
8943   case AMDGPU::STRICT_WQM: {
8944     const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8945     if (RI.isAGPRClass(SrcRC)) {
8946       if (RI.isAGPRClass(NewDstRC))
8947         return nullptr;
8948 
8949       switch (Inst.getOpcode()) {
8950       case AMDGPU::PHI:
8951       case AMDGPU::REG_SEQUENCE:
8952       case AMDGPU::INSERT_SUBREG:
8953         NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8954         break;
8955       default:
8956         NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8957       }
8958 
8959       if (!NewDstRC)
8960         return nullptr;
8961     } else {
8962       if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8963         return nullptr;
8964 
8965       NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8966       if (!NewDstRC)
8967         return nullptr;
8968     }
8969 
8970     return NewDstRC;
8971   }
8972   default:
8973     return NewDstRC;
8974   }
8975 }
8976 
8977 // Find the one SGPR operand we are allowed to use.
8978 Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8979                                    int OpIndices[3]) const {
8980   const MCInstrDesc &Desc = MI.getDesc();
8981 
8982   // Find the one SGPR operand we are allowed to use.
8983   //
8984   // First we need to consider the instruction's operand requirements before
8985   // legalizing. Some operands are required to be SGPRs, such as implicit uses
8986   // of VCC, but we are still bound by the constant bus requirement to only use
8987   // one.
8988   //
8989   // If the operand's class is an SGPR, we can never move it.
8990 
8991   Register SGPRReg = findImplicitSGPRRead(MI);
8992   if (SGPRReg)
8993     return SGPRReg;
8994 
8995   Register UsedSGPRs[3] = {Register()};
8996   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8997 
8998   for (unsigned i = 0; i < 3; ++i) {
8999     int Idx = OpIndices[i];
9000     if (Idx == -1)
9001       break;
9002 
9003     const MachineOperand &MO = MI.getOperand(Idx);
9004     if (!MO.isReg())
9005       continue;
9006 
9007     // Is this operand statically required to be an SGPR based on the operand
9008     // constraints?
9009     const TargetRegisterClass *OpRC =
9010         RI.getRegClass(Desc.operands()[Idx].RegClass);
9011     bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9012     if (IsRequiredSGPR)
9013       return MO.getReg();
9014 
9015     // If this could be a VGPR or an SGPR, check the dynamic register class.
9016     Register Reg = MO.getReg();
9017     const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9018     if (RI.isSGPRClass(RegRC))
9019       UsedSGPRs[i] = Reg;
9020   }
9021 
9022   // We don't have a required SGPR operand, so we have a bit more freedom in
9023   // selecting operands to move.
9024 
9025   // Try to select the most used SGPR. If an SGPR is equal to one of the
9026   // others, we choose that.
9027   //
9028   // e.g.
9029   // V_FMA_F32 v0, s0, s0, s0 -> No moves
9030   // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9031 
9032   // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9033   // prefer those.
9034 
9035   if (UsedSGPRs[0]) {
9036     if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9037       SGPRReg = UsedSGPRs[0];
9038   }
9039 
9040   if (!SGPRReg && UsedSGPRs[1]) {
9041     if (UsedSGPRs[1] == UsedSGPRs[2])
9042       SGPRReg = UsedSGPRs[1];
9043   }
9044 
9045   return SGPRReg;
9046 }
9047 
9048 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9049                                              AMDGPU::OpName OperandName) const {
9050   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9051   if (Idx == -1)
9052     return nullptr;
9053 
9054   return &MI.getOperand(Idx);
9055 }
9056 
9057 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9058   if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9059     int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9060                          ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9061                          : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9062     return (Format << 44) |
9063            (1ULL << 56) | // RESOURCE_LEVEL = 1
9064            (3ULL << 60); // OOB_SELECT = 3
9065   }
9066 
9067   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9068   if (ST.isAmdHsaOS()) {
9069     // Set ATC = 1. GFX9 doesn't have this bit.
9070     if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9071       RsrcDataFormat |= (1ULL << 56);
9072 
9073     // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9074     // BTW, it disables TC L2 and therefore decreases performance.
9075     if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9076       RsrcDataFormat |= (2ULL << 59);
9077   }
9078 
9079   return RsrcDataFormat;
9080 }
9081 
9082 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9083   uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9084                     AMDGPU::RSRC_TID_ENABLE |
9085                     0xffffffff; // Size;
9086 
9087   // GFX9 doesn't have ELEMENT_SIZE.
9088   if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9089     uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9090     Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9091   }
9092 
9093   // IndexStride: 3 selects a stride of 64, 2 a stride of 32 (the wave size).
9094   uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9095   Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9096 
9097   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9098   // Clear them unless we want a huge stride.
9099   if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9100       ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9101     Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9102 
9103   return Rsrc23;
9104 }
9105 
9106 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9107   unsigned Opc = MI.getOpcode();
9108 
9109   return isSMRD(Opc);
9110 }
9111 
9112 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9113   return get(Opc).mayLoad() &&
9114          (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9115 }
9116 
9117 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
9118                                     int &FrameIndex) const {
9119   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9120   if (!Addr || !Addr->isFI())
9121     return Register();
9122 
9123   assert(!MI.memoperands_empty() &&
9124          (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9125 
9126   FrameIndex = Addr->getIndex();
9127   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9128 }
9129 
9130 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9131                                         int &FrameIndex) const {
9132   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9133   assert(Addr && Addr->isFI());
9134   FrameIndex = Addr->getIndex();
9135   return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9136 }
9137 
9138 Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9139                                           int &FrameIndex) const {
9140   if (!MI.mayLoad())
9141     return Register();
9142 
9143   if (isMUBUF(MI) || isVGPRSpill(MI))
9144     return isStackAccess(MI, FrameIndex);
9145 
9146   if (isSGPRSpill(MI))
9147     return isSGPRStackAccess(MI, FrameIndex);
9148 
9149   return Register();
9150 }
9151 
9152 Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9153                                          int &FrameIndex) const {
9154   if (!MI.mayStore())
9155     return Register();
9156 
9157   if (isMUBUF(MI) || isVGPRSpill(MI))
9158     return isStackAccess(MI, FrameIndex);
9159 
9160   if (isSGPRSpill(MI))
9161     return isSGPRStackAccess(MI, FrameIndex);
9162 
9163   return Register();
9164 }
9165 
9166 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9167   unsigned Size = 0;
9168   MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9169   MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9170   while (++I != E && I->isInsideBundle()) {
9171     assert(!I->isBundle() && "No nested bundle!");
9172     Size += getInstSizeInBytes(*I);
9173   }
9174 
9175   return Size;
9176 }
9177 
9178 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9179   unsigned Opc = MI.getOpcode();
9180   const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9181   unsigned DescSize = Desc.getSize();
9182 
9183   // If we have a definitive size, we can use it. Otherwise we need to inspect
9184   // the operands to know the size.
9185   if (isFixedSize(MI)) {
9186     unsigned Size = DescSize;
9187 
9188     // If we hit the buggy offset, an extra nop will be inserted in MC, so
9189     // estimate the worst case.
9190     if (MI.isBranch() && ST.hasOffset3fBug())
9191       Size += 4;
9192 
9193     return Size;
9194   }
9195 
9196   // Instructions may have a 32-bit literal encoded after them. Check
9197   // operands that could ever be literals.
9198   if (isVALU(MI) || isSALU(MI)) {
9199     if (isDPP(MI))
9200       return DescSize;
9201     bool HasLiteral = false;
9202     unsigned LiteralSize = 4;
9203     for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9204       const MachineOperand &Op = MI.getOperand(I);
9205       const MCOperandInfo &OpInfo = Desc.operands()[I];
9206       if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9207         HasLiteral = true;
9208         if (ST.has64BitLiterals()) {
9209           switch (OpInfo.OperandType) {
9210           default:
9211             break;
9212           case AMDGPU::OPERAND_REG_IMM_FP64:
9213             if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9214               LiteralSize = 8;
9215             break;
9216           case AMDGPU::OPERAND_REG_IMM_INT64:
9217             if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9218               LiteralSize = 8;
9219             break;
9220           }
9221         }
9222         break;
9223       }
9224     }
9225     return HasLiteral ? DescSize + LiteralSize : DescSize;
9226   }
9227 
9228   // Check whether we have extra NSA words.
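  // e.g. with RSrcIdx - VAddr0Idx = 5 address operands this evaluates to
  // 8 + 4 * ((5 + 2) / 4) = 12 bytes, i.e. one extra NSA dword.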
9229   if (isMIMG(MI)) {
9230     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9231     if (VAddr0Idx < 0)
9232       return 8;
9233 
9234     int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9235     return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9236   }
9237 
9238   switch (Opc) {
9239   case TargetOpcode::BUNDLE:
9240     return getInstBundleSize(MI);
9241   case TargetOpcode::INLINEASM:
9242   case TargetOpcode::INLINEASM_BR: {
9243     const MachineFunction *MF = MI.getParent()->getParent();
9244     const char *AsmStr = MI.getOperand(0).getSymbolName();
9245     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9246   }
9247   default:
9248     if (MI.isMetaInstruction())
9249       return 0;
9250     return DescSize;
9251   }
9252 }
9253 
9254 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9255   if (!isFLAT(MI))
9256     return false;
9257 
9258   if (MI.memoperands_empty())
9259     return true;
9260 
9261   for (const MachineMemOperand *MMO : MI.memoperands()) {
9262     if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9263       return true;
9264   }
9265   return false;
9266 }
9267 
9268 ArrayRef<std::pair<int, const char *>>
9269 SIInstrInfo::getSerializableTargetIndices() const {
9270   static const std::pair<int, const char *> TargetIndices[] = {
9271       {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9272       {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9273       {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9274       {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9275       {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9276   return ArrayRef(TargetIndices);
9277 }
9278 
9279 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp).  The
9280 /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9281 ScheduleHazardRecognizer *
9282 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9283                                             const ScheduleDAG *DAG) const {
9284   return new GCNHazardRecognizer(DAG->MF);
9285 }
9286 
9287 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9288 /// pass.
9289 ScheduleHazardRecognizer *
9290 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
9291   return new GCNHazardRecognizer(MF);
9292 }
9293 
9294 // Called during:
9295 // - pre-RA scheduling and post-RA scheduling
9296 ScheduleHazardRecognizer *
9297 SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9298                                             const ScheduleDAGMI *DAG) const {
9299   // Borrowed from the ARM target.
9300   // We would like to restrict this hazard recognizer to only
9301   // post-RA scheduling; we can tell that we're post-RA because we don't
9302   // track VRegLiveness.
9303   if (!DAG->hasVRegLiveness())
9304     return new GCNHazardRecognizer(DAG->MF);
9305   return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9306 }
9307 
9308 std::pair<unsigned, unsigned>
9309 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9310   return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9311 }
9312 
9313 ArrayRef<std::pair<unsigned, const char *>>
9314 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9315   static const std::pair<unsigned, const char *> TargetFlags[] = {
9316     { MO_GOTPCREL, "amdgpu-gotprel" },
9317     { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
9318     { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
9319     { MO_REL32_LO, "amdgpu-rel32-lo" },
9320     { MO_REL32_HI, "amdgpu-rel32-hi" },
9321     { MO_ABS32_LO, "amdgpu-abs32-lo" },
9322     { MO_ABS32_HI, "amdgpu-abs32-hi" },
9323   };
9324 
9325   return ArrayRef(TargetFlags);
9326 }
9327 
9328 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9329 SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9330   static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9331       {
9332           {MONoClobber, "amdgpu-noclobber"},
9333           {MOLastUse, "amdgpu-last-use"},
9334       };
9335 
9336   return ArrayRef(TargetFlags);
9337 }
9338 
9339 unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9340                                               const MachineFunction &MF) const {
9341   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9342   assert(SrcReg.isVirtual());
9343   if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9344     return AMDGPU::WWM_COPY;
9345 
9346   return AMDGPU::COPY;
9347 }
9348 
9349 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9350                                        Register Reg) const {
9351   // We need to handle instructions which may be inserted during register
9352   // allocation to handle the prolog. The initial prolog instruction may have
9353   // been separated from the start of the block by spills and copies inserted
9354   // to satisfy the prolog. However, the insertions for scalar registers can
9355   // always be placed at the BB top as they are independent of the exec mask
9356   // value.
9357   const MachineFunction *MF = MI.getParent()->getParent();
9358   bool IsNullOrVectorRegister = true;
9359   if (Reg) {
9360     const MachineRegisterInfo &MRI = MF->getRegInfo();
9361     IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9362   }
9363 
9364   uint16_t Opcode = MI.getOpcode();
9365   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9366   return IsNullOrVectorRegister &&
9367          (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9368           (Opcode == AMDGPU::IMPLICIT_DEF &&
9369            MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9370           (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9371            MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9372 }
9373 
9374 MachineInstrBuilder
9375 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9376                            MachineBasicBlock::iterator I,
9377                            const DebugLoc &DL,
9378                            Register DestReg) const {
9379   if (ST.hasAddNoCarry())
9380     return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9381 
9382   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9383   Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9384   MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9385 
9386   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9387            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9388 }
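// Illustrative use of the helper above (a sketch, not a verbatim call site):
// callers append the source operands to the returned builder, e.g.
//   TII->getAddNoCarry(MBB, I, DL, DestReg).addImm(Offset).addReg(BaseReg);
// plus whatever trailing operands (such as a clamp bit) the selected e64
// opcode still expects.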
9389 
9390 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9391                                                MachineBasicBlock::iterator I,
9392                                                const DebugLoc &DL,
9393                                                Register DestReg,
9394                                                RegScavenger &RS) const {
9395   if (ST.hasAddNoCarry())
9396     return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9397 
9398   // If available, prefer to use vcc.
9399   Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9400                              ? Register(RI.getVCC())
9401                              : RS.scavengeRegisterBackwards(
9402                                    *RI.getBoolRC(), I, /* RestoreAfter */ false,
9403                                    0, /* AllowSpill */ false);
9404 
9405   // TODO: Users need to deal with this.
9406   if (!UnusedCarry.isValid())
9407     return MachineInstrBuilder();
9408 
9409   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9410            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9411 }
9412 
9413 bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9414   switch (Opcode) {
9415   case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9416   case AMDGPU::SI_KILL_I1_TERMINATOR:
9417     return true;
9418   default:
9419     return false;
9420   }
9421 }
9422 
9423 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
9424   switch (Opcode) {
9425   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9426     return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9427   case AMDGPU::SI_KILL_I1_PSEUDO:
9428     return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9429   default:
9430     llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9431   }
9432 }
9433 
9434 bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9435   return Imm <= getMaxMUBUFImmOffset(ST);
9436 }
9437 
9438 unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9439   // GFX12 field is non-negative 24-bit signed byte offset.
9440   const unsigned OffsetBits =
9441       ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9442   return (1 << OffsetBits) - 1;
9443 }
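// Worked values for the helper above (illustrative): pre-GFX12 this yields
// (1 << 12) - 1 = 4095, while the wider GFX12 field yields (1 << 23) - 1 =
// 8388607.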
9444 
9445 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9446   if (!ST.isWave32())
9447     return;
9448 
9449   if (MI.isInlineAsm())
9450     return;
9451 
9452   for (auto &Op : MI.implicit_operands()) {
9453     if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9454       Op.setReg(AMDGPU::VCC_LO);
9455   }
9456 }
9457 
9458 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9459   if (!isSMRD(MI))
9460     return false;
9461 
9462   // Check that it is using a buffer resource.
9463   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9464   if (Idx == -1) // e.g. s_memtime
9465     return false;
9466 
9467   const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9468   return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9469 }
9470 
9471 // Given Imm, split it into the values to put into the SOffset and ImmOffset
9472 // fields in an MUBUF instruction. Return false if it is not possible (due to a
9473 // hardware bug needing a workaround).
9474 //
9475 // The required alignment ensures that individual address components remain
9476 // aligned if they are aligned to begin with. It also ensures that additional
9477 // offsets within the given alignment can be added to the resulting ImmOffset.
9478 bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9479                                    uint32_t &ImmOffset, Align Alignment) const {
9480   const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9481   const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9482   uint32_t Overflow = 0;
9483 
9484   if (Imm > MaxImm) {
9485     if (Imm <= MaxImm + 64) {
9486       // Use an SOffset inline constant for 4..64
9487       Overflow = Imm - MaxImm;
9488       Imm = MaxImm;
9489     } else {
9490       // Try to keep the same value in SOffset for adjacent loads, so that
9491       // the corresponding register contents can be re-used.
9492       //
9493       // Load values with all low-bits (except for alignment bits) set into
9494       // SOffset, so that a larger range of values can be covered using
9495       // s_movk_i32.
9496       //
9497       // Atomic operations fail to work correctly when individual address
9498       // components are unaligned, even if their sum is aligned.
9499       uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9500       uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9501       Imm = Low;
9502       Overflow = High - Alignment.value();
9503     }
9504   }
9505 
9506   if (Overflow > 0) {
9507     // There is a hardware bug in SI and CI which prevents address clamping in
9508     // MUBUF instructions from working correctly with SOffsets. The immediate
9509     // offset is unaffected.
9510     if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9511       return false;
9512 
9513     // Some targets cannot encode an immediate in the SOffset field.
9514     if (ST.hasRestrictedSOffset())
9515       return false;
9516   }
9517 
9518   ImmOffset = Imm;
9519   SOffset = Overflow;
9520   return true;
9521 }
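// Worked example for splitMUBUFOffset (illustrative, assuming a pre-GFX12
// maximum immediate of 4095 and Alignment = 4, so MaxImm = 4092):
//   Imm = 4100 -> ImmOffset = 4092, SOffset = 8 (an SOffset inline constant).
//   Imm = 5000 -> High = 4096, Low = 908, so ImmOffset = 908, SOffset = 4092;
//                 both components remain 4-aligned and still sum to 5000.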
9522 
9523 // Depending on the used address space and instructions, some immediate offsets
9524 // are allowed and some are not.
9525 // Pre-GFX12, flat instruction offsets can only be non-negative; global and
9526 // scratch instruction offsets can also be negative. On GFX12, offsets can be
9527 // negative for all variants.
9528 //
9529 // There are several bugs related to these offsets:
9530 // On gfx10.1, flat instructions that go into the global address space cannot
9531 // use an offset.
9532 //
9533 // For scratch instructions, the address can be either an SGPR or a VGPR.
9534 // The following offsets can be used, depending on the architecture (x means
9535 // cannot be used):
9536 // +----------------------------+------+------+
9537 // | Address-Mode               | SGPR | VGPR |
9538 // +----------------------------+------+------+
9539 // | gfx9                       |      |      |
9540 // | negative, 4-aligned offset | x    | ok   |
9541 // | negative, unaligned offset | x    | ok   |
9542 // +----------------------------+------+------+
9543 // | gfx10                      |      |      |
9544 // | negative, 4-aligned offset | ok   | ok   |
9545 // | negative, unaligned offset | ok   | x    |
9546 // +----------------------------+------+------+
9547 // | gfx10.3                    |      |      |
9548 // | negative, 4-aligned offset | ok   | ok   |
9549 // | negative, unaligned offset | ok   | ok   |
9550 // +----------------------------+------+------+
9551 //
9552 // This function ignores the addressing mode, so if an offset cannot be used in
9553 // one addressing mode, it is considered illegal.
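// Example (illustrative): with a 13-bit signed offset field, offsets in
// [-4096, 4095] fit; pre-GFX12 a plain FLAT access must additionally be
// non-negative, so -16 is rejected for FLAT but may be accepted for global or
// scratch variants (subject to the bugs listed above).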
9554 bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9555                                     uint64_t FlatVariant) const {
9556   // TODO: Should 0 be special cased?
9557   if (!ST.hasFlatInstOffsets())
9558     return false;
9559 
9560   if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9561       (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9562        AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9563     return false;
9564 
9565   if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9566       FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9567       (Offset % 4) != 0) {
9568     return false;
9569   }
9570 
9571   bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9572   unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9573   return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9574 }
9575 
9576 // See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9577 std::pair<int64_t, int64_t>
9578 SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9579                              uint64_t FlatVariant) const {
9580   int64_t RemainderOffset = COffsetVal;
9581   int64_t ImmField = 0;
9582 
9583   bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9584   const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9585 
9586   if (AllowNegative) {
9587     // Use signed division by a power of two to truncate towards 0.
9588     int64_t D = 1LL << NumBits;
9589     RemainderOffset = (COffsetVal / D) * D;
9590     ImmField = COffsetVal - RemainderOffset;
9591 
9592     if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9593         FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9594         (ImmField % 4) != 0) {
9595       // Make ImmField a multiple of 4
9596       RemainderOffset += ImmField % 4;
9597       ImmField -= ImmField % 4;
9598     }
9599   } else if (COffsetVal >= 0) {
9600     ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9601     RemainderOffset = COffsetVal - ImmField;
9602   }
9603 
9604   assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9605   assert(RemainderOffset + ImmField == COffsetVal);
9606   return {ImmField, RemainderOffset};
9607 }
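// Worked example for splitFlatOffset (illustrative, assuming a 13-bit signed
// offset field, i.e. NumBits = 12 and D = 4096, with negative offsets
// allowed): COffsetVal = 9000 splits into ImmField = 808 and
// RemainderOffset = 8192; a caller would typically add 8192 to the base
// address and encode 808 as the immediate offset.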
9608 
9609 bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
9610   if (ST.hasNegativeScratchOffsetBug() &&
9611       FlatVariant == SIInstrFlags::FlatScratch)
9612     return false;
9613 
9614   return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9615 }
9616 
9617 static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9618   switch (ST.getGeneration()) {
9619   default:
9620     break;
9621   case AMDGPUSubtarget::SOUTHERN_ISLANDS:
9622   case AMDGPUSubtarget::SEA_ISLANDS:
9623     return SIEncodingFamily::SI;
9624   case AMDGPUSubtarget::VOLCANIC_ISLANDS:
9625   case AMDGPUSubtarget::GFX9:
9626     return SIEncodingFamily::VI;
9627   case AMDGPUSubtarget::GFX10:
9628     return SIEncodingFamily::GFX10;
9629   case AMDGPUSubtarget::GFX11:
9630     return SIEncodingFamily::GFX11;
9631   case AMDGPUSubtarget::GFX12:
9632     return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
9633                                 : SIEncodingFamily::GFX12;
9634   }
9635   llvm_unreachable("Unknown subtarget generation!");
9636 }
9637 
9638 bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9639   switch(MCOp) {
9640   // These opcodes use indirect register addressing so
9641   // they need special handling by codegen (currently missing).
9642   // Therefore it is too risky to allow these opcodes
9643   // to be selected by the DPP combiner or the SDWA peephole pass.
9644   case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9645   case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9646   case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9647   case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9648   case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9649   case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9650   case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9651   case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9652     return true;
9653   default:
9654     return false;
9655   }
9656 }
9657 
9658 #define GENERATE_RENAMED_GFX9_CASES(OPCODE)                                    \
9659   case OPCODE##_dpp:                                                           \
9660   case OPCODE##_e32:                                                           \
9661   case OPCODE##_e64:                                                           \
9662   case OPCODE##_e64_dpp:                                                       \
9663   case OPCODE##_sdwa:
9664 
9665 static bool isRenamedInGFX9(int Opcode) {
9666   switch (Opcode) {
9667     GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9668     GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9669     GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9670     GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9671     GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9672     GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9673     GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9674     GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9675     GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9676   //
9677   case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9678   case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
9679   case AMDGPU::V_FMA_F16_gfx9_e64:
9680   case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
9681   case AMDGPU::V_INTERP_P2_F16:
9682   case AMDGPU::V_MAD_F16_e64:
9683   case AMDGPU::V_MAD_U16_e64:
9684   case AMDGPU::V_MAD_I16_e64:
9685     return true;
9686   default:
9687     return false;
9688   }
9689 }
9690 
9691 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9692   Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9693 
9694   unsigned Gen = subtargetEncodingFamily(ST);
9695 
9696   if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
9697     Gen = SIEncodingFamily::GFX9;
9698 
9699   // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9700   // subtarget has the UnpackedD16VMem feature.
9701   // TODO: remove this when we discard GFX80 encoding.
9702   if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9703     Gen = SIEncodingFamily::GFX80;
9704 
9705   if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9706     switch (ST.getGeneration()) {
9707     default:
9708       Gen = SIEncodingFamily::SDWA;
9709       break;
9710     case AMDGPUSubtarget::GFX9:
9711       Gen = SIEncodingFamily::SDWA9;
9712       break;
9713     case AMDGPUSubtarget::GFX10:
9714       Gen = SIEncodingFamily::SDWA10;
9715       break;
9716     }
9717   }
9718 
9719   if (isMAI(Opcode)) {
9720     int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9721     if (MFMAOp != -1)
9722       Opcode = MFMAOp;
9723   }
9724 
9725   int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9726 
9727   if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
9728     MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);
9729 
9730   // -1 means that Opcode is already a native instruction.
9731   if (MCOp == -1)
9732     return Opcode;
9733 
9734   if (ST.hasGFX90AInsts()) {
9735     uint16_t NMCOp = (uint16_t)-1;
9736     if (ST.hasGFX940Insts())
9737       NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
9738     if (NMCOp == (uint16_t)-1)
9739       NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
9740     if (NMCOp == (uint16_t)-1)
9741       NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
9742     if (NMCOp != (uint16_t)-1)
9743       MCOp = NMCOp;
9744   }
9745 
9746   // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9747   // no encoding in the given subtarget generation.
9748   if (MCOp == (uint16_t)-1)
9749     return -1;
9750 
9751   if (isAsmOnlyOpcode(MCOp))
9752     return -1;
9753 
9754   return MCOp;
9755 }
9756 
9757 static
9758 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
9759   assert(RegOpnd.isReg());
9760   return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9761                              getRegSubRegPair(RegOpnd);
9762 }
9763 
9764 TargetInstrInfo::RegSubRegPair
9765 llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
9766   assert(MI.isRegSequence());
9767   for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9768     if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9769       auto &RegOp = MI.getOperand(1 + 2 * I);
9770       return getRegOrUndef(RegOp);
9771     }
9772   return TargetInstrInfo::RegSubRegPair();
9773 }
9774 
9775 // Try to find the definition of reg:subreg in subreg-manipulation pseudos
9776 // Following a subreg of reg:subreg isn't supported
9777 static bool followSubRegDef(MachineInstr &MI,
9778                             TargetInstrInfo::RegSubRegPair &RSR) {
9779   if (!RSR.SubReg)
9780     return false;
9781   switch (MI.getOpcode()) {
9782   default: break;
9783   case AMDGPU::REG_SEQUENCE:
9784     RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9785     return true;
9786   // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
9787   case AMDGPU::INSERT_SUBREG:
9788     if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9789       // inserted the subreg we're looking for
9790       RSR = getRegOrUndef(MI.getOperand(2));
9791     else { // the subreg in the rest of the reg
9792       auto R1 = getRegOrUndef(MI.getOperand(1));
9793       if (R1.SubReg) // subreg of subreg isn't supported
9794         return false;
9795       RSR.Reg = R1.Reg;
9796     }
9797     return true;
9798   }
9799   return false;
9800 }
9801 
9802 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
9803                                      MachineRegisterInfo &MRI) {
9804   assert(MRI.isSSA());
9805   if (!P.Reg.isVirtual())
9806     return nullptr;
9807 
9808   auto RSR = P;
9809   auto *DefInst = MRI.getVRegDef(RSR.Reg);
9810   while (auto *MI = DefInst) {
9811     DefInst = nullptr;
9812     switch (MI->getOpcode()) {
9813     case AMDGPU::COPY:
9814     case AMDGPU::V_MOV_B32_e32: {
9815       auto &Op1 = MI->getOperand(1);
9816       if (Op1.isReg() && Op1.getReg().isVirtual()) {
9817         if (Op1.isUndef())
9818           return nullptr;
9819         RSR = getRegSubRegPair(Op1);
9820         DefInst = MRI.getVRegDef(RSR.Reg);
9821       }
9822       break;
9823     }
9824     default:
9825       if (followSubRegDef(*MI, RSR)) {
9826         if (!RSR.Reg)
9827           return nullptr;
9828         DefInst = MRI.getVRegDef(RSR.Reg);
9829       }
9830     }
9831     if (!DefInst)
9832       return MI;
9833   }
9834   return nullptr;
9835 }
9836 
9837 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
9838                                       Register VReg,
9839                                       const MachineInstr &DefMI,
9840                                       const MachineInstr &UseMI) {
9841   assert(MRI.isSSA() && "Must be run on SSA");
9842 
9843   auto *TRI = MRI.getTargetRegisterInfo();
9844   auto *DefBB = DefMI.getParent();
9845 
9846   // Don't bother searching between blocks, although it is possible this block
9847   // doesn't modify exec.
9848   if (UseMI.getParent() != DefBB)
9849     return true;
9850 
9851   const int MaxInstScan = 20;
9852   int NumInst = 0;
9853 
9854   // Stop scan at the use.
9855   auto E = UseMI.getIterator();
9856   for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9857     if (I->isDebugInstr())
9858       continue;
9859 
9860     if (++NumInst > MaxInstScan)
9861       return true;
9862 
9863     if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9864       return true;
9865   }
9866 
9867   return false;
9868 }
9869 
9870 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
9871                                          Register VReg,
9872                                          const MachineInstr &DefMI) {
9873   assert(MRI.isSSA() && "Must be run on SSA");
9874 
9875   auto *TRI = MRI.getTargetRegisterInfo();
9876   auto *DefBB = DefMI.getParent();
9877 
9878   const int MaxUseScan = 10;
9879   int NumUse = 0;
9880 
9881   for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9882     auto &UseInst = *Use.getParent();
9883     // Don't bother searching between blocks, although it is possible this block
9884     // doesn't modify exec.
9885     if (UseInst.getParent() != DefBB || UseInst.isPHI())
9886       return true;
9887 
9888     if (++NumUse > MaxUseScan)
9889       return true;
9890   }
9891 
9892   if (NumUse == 0)
9893     return false;
9894 
9895   const int MaxInstScan = 20;
9896   int NumInst = 0;
9897 
9898   // Stop scan when we have seen all the uses.
9899   for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9900     assert(I != DefBB->end());
9901 
9902     if (I->isDebugInstr())
9903       continue;
9904 
9905     if (++NumInst > MaxInstScan)
9906       return true;
9907 
9908     for (const MachineOperand &Op : I->operands()) {
9909       // We don't check reg masks here as they're used only on calls:
9910       // 1. EXEC is only considered const within one BB
9911       // 2. Call should be a terminator instruction if present in a BB
9912 
9913       if (!Op.isReg())
9914         continue;
9915 
9916       Register Reg = Op.getReg();
9917       if (Op.isUse()) {
9918         if (Reg == VReg && --NumUse == 0)
9919           return false;
9920       } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9921         return true;
9922     }
9923   }
9924 }
9925 
9926 MachineInstr *SIInstrInfo::createPHIDestinationCopy(
9927     MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
9928     const DebugLoc &DL, Register Src, Register Dst) const {
9929   auto Cur = MBB.begin();
9930   if (Cur != MBB.end())
9931     do {
9932       if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9933         return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9934       ++Cur;
9935     } while (Cur != MBB.end() && Cur != LastPHIIt);
9936 
9937   return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9938                                                    Dst);
9939 }
9940 
9941 MachineInstr *SIInstrInfo::createPHISourceCopy(
9942     MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
9943     const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9944   if (InsPt != MBB.end() &&
9945       (InsPt->getOpcode() == AMDGPU::SI_IF ||
9946        InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9947        InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9948       InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9949     InsPt++;
9950     return BuildMI(MBB, InsPt, DL,
9951                    get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9952                                      : AMDGPU::S_MOV_B64_term),
9953                    Dst)
9954         .addReg(Src, 0, SrcSubReg)
9955         .addReg(AMDGPU::EXEC, RegState::Implicit);
9956   }
9957   return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9958                                               Dst);
9959 }
9960 
9961 bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9962 
9963 MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
9964     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
9965     MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9966     VirtRegMap *VRM) const {
9967   // This is a bit of a hack (copied from AArch64). Consider this instruction:
9968   //
9969   //   %0:sreg_32 = COPY $m0
9970   //
9971   // We explicitly chose SReg_32 for the virtual register so such a copy might
9972   // be eliminated by RegisterCoalescer. However, that may not be possible, and
9973   // %0 may even spill. We can't spill $m0 normally (it would require copying to
9974   // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9975   // TargetInstrInfo::foldMemoryOperand() is going to try.
9976   // A similar issue also exists with spilling and reloading $exec registers.
9977   //
9978   // To prevent that, constrain the %0 register class here.
9979   if (isFullCopyInstr(MI)) {
9980     Register DstReg = MI.getOperand(0).getReg();
9981     Register SrcReg = MI.getOperand(1).getReg();
9982     if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9983         (DstReg.isVirtual() != SrcReg.isVirtual())) {
9984       MachineRegisterInfo &MRI = MF.getRegInfo();
9985       Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9986       const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9987       if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9988         MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9989         return nullptr;
9990       }
9991       if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9992         MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9993         return nullptr;
9994       }
9995     }
9996   }
9997 
9998   return nullptr;
9999 }
10000 
10001 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10002                                       const MachineInstr &MI,
10003                                       unsigned *PredCost) const {
10004   if (MI.isBundle()) {
10005     MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10006     MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10007     unsigned Lat = 0, Count = 0;
10008     for (++I; I != E && I->isBundledWithPred(); ++I) {
10009       ++Count;
10010       Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10011     }
10012     return Lat + Count - 1;
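    // Illustrative: a bundle of three instructions whose slowest member has a
    // latency of 4 is reported as 4 + 3 - 1 = 6, i.e. the maximum latency plus
    // one cycle per additional bundled instruction (a rough estimate).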
10013   }
10014 
10015   return SchedModel.computeInstrLatency(&MI);
10016 }
10017 
10018 InstructionUniformity
10019 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10020   unsigned opcode = MI.getOpcode();
10021   if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10022     auto IID = GI->getIntrinsicID();
10023     if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10024       return InstructionUniformity::NeverUniform;
10025     if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10026       return InstructionUniformity::AlwaysUniform;
10027 
10028     switch (IID) {
10029     case Intrinsic::amdgcn_if:
10030     case Intrinsic::amdgcn_else:
10031       // FIXME: Uniform if second result
10032       break;
10033     }
10034 
10035     return InstructionUniformity::Default;
10036   }
10037 
10038   // Loads from the private and flat address spaces are divergent, because
10039   // threads can execute the load instruction with the same inputs and get
10040   // different results.
10041   //
10042   // All other loads are not divergent, because if threads issue loads with the
10043   // same arguments, they will always get the same result.
10044   if (opcode == AMDGPU::G_LOAD) {
10045     if (MI.memoperands_empty())
10046       return InstructionUniformity::NeverUniform; // conservative assumption
10047 
10048     if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10049           return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10050                  mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10051         })) {
10052       // At least one MMO in a non-global address space.
10053       return InstructionUniformity::NeverUniform;
10054     }
10055     return InstructionUniformity::Default;
10056   }
10057 
10058   if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
10059       opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10060       opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10061       AMDGPU::isGenericAtomic(opcode)) {
10062     return InstructionUniformity::NeverUniform;
10063   }
10064   return InstructionUniformity::Default;
10065 }
10066 
10067 InstructionUniformity
10068 SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10069 
10070   if (isNeverUniform(MI))
10071     return InstructionUniformity::NeverUniform;
10072 
10073   unsigned opcode = MI.getOpcode();
10074   if (opcode == AMDGPU::V_READLANE_B32 ||
10075       opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10076       opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10077     return InstructionUniformity::AlwaysUniform;
10078 
10079   if (isCopyInstr(MI)) {
10080     const MachineOperand &srcOp = MI.getOperand(1);
10081     if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10082       const TargetRegisterClass *regClass =
10083           RI.getPhysRegBaseClass(srcOp.getReg());
10084       return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10085                                       : InstructionUniformity::NeverUniform;
10086     }
10087     return InstructionUniformity::Default;
10088   }
10089 
10090   // GMIR handling
10091   if (MI.isPreISelOpcode())
10092     return SIInstrInfo::getGenericInstructionUniformity(MI);
10093 
10094   // Atomics are divergent because they are executed sequentially: when an
10095   // atomic operation refers to the same address in each thread, then each
10096   // thread after the first sees the value written by the previous thread as
10097   // the original value.
10098 
10099   if (isAtomic(MI))
10100     return InstructionUniformity::NeverUniform;
10101 
10102   // Loads from the private and flat address spaces are divergent, because
10103   // threads can execute the load instruction with the same inputs and get
10104   // different results.
10105   if (isFLAT(MI) && MI.mayLoad()) {
10106     if (MI.memoperands_empty())
10107       return InstructionUniformity::NeverUniform; // conservative assumption
10108 
10109     if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10110           return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10111                  mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10112         })) {
10113       // At least one MMO in a non-global address space.
10114       return InstructionUniformity::NeverUniform;
10115     }
10116 
10117     return InstructionUniformity::Default;
10118   }
10119 
10120   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10121   const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10122 
10123   // FIXME: It's conceptually broken to report this for an instruction, and not
10124   // a specific def operand. For inline asm in particular, there could be mixed
10125   // uniform and divergent results.
10126   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10127     const MachineOperand &SrcOp = MI.getOperand(I);
10128     if (!SrcOp.isReg())
10129       continue;
10130 
10131     Register Reg = SrcOp.getReg();
10132     if (!Reg || !SrcOp.readsReg())
10133       continue;
10134 
10135     // If RegBank is null, this is unassigned or an unallocatable special
10136     // register, which are all scalars.
10137     const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10138     if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10139       return InstructionUniformity::NeverUniform;
10140   }
10141 
10142   // TODO: The uniformity checks above can be rearranged for more
10143   // readability.
10144 
10145   // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10146   //       currently turned into no-op COPYs by SelectionDAG ISel and are
10147   //       therefore no longer recognizable.
10148 
10149   return InstructionUniformity::Default;
10150 }
10151 
10152 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10153   switch (MF.getFunction().getCallingConv()) {
10154   case CallingConv::AMDGPU_PS:
10155     return 1;
10156   case CallingConv::AMDGPU_VS:
10157     return 2;
10158   case CallingConv::AMDGPU_GS:
10159     return 3;
10160   case CallingConv::AMDGPU_HS:
10161   case CallingConv::AMDGPU_LS:
10162   case CallingConv::AMDGPU_ES: {
10163     const Function &F = MF.getFunction();
10164     F.getContext().diagnose(DiagnosticInfoUnsupported(
10165         F, "ds_ordered_count unsupported for this calling conv"));
10166     [[fallthrough]];
10167   }
10168   case CallingConv::AMDGPU_CS:
10169   case CallingConv::AMDGPU_KERNEL:
10170   case CallingConv::C:
10171   case CallingConv::Fast:
10172   default:
10173     // Assume other calling conventions are various compute callable functions
10174     return 0;
10175   }
10176 }
10177 
10178 bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10179                                  Register &SrcReg2, int64_t &CmpMask,
10180                                  int64_t &CmpValue) const {
10181   if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10182     return false;
10183 
10184   switch (MI.getOpcode()) {
10185   default:
10186     break;
10187   case AMDGPU::S_CMP_EQ_U32:
10188   case AMDGPU::S_CMP_EQ_I32:
10189   case AMDGPU::S_CMP_LG_U32:
10190   case AMDGPU::S_CMP_LG_I32:
10191   case AMDGPU::S_CMP_LT_U32:
10192   case AMDGPU::S_CMP_LT_I32:
10193   case AMDGPU::S_CMP_GT_U32:
10194   case AMDGPU::S_CMP_GT_I32:
10195   case AMDGPU::S_CMP_LE_U32:
10196   case AMDGPU::S_CMP_LE_I32:
10197   case AMDGPU::S_CMP_GE_U32:
10198   case AMDGPU::S_CMP_GE_I32:
10199   case AMDGPU::S_CMP_EQ_U64:
10200   case AMDGPU::S_CMP_LG_U64:
10201     SrcReg = MI.getOperand(0).getReg();
10202     if (MI.getOperand(1).isReg()) {
10203       if (MI.getOperand(1).getSubReg())
10204         return false;
10205       SrcReg2 = MI.getOperand(1).getReg();
10206       CmpValue = 0;
10207     } else if (MI.getOperand(1).isImm()) {
10208       SrcReg2 = Register();
10209       CmpValue = MI.getOperand(1).getImm();
10210     } else {
10211       return false;
10212     }
10213     CmpMask = ~0;
10214     return true;
10215   case AMDGPU::S_CMPK_EQ_U32:
10216   case AMDGPU::S_CMPK_EQ_I32:
10217   case AMDGPU::S_CMPK_LG_U32:
10218   case AMDGPU::S_CMPK_LG_I32:
10219   case AMDGPU::S_CMPK_LT_U32:
10220   case AMDGPU::S_CMPK_LT_I32:
10221   case AMDGPU::S_CMPK_GT_U32:
10222   case AMDGPU::S_CMPK_GT_I32:
10223   case AMDGPU::S_CMPK_LE_U32:
10224   case AMDGPU::S_CMPK_LE_I32:
10225   case AMDGPU::S_CMPK_GE_U32:
10226   case AMDGPU::S_CMPK_GE_I32:
10227     SrcReg = MI.getOperand(0).getReg();
10228     SrcReg2 = Register();
10229     CmpValue = MI.getOperand(1).getImm();
10230     CmpMask = ~0;
10231     return true;
10232   }
10233 
10234   return false;
10235 }
10236 
10237 bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10238                                        Register SrcReg2, int64_t CmpMask,
10239                                        int64_t CmpValue,
10240                                        const MachineRegisterInfo *MRI) const {
10241   if (!SrcReg || SrcReg.isPhysical())
10242     return false;
10243 
10244   if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10245     return false;
10246 
10247   const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10248                                this](int64_t ExpectedValue, unsigned SrcSize,
10249                                      bool IsReversible, bool IsSigned) -> bool {
10250     // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10251     // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10252     // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10253     // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10254     // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10255     // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10256     // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10257     // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10258     // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10259     // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10260     //
10261     // Signed ge/gt are not used for the sign bit.
10262     //
10263     // If result of the AND is unused except in the compare:
10264     // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10265     //
10266     // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10267     // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10268     // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10269     // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10270     // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10271     // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
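    //
    // Illustrative: for
    //   %t = S_AND_B32 %src, 4    (also defines SCC)
    //   S_CMP_EQ_U32 %t, 4
    // with %t otherwise unused, the compare is erased and the AND is replaced
    // by S_BITCMP1_B32 %src, 2, which leaves the same value in SCC.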
10272 
10273     MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10274     if (!Def || Def->getParent() != CmpInstr.getParent())
10275       return false;
10276 
10277     if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10278         Def->getOpcode() != AMDGPU::S_AND_B64)
10279       return false;
10280 
10281     int64_t Mask;
10282     const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10283       if (MO->isImm())
10284         Mask = MO->getImm();
10285       else if (!getFoldableImm(MO, Mask))
10286         return false;
10287       Mask &= maxUIntN(SrcSize);
10288       return isPowerOf2_64(Mask);
10289     };
10290 
10291     MachineOperand *SrcOp = &Def->getOperand(1);
10292     if (isMask(SrcOp))
10293       SrcOp = &Def->getOperand(2);
10294     else if (isMask(&Def->getOperand(2)))
10295       SrcOp = &Def->getOperand(1);
10296     else
10297       return false;
10298 
10299     // A valid Mask is required to have a single bit set, hence a non-zero and
10300     // power-of-two value. This verifies that we will not do a 64-bit shift below.
10301     assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10302     unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10303     if (IsSigned && BitNo == SrcSize - 1)
10304       return false;
10305 
10306     ExpectedValue <<= BitNo;
10307 
10308     bool IsReversedCC = false;
10309     if (CmpValue != ExpectedValue) {
10310       if (!IsReversible)
10311         return false;
10312       IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10313       if (!IsReversedCC)
10314         return false;
10315     }
10316 
10317     Register DefReg = Def->getOperand(0).getReg();
10318     if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10319       return false;
10320 
10321     for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10322          I != E; ++I) {
10323       if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10324           I->killsRegister(AMDGPU::SCC, &RI))
10325         return false;
10326     }
10327 
10328     MachineOperand *SccDef =
10329         Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10330     SccDef->setIsDead(false);
10331     CmpInstr.eraseFromParent();
10332 
10333     if (!MRI->use_nodbg_empty(DefReg)) {
10334       assert(!IsReversedCC);
10335       return true;
10336     }
10337 
10338     // Replace an AND whose result is unused with an S_BITCMP.
10339     MachineBasicBlock *MBB = Def->getParent();
10340 
10341     unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10342                                                      : AMDGPU::S_BITCMP1_B32
10343                                       : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10344                                                      : AMDGPU::S_BITCMP1_B64;
10345 
10346     BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10347       .add(*SrcOp)
10348       .addImm(BitNo);
10349     Def->eraseFromParent();
10350 
10351     return true;
10352   };
10353 
10354   switch (CmpInstr.getOpcode()) {
10355   default:
10356     break;
10357   case AMDGPU::S_CMP_EQ_U32:
10358   case AMDGPU::S_CMP_EQ_I32:
10359   case AMDGPU::S_CMPK_EQ_U32:
10360   case AMDGPU::S_CMPK_EQ_I32:
10361     return optimizeCmpAnd(1, 32, true, false);
10362   case AMDGPU::S_CMP_GE_U32:
10363   case AMDGPU::S_CMPK_GE_U32:
10364     return optimizeCmpAnd(1, 32, false, false);
10365   case AMDGPU::S_CMP_GE_I32:
10366   case AMDGPU::S_CMPK_GE_I32:
10367     return optimizeCmpAnd(1, 32, false, true);
10368   case AMDGPU::S_CMP_EQ_U64:
10369     return optimizeCmpAnd(1, 64, true, false);
10370   case AMDGPU::S_CMP_LG_U32:
10371   case AMDGPU::S_CMP_LG_I32:
10372   case AMDGPU::S_CMPK_LG_U32:
10373   case AMDGPU::S_CMPK_LG_I32:
10374     return optimizeCmpAnd(0, 32, true, false);
10375   case AMDGPU::S_CMP_GT_U32:
10376   case AMDGPU::S_CMPK_GT_U32:
10377     return optimizeCmpAnd(0, 32, false, false);
10378   case AMDGPU::S_CMP_GT_I32:
10379   case AMDGPU::S_CMPK_GT_I32:
10380     return optimizeCmpAnd(0, 32, false, true);
10381   case AMDGPU::S_CMP_LG_U64:
10382     return optimizeCmpAnd(0, 64, true, false);
10383   }
10384 
10385   return false;
10386 }
10387 
10388 void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
10389                                             AMDGPU::OpName OpName) const {
10390   if (!ST.needsAlignedVGPRs())
10391     return;
10392 
10393   int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10394   if (OpNo < 0)
10395     return;
10396   MachineOperand &Op = MI.getOperand(OpNo);
10397   if (getOpSize(MI, OpNo) > 4)
10398     return;
10399 
10400   // Add implicit aligned super-reg to force alignment on the data operand.
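  // Illustrative MIR shape after the rewrite (register names are hypothetical):
  //   %undef:vgpr_32 = IMPLICIT_DEF
  //   %new:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0, %undef, %subreg.sub1
  //   ... the operand then reads %new.sub0, and an implicit use of %new is
  //   appended so the aligned register class constraint is honored.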
10401   const DebugLoc &DL = MI.getDebugLoc();
10402   MachineBasicBlock *BB = MI.getParent();
10403   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10404   Register DataReg = Op.getReg();
10405   bool IsAGPR = RI.isAGPR(MRI, DataReg);
10406   Register Undef = MRI.createVirtualRegister(
10407       IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10408   BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10409   Register NewVR =
10410       MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10411                                        : &AMDGPU::VReg_64_Align2RegClass);
10412   BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10413       .addReg(DataReg, 0, Op.getSubReg())
10414       .addImm(AMDGPU::sub0)
10415       .addReg(Undef)
10416       .addImm(AMDGPU::sub1);
10417   Op.setReg(NewVR);
10418   Op.setSubReg(AMDGPU::sub0);
10419   MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10420 }
10421 
10422 bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
10423   if (isIGLP(*MI))
10424     return false;
10425 
10426   return TargetInstrInfo::isGlobalMemoryObject(MI);
10427 }
10428 
10429 bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
10430   unsigned Opcode = MI.getOpcode();
10431 
10432   if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) ||
10433       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10434       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10435     return false;
10436 
10437   if (!ST.hasGFX940Insts())
10438     return true;
10439 
10440   return AMDGPU::getMAIIsGFX940XDL(Opcode);
10441 }
10442