//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF,
                                        GISelValueTracking *VT,
                                        CodeGenCoverage *CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  InstructionSelector::setupMF(MF, VT, CoverageInfo, PSI, BFI);
}

// Return the wave level SGPR base address if this is a wave address.
static Register getWaveAddress(const MachineInstr *Def) {
  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
             ? Def->getOperand(1).getReg()
             : Register();
}

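// Check whether Reg holds a wave-sized lane mask ("vcc" value). As an
// illustrative example, a divergent compare such as
//   %c:vcc(s1) = G_ICMP intpred(eq), %a(s32), %b(s32)
// produces one, while an SGPR truncation like
//   %t:sgpr(s1) = G_TRUNC %x(s32)
// is a plain scalar bit and is deliberately not treated as one.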
bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
      return false;
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
  return RB->getID() == AMDGPU::VCCRegBankID;
}

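// Rewrite a copy-like intrinsic into its target pseudo. For example, this
// turns
//   %d:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wqm), %s:vgpr(s32)
// into
//   %d = WQM %s, implicit $exec
// with both operands constrained to a common register class.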
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

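// Select a generic COPY. A copy into a lane mask may need expansion since
// the high bits of a 32-bit bool source cannot be trusted; illustratively,
// an SGPR source becomes
//   %m = S_AND_B32 1, %src, implicit-def dead $scc
//   %dst = V_CMP_NE_U32_e64 0, %m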
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
          assert(Subtarget->useRealTrue16Insts());
          const int64_t NoMods = 0;
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
              .addImm(NoMods)
              .addImm(1)
              .addImm(NoMods)
              .addReg(SrcReg)
              .addImm(NoMods);
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
              .addImm(NoMods)
              .addImm(0)
              .addImm(NoMods)
              .addReg(MaskedReg)
              .addImm(NoMods);
        } else {
          bool IsSGPR = TRI.isSGPRClass(SrcRC);
          unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
          auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
                         .addImm(1)
                         .addReg(SrcReg);
          if (IsSGPR)
            And.setOperandDead(3); // Dead scc

          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
              .addImm(0)
              .addReg(MaskedReg);
        }
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
            TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

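// Copy a lane mask into SCC. E.g. on wave64 this emits
//   S_CMP_LG_U64 %src, 0
//   %dst:sreg_32 = COPY $scc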
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  unsigned CmpOpc =
      STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
  MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
                          .addReg(I.getOperand(1).getReg())
                          .addImm(0);
  if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
    return false;

  Register DstReg = I.getOperand(0).getReg();
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  std::optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);

  if (Arg) {
    const int64_t Value = Arg->Value.getZExtValue();
    if (Value == 0) {
      unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else {
      assert(Value == 1);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
    }
    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
  }

  // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);

  unsigned SelectOpcode =
      STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
                             .addReg(TRI.getExec())
                             .addImm(0);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
                 .addReg(SrcReg);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // S1 G_PHIs should not be selected in instruction-select, instead:
  // - divergent S1 G_PHI should go through lane mask merging algorithm
  //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
  // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
  if (DefTy == LLT::scalar(1))
    return false;

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // If inputs have register bank, assign corresponding reg class.
  // Note: registers don't need to have the same reg bank.
  for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
    const Register SrcReg = I.getOperand(i).getReg();

    const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
    if (RB) {
      const LLT SrcTy = MRI->getType(SrcReg);
      const TargetRegisterClass *SrcRC =
          TRI.getRegClassForTypeOnBank(SrcTy, *RB);
      if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
        return false;
    }
  }

  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

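// Extract a 32-bit half of a 64-bit operand. When splitting an immediate,
// sub0 selects the low 32 bits and sub1 the high 32 bits; e.g.
// 0x0000000100000002 splits into sub0 = 2 and sub1 = 1.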
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
            .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

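// 64-bit adds are split into a low add that produces a carry-out and a high
// add that consumes it. Illustrative VALU expansion:
//   %lo, %carry = V_ADD_CO_U32_e64 %lo1, %lo2, 0
//   %hi = V_ADDC_U32_e64 %hi1, %hi2, %carry, 0
//   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1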
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2))
        .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2)
      .setOperandDead(3); // Dead scc
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);


  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

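// In the scalar case below, the carry travels through SCC: any carry-in is
// copied into $scc before S_ADDC_U32/S_SUBB_U32 executes, and the carry-out
// is copied back out of $scc unless it is unused.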
bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
  MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));

  if (MRI->use_nodbg_empty(Dst1Reg)) {
    CarryInst.setOperandDead(3); // Dead scc
  } else {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
      .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;

  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

// TODO: We should probably legalize these to only using 32-bit results.
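// E.g. extracting bits [63:32] of a 128-bit SGPR value becomes a plain
// subregister copy:
//   %dst:sreg_32 = COPY %src.sub1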
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

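// Merge 32-bit or wider pieces with a REG_SEQUENCE; e.g. two s32 halves
// forming an s64 become
//   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1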
bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // A BUILD_VECTOR with >= 32-bit sources is handled like a G_MERGE_VALUES.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16)
                     .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

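// V_BFE_U32 in essence computes (src >> offset) & ((1 << width) - 1);
// V_BFE_I32 is the same field extract but sign-extends from bit (width - 1).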
bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
                 .addReg(SrcReg)
                 .addReg(OffsetReg)
                 .addReg(WidthReg);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
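// E.g. with an SGPR value and an SGPR lane select, the lane select is moved
// into $m0, which is exempt from the limit:
//   $m0 = COPY %lane
//   %vdst = V_WRITELANE_B32 %val, $m0, %vdst_in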
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  std::optional<ValueAndVReg> ConstSelect =
      getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    std::optional<ValueAndVReg> ConstVal =
        getIConstantVRegValWithLookThrough(Val, *MRI);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addImm(0)     // $src0_modifiers
    .addUse(Src0)  // $src0
    .addImm(0)     // $src1_modifiers
    .addUse(Denom) // $src1
    .addImm(0)     // $src2_modifiers
    .addUse(Numer) // $src2
    .addImm(0)     // $clamp
    .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    return selectSMFMACIntrin(I);
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap:
    return selectPermlaneSwapIntrin(I, IntrinsicID);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

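// Map a compare predicate and operand size to the corresponding VALU compare
// opcode, preferring the _t16/_fake16 forms for 16-bit compares on true16
// subtargets; e.g. a 32-bit ICMP_NE selects V_CMP_NE_U32_e64.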
getV_CMPOpcode(CmpInst::Predicate P,unsigned Size,const GCNSubtarget & ST)1199 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1200                           const GCNSubtarget &ST) {
1201   if (Size != 16 && Size != 32 && Size != 64)
1202     return -1;
1203 
1204   if (Size == 16 && !ST.has16BitInsts())
1205     return -1;
1206 
1207   const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1208                           unsigned FakeS16Opc, unsigned S32Opc,
1209                           unsigned S64Opc) {
1210     if (Size == 16)
1211       return ST.hasTrue16BitInsts()
1212                  ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1213                  : S16Opc;
1214     if (Size == 32)
1215       return S32Opc;
1216     return S64Opc;
1217   };
1218 
1219   switch (P) {
1220   default:
1221     llvm_unreachable("Unknown condition code!");
1222   case CmpInst::ICMP_NE:
1223     return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1224                   AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1225                   AMDGPU::V_CMP_NE_U64_e64);
1226   case CmpInst::ICMP_EQ:
1227     return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1228                   AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1229                   AMDGPU::V_CMP_EQ_U64_e64);
1230   case CmpInst::ICMP_SGT:
1231     return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1232                   AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1233                   AMDGPU::V_CMP_GT_I64_e64);
1234   case CmpInst::ICMP_SGE:
1235     return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1236                   AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1237                   AMDGPU::V_CMP_GE_I64_e64);
1238   case CmpInst::ICMP_SLT:
1239     return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1240                   AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1241                   AMDGPU::V_CMP_LT_I64_e64);
1242   case CmpInst::ICMP_SLE:
1243     return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1244                   AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1245                   AMDGPU::V_CMP_LE_I64_e64);
1246   case CmpInst::ICMP_UGT:
1247     return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1248                   AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1249                   AMDGPU::V_CMP_GT_U64_e64);
1250   case CmpInst::ICMP_UGE:
1251     return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1252                   AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1253                   AMDGPU::V_CMP_GE_U64_e64);
1254   case CmpInst::ICMP_ULT:
1255     return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1256                   AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1257                   AMDGPU::V_CMP_LT_U64_e64);
1258   case CmpInst::ICMP_ULE:
1259     return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1260                   AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1261                   AMDGPU::V_CMP_LE_U64_e64);
1262 
1263   case CmpInst::FCMP_OEQ:
1264     return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1265                   AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1266                   AMDGPU::V_CMP_EQ_F64_e64);
1267   case CmpInst::FCMP_OGT:
1268     return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1269                   AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1270                   AMDGPU::V_CMP_GT_F64_e64);
1271   case CmpInst::FCMP_OGE:
1272     return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1273                   AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1274                   AMDGPU::V_CMP_GE_F64_e64);
1275   case CmpInst::FCMP_OLT:
1276     return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1277                   AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1278                   AMDGPU::V_CMP_LT_F64_e64);
1279   case CmpInst::FCMP_OLE:
1280     return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1281                   AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1282                   AMDGPU::V_CMP_LE_F64_e64);
1283   case CmpInst::FCMP_ONE:
1284     return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1285                   AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1286                   AMDGPU::V_CMP_NEQ_F64_e64);
1287   case CmpInst::FCMP_ORD:
1288     return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1289                   AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1290                   AMDGPU::V_CMP_O_F64_e64);
1291   case CmpInst::FCMP_UNO:
1292     return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1293                   AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1294                   AMDGPU::V_CMP_U_F64_e64);
1295   case CmpInst::FCMP_UEQ:
1296     return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1297                   AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1298                   AMDGPU::V_CMP_NLG_F64_e64);
1299   case CmpInst::FCMP_UGT:
1300     return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1301                   AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1302                   AMDGPU::V_CMP_NLE_F64_e64);
1303   case CmpInst::FCMP_UGE:
1304     return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1305                   AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1306                   AMDGPU::V_CMP_NLT_F64_e64);
1307   case CmpInst::FCMP_ULT:
1308     return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1309                   AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1310                   AMDGPU::V_CMP_NGE_F64_e64);
1311   case CmpInst::FCMP_ULE:
1312     return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1313                   AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1314                   AMDGPU::V_CMP_NGT_F64_e64);
1315   case CmpInst::FCMP_UNE:
1316     return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1317                   AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1318                   AMDGPU::V_CMP_NEQ_F64_e64);
1319   case CmpInst::FCMP_TRUE:
1320     return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1321                   AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1322                   AMDGPU::V_CMP_TRU_F64_e64);
1323   case CmpInst::FCMP_FALSE:
1324     return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1325                   AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1326                   AMDGPU::V_CMP_F_F64_e64);
1327   }
1328 }
1329 
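// Note: when this returns -1 the caller bails out rather than emit an
// unsupported scalar compare; 64-bit compares other than eq/ne and f16
// compares without SALU float instructions have no SALU form.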
1330 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1331                                               unsigned Size) const {
1332   if (Size == 64) {
1333     if (!STI.hasScalarCompareEq64())
1334       return -1;
1335 
1336     switch (P) {
1337     case CmpInst::ICMP_NE:
1338       return AMDGPU::S_CMP_LG_U64;
1339     case CmpInst::ICMP_EQ:
1340       return AMDGPU::S_CMP_EQ_U64;
1341     default:
1342       return -1;
1343     }
1344   }
1345 
1346   if (Size == 32) {
1347     switch (P) {
1348     case CmpInst::ICMP_NE:
1349       return AMDGPU::S_CMP_LG_U32;
1350     case CmpInst::ICMP_EQ:
1351       return AMDGPU::S_CMP_EQ_U32;
1352     case CmpInst::ICMP_SGT:
1353       return AMDGPU::S_CMP_GT_I32;
1354     case CmpInst::ICMP_SGE:
1355       return AMDGPU::S_CMP_GE_I32;
1356     case CmpInst::ICMP_SLT:
1357       return AMDGPU::S_CMP_LT_I32;
1358     case CmpInst::ICMP_SLE:
1359       return AMDGPU::S_CMP_LE_I32;
1360     case CmpInst::ICMP_UGT:
1361       return AMDGPU::S_CMP_GT_U32;
1362     case CmpInst::ICMP_UGE:
1363       return AMDGPU::S_CMP_GE_U32;
1364     case CmpInst::ICMP_ULT:
1365       return AMDGPU::S_CMP_LT_U32;
1366     case CmpInst::ICMP_ULE:
1367       return AMDGPU::S_CMP_LE_U32;
1368     case CmpInst::FCMP_OEQ:
1369       return AMDGPU::S_CMP_EQ_F32;
1370     case CmpInst::FCMP_OGT:
1371       return AMDGPU::S_CMP_GT_F32;
1372     case CmpInst::FCMP_OGE:
1373       return AMDGPU::S_CMP_GE_F32;
1374     case CmpInst::FCMP_OLT:
1375       return AMDGPU::S_CMP_LT_F32;
1376     case CmpInst::FCMP_OLE:
1377       return AMDGPU::S_CMP_LE_F32;
1378     case CmpInst::FCMP_ONE:
1379       return AMDGPU::S_CMP_LG_F32;
1380     case CmpInst::FCMP_ORD:
1381       return AMDGPU::S_CMP_O_F32;
1382     case CmpInst::FCMP_UNO:
1383       return AMDGPU::S_CMP_U_F32;
1384     case CmpInst::FCMP_UEQ:
1385       return AMDGPU::S_CMP_NLG_F32;
1386     case CmpInst::FCMP_UGT:
1387       return AMDGPU::S_CMP_NLE_F32;
1388     case CmpInst::FCMP_UGE:
1389       return AMDGPU::S_CMP_NLT_F32;
1390     case CmpInst::FCMP_ULT:
1391       return AMDGPU::S_CMP_NGE_F32;
1392     case CmpInst::FCMP_ULE:
1393       return AMDGPU::S_CMP_NGT_F32;
1394     case CmpInst::FCMP_UNE:
1395       return AMDGPU::S_CMP_NEQ_F32;
1396     default:
1397       llvm_unreachable("Unknown condition code!");
1398     }
1399   }
1400 
1401   if (Size == 16) {
1402     if (!STI.hasSALUFloatInsts())
1403       return -1;
1404 
1405     switch (P) {
1406     case CmpInst::FCMP_OEQ:
1407       return AMDGPU::S_CMP_EQ_F16;
1408     case CmpInst::FCMP_OGT:
1409       return AMDGPU::S_CMP_GT_F16;
1410     case CmpInst::FCMP_OGE:
1411       return AMDGPU::S_CMP_GE_F16;
1412     case CmpInst::FCMP_OLT:
1413       return AMDGPU::S_CMP_LT_F16;
1414     case CmpInst::FCMP_OLE:
1415       return AMDGPU::S_CMP_LE_F16;
1416     case CmpInst::FCMP_ONE:
1417       return AMDGPU::S_CMP_LG_F16;
1418     case CmpInst::FCMP_ORD:
1419       return AMDGPU::S_CMP_O_F16;
1420     case CmpInst::FCMP_UNO:
1421       return AMDGPU::S_CMP_U_F16;
1422     case CmpInst::FCMP_UEQ:
1423       return AMDGPU::S_CMP_NLG_F16;
1424     case CmpInst::FCMP_UGT:
1425       return AMDGPU::S_CMP_NLE_F16;
1426     case CmpInst::FCMP_UGE:
1427       return AMDGPU::S_CMP_NLT_F16;
1428     case CmpInst::FCMP_ULT:
1429       return AMDGPU::S_CMP_NGE_F16;
1430     case CmpInst::FCMP_ULE:
1431       return AMDGPU::S_CMP_NGT_F16;
1432     case CmpInst::FCMP_UNE:
1433       return AMDGPU::S_CMP_NEQ_F16;
1434     default:
1435       llvm_unreachable("Unknown condition code!");
1436     }
1437   }
1438 
1439   return -1;
1440 }
1441 
1442 bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1443 
1444   MachineBasicBlock *BB = I.getParent();
1445   const DebugLoc &DL = I.getDebugLoc();
1446 
1447   Register SrcReg = I.getOperand(2).getReg();
1448   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1449 
1450   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1451 
1452   Register CCReg = I.getOperand(0).getReg();
1453   if (!isVCC(CCReg, *MRI)) {
1454     int Opcode = getS_CMPOpcode(Pred, Size);
1455     if (Opcode == -1)
1456       return false;
1457     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1458             .add(I.getOperand(2))
1459             .add(I.getOperand(3));
1460     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1461       .addReg(AMDGPU::SCC);
1462     bool Ret =
1463         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1464         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1465     I.eraseFromParent();
1466     return Ret;
1467   }
1468 
1469   if (I.getOpcode() == AMDGPU::G_FCMP)
1470     return false;
1471 
1472   int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1473   if (Opcode == -1)
1474     return false;
1475 
1476   MachineInstrBuilder ICmp;
1477   // t16 instructions take source modifiers and a trailing op_sel operand.
1478   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1479     ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1480                .addImm(0)
1481                .add(I.getOperand(2))
1482                .addImm(0)
1483                .add(I.getOperand(3))
1484                .addImm(0); // op_sel
1485   } else {
1486     ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1487                .add(I.getOperand(2))
1488                .add(I.getOperand(3));
1489   }
1490 
1491   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1492                                *TRI.getBoolRC(), *MRI);
1493   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1494   I.eraseFromParent();
1495   return Ret;
1496 }
1497 
1498 bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1499   Register Dst = I.getOperand(0).getReg();
1500   if (isVCC(Dst, *MRI))
1501     return false;
1502 
1503   LLT DstTy = MRI->getType(Dst);
1504   if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1505     return false;
1506 
1507   MachineBasicBlock *BB = I.getParent();
1508   const DebugLoc &DL = I.getDebugLoc();
1509   Register SrcReg = I.getOperand(2).getReg();
1510   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1511 
1512   // i1 inputs are not supported in GlobalISel.
1513   if (Size == 1)
1514     return false;
1515 
1516   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1517   if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1518     BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1519     I.eraseFromParent();
1520     return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1521   }
1522 
1523   const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1524   if (Opcode == -1)
1525     return false;
1526 
1527   MachineInstrBuilder SelectedMI;
1528   MachineOperand &LHS = I.getOperand(2);
1529   MachineOperand &RHS = I.getOperand(3);
1530   auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1531   auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1532   Register Src0Reg =
1533       copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1534   Register Src1Reg =
1535       copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1536   SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1537   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1538     SelectedMI.addImm(Src0Mods);
1539   SelectedMI.addReg(Src0Reg);
1540   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1541     SelectedMI.addImm(Src1Mods);
1542   SelectedMI.addReg(Src1Reg);
1543   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1544     SelectedMI.addImm(0); // clamp
1545   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1546     SelectedMI.addImm(0); // op_sel
1547 
1548   RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1549   if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1550     return false;
1551 
1552   I.eraseFromParent();
1553   return true;
1554 }
1555 
1556 // Ballot has to zero the bits in the input lane-mask that are zero in the
1557 // current exec; this is done as an AND with exec. For inputs produced by
1558 // instructions that implicitly use the same exec, for example compares in the
1559 // same basic block or an SCC-to-VCC copy, a plain copy suffices.
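//
// Illustrative MIR (hypothetical registers, wave64 assumed):
//   %cmp:vcc(s1) = G_ICMP intpred(eq), %a, %b
//   %res:sgpr(s64) = G_INTRINSIC llvm.amdgcn.ballot, %cmp
// With the compare in the same block, %cmp is already exec-masked, so the
// ballot selects to COPY %cmp; otherwise it selects to S_AND_B64 with exec.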
1560 static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1561                                     MachineBasicBlock *MBB) {
1562   MachineInstr *MI = MRI.getVRegDef(Reg);
1563   if (MI->getParent() != MBB)
1564     return false;
1565 
1566   // Lane mask generated by SCC to VCC copy.
1567   if (MI->getOpcode() == AMDGPU::COPY) {
1568     auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1569     auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1570     if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1571         SrcRB->getID() == AMDGPU::SGPRRegBankID)
1572       return true;
1573   }
1574 
1575   // Lane mask generated using compare with same exec.
1576   if (isa<GAnyCmp>(MI))
1577     return true;
1578 
1579   Register LHS, RHS;
1580   // Look through AND.
1581   if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1582     return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1583            isLaneMaskFromSameBlock(RHS, MRI, MBB);
1584 
1585   return false;
1586 }
1587 
1588 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1589   MachineBasicBlock *BB = I.getParent();
1590   const DebugLoc &DL = I.getDebugLoc();
1591   Register DstReg = I.getOperand(0).getReg();
1592   Register SrcReg = I.getOperand(2).getReg();
1593   const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1594   const unsigned WaveSize = STI.getWavefrontSize();
1595 
1596   // In the common case, the return type matches the wave size.
1597   // However we also support emitting i64 ballots in wave32 mode.
1598   if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1599     return false;
1600 
1601   std::optional<ValueAndVReg> Arg =
1602       getIConstantVRegValWithLookThrough(SrcReg, *MRI);
1603 
1604   Register Dst = DstReg;
1605   // i64 ballot on Wave32: compute the wave-sized ballot into a fresh i32 Dst.
1606   if (BallotSize != WaveSize) {
1607     Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1608   }
1609 
1610   if (Arg) {
1611     const int64_t Value = Arg->Value.getZExtValue();
1612     if (Value == 0) {
1613       // Dst = S_MOV 0
1614       unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1615       BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1616     } else {
1617       // Dst = COPY EXEC
1618       assert(Value == 1);
1619       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1620     }
1621     if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1622       return false;
1623   } else {
1624     if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1625       // Dst = COPY SrcReg
1626       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1627       if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1628         return false;
1629     } else {
1630       // Dst = S_AND SrcReg, EXEC
1631       unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1632       auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1633                      .addReg(SrcReg)
1634                      .addReg(TRI.getExec())
1635                      .setOperandDead(3); // Dead scc
1636       if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1637         return false;
1638     }
1639   }
1640 
1641   // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1642   if (BallotSize != WaveSize) {
1643     Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1644     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1645     BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1646         .addReg(Dst)
1647         .addImm(AMDGPU::sub0)
1648         .addReg(HiReg)
1649         .addImm(AMDGPU::sub1);
1650   }
1651 
1652   I.eraseFromParent();
1653   return true;
1654 }
1655 
1656 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1657   Register DstReg = I.getOperand(0).getReg();
1658   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1659   const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1660   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1661     return false;
1662 
1663   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1664 
1665   Module *M = MF->getFunction().getParent();
1666   const MDNode *Metadata = I.getOperand(2).getMetadata();
1667   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1668   auto *RelocSymbol = cast<GlobalVariable>(
1669       M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1670 
1671   MachineBasicBlock *BB = I.getParent();
1672   BuildMI(*BB, &I, I.getDebugLoc(),
1673           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1674     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1675 
1676   I.eraseFromParent();
1677   return true;
1678 }
1679 
1680 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1681   Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1682 
1683   Register DstReg = I.getOperand(0).getReg();
1684   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1685   unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1686     AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1687 
1688   MachineBasicBlock *MBB = I.getParent();
1689   const DebugLoc &DL = I.getDebugLoc();
1690 
1691   auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1692 
1693   if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1694     const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1695     MIB.addImm(MFI->getLDSSize());
1696   } else {
1697     Module *M = MF->getFunction().getParent();
1698     const GlobalValue *GV =
1699         Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1700     MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1701   }
1702 
1703   I.eraseFromParent();
1704   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1705 }
1706 
1707 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1708   MachineBasicBlock *MBB = I.getParent();
1709   MachineFunction &MF = *MBB->getParent();
1710   const DebugLoc &DL = I.getDebugLoc();
1711 
1712   MachineOperand &Dst = I.getOperand(0);
1713   Register DstReg = Dst.getReg();
1714   unsigned Depth = I.getOperand(2).getImm();
1715 
1716   const TargetRegisterClass *RC
1717     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1718   if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1719       !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1720     return false;
1721 
1722   // Check for kernel and shader functions
1723   if (Depth != 0 ||
1724       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1725     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1726       .addImm(0);
1727     I.eraseFromParent();
1728     return true;
1729   }
1730 
1731   MachineFrameInfo &MFI = MF.getFrameInfo();
1732   // There is a call to @llvm.returnaddress in this function
1733   MFI.setReturnAddressIsTaken(true);
1734 
1735   // Get the return address reg and mark it as an implicit live-in
1736   Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1737   Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1738                                              AMDGPU::SReg_64RegClass, DL);
1739   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1740     .addReg(LiveIn);
1741   I.eraseFromParent();
1742   return true;
1743 }
1744 
1745 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1746   // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1747   // SelectionDAG uses for wave32 vs wave64.
1748   MachineBasicBlock *BB = MI.getParent();
1749   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1750       .add(MI.getOperand(1));
1751 
1752   Register Reg = MI.getOperand(1).getReg();
1753   MI.eraseFromParent();
1754 
1755   if (!MRI->getRegClassOrNull(Reg))
1756     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1757   return true;
1758 }
1759 
1760 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1761   MachineInstr &MI, Intrinsic::ID IntrID) const {
1762   MachineBasicBlock *MBB = MI.getParent();
1763   MachineFunction *MF = MBB->getParent();
1764   const DebugLoc &DL = MI.getDebugLoc();
1765 
1766   unsigned IndexOperand = MI.getOperand(7).getImm();
1767   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1768   bool WaveDone = MI.getOperand(9).getImm() != 0;
1769 
1770   if (WaveDone && !WaveRelease) {
1771     // TODO: Move this to IR verifier
1772     const Function &Fn = MF->getFunction();
1773     Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1774         Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1775   }
1776 
1777   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1778   IndexOperand &= ~0x3f;
1779   unsigned CountDw = 0;
1780 
1781   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1782     CountDw = (IndexOperand >> 24) & 0xf;
1783     IndexOperand &= ~(0xf << 24);
1784 
1785     if (CountDw < 1 || CountDw > 4) {
1786       const Function &Fn = MF->getFunction();
1787       Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1788           Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1789       CountDw = 1;
1790     }
1791   }
1792 
1793   if (IndexOperand) {
1794     const Function &Fn = MF->getFunction();
1795     Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1796         Fn, "ds_ordered_count: bad index operand", DL));
1797   }
1798 
1799   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1800   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1801 
1802   unsigned Offset0 = OrderedCountIndex << 2;
1803   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1804 
1805   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1806     Offset1 |= (CountDw - 1) << 6;
1807 
1808   if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1809     Offset1 |= ShaderType << 2;
1810 
1811   unsigned Offset = Offset0 | (Offset1 << 8);
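  // Worked example with hypothetical operands: ds_ordered_add with index 1,
  // wave_release=1, wave_done=0 on a pre-GFX10 target gives
  // Offset0 = 1 << 2 = 4 and Offset1 = 1 | (ShaderType << 2), so the final
  // immediate is 4 | ((1 | (ShaderType << 2)) << 8).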
1812 
1813   Register M0Val = MI.getOperand(2).getReg();
1814   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1815     .addReg(M0Val);
1816 
1817   Register DstReg = MI.getOperand(0).getReg();
1818   Register ValReg = MI.getOperand(3).getReg();
1819   MachineInstrBuilder DS =
1820     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1821       .addReg(ValReg)
1822       .addImm(Offset)
1823       .cloneMemRefs(MI);
1824 
1825   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1826     return false;
1827 
1828   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1829   MI.eraseFromParent();
1830   return Ret;
1831 }
1832 
1833 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1834   switch (IntrID) {
1835   case Intrinsic::amdgcn_ds_gws_init:
1836     return AMDGPU::DS_GWS_INIT;
1837   case Intrinsic::amdgcn_ds_gws_barrier:
1838     return AMDGPU::DS_GWS_BARRIER;
1839   case Intrinsic::amdgcn_ds_gws_sema_v:
1840     return AMDGPU::DS_GWS_SEMA_V;
1841   case Intrinsic::amdgcn_ds_gws_sema_br:
1842     return AMDGPU::DS_GWS_SEMA_BR;
1843   case Intrinsic::amdgcn_ds_gws_sema_p:
1844     return AMDGPU::DS_GWS_SEMA_P;
1845   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1846     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1847   default:
1848     llvm_unreachable("not a gws intrinsic");
1849   }
1850 }
1851 
1852 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1853                                                      Intrinsic::ID IID) const {
1854   if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1855                         !STI.hasGWSSemaReleaseAll()))
1856     return false;
1857 
1858   // intrinsic ID, vsrc, offset
1859   const bool HasVSrc = MI.getNumOperands() == 3;
1860   assert(HasVSrc || MI.getNumOperands() == 2);
1861 
1862   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1863   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1864   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1865     return false;
1866 
1867   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1868   unsigned ImmOffset;
1869 
1870   MachineBasicBlock *MBB = MI.getParent();
1871   const DebugLoc &DL = MI.getDebugLoc();
1872 
1873   MachineInstr *Readfirstlane = nullptr;
1874 
1875   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1876   // incoming offset, in case there's an add of a constant. We'll have to put it
1877   // back later.
1878   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1879     Readfirstlane = OffsetDef;
1880     BaseOffset = OffsetDef->getOperand(1).getReg();
1881     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1882   }
1883 
1884   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1885     // If we have a constant offset, try to use the 0 in m0 as the base.
1886     // TODO: Look into changing the default m0 initialization value. If the
1887     // default -1 only sets the low 16-bits, we could leave it as-is and add 1 to
1888     // the immediate offset.
1889 
1890     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1891     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1892       .addImm(0);
1893   } else {
1894     std::tie(BaseOffset, ImmOffset) =
1895         AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1896 
1897     if (Readfirstlane) {
1898       // We have the constant offset now, so put the readfirstlane back on the
1899       // variable component.
1900       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1901         return false;
1902 
1903       Readfirstlane->getOperand(1).setReg(BaseOffset);
1904       BaseOffset = Readfirstlane->getOperand(0).getReg();
1905     } else {
1906       if (!RBI.constrainGenericRegister(BaseOffset,
1907                                         AMDGPU::SReg_32RegClass, *MRI))
1908         return false;
1909     }
1910 
1911     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1912     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1913       .addReg(BaseOffset)
1914       .addImm(16)
1915       .setOperandDead(3); // Dead scc
1916 
1917     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1918       .addReg(M0Base);
1919   }
1920 
1921   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1922   // offset field) % 64. Some versions of the programming guide omit the m0
1923   // part, or claim it's from offset 0.
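  // Note this is also why the variable component was shifted left by 16 before
  // being copied into m0 above: it must land in M0[21:16], while the constant
  // component goes in the instruction's offset field.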
1924   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1925 
1926   if (HasVSrc) {
1927     Register VSrc = MI.getOperand(1).getReg();
1928     MIB.addReg(VSrc);
1929 
1930     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1931       return false;
1932   }
1933 
1934   MIB.addImm(ImmOffset)
1935      .cloneMemRefs(MI);
1936 
1937   TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1938 
1939   MI.eraseFromParent();
1940   return true;
1941 }
1942 
1943 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1944                                                       bool IsAppend) const {
1945   Register PtrBase = MI.getOperand(2).getReg();
1946   LLT PtrTy = MRI->getType(PtrBase);
1947   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1948 
1949   unsigned Offset;
1950   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1951 
1952   // TODO: Should this try to look through readfirstlane like GWS?
1953   if (!isDSOffsetLegal(PtrBase, Offset)) {
1954     PtrBase = MI.getOperand(2).getReg();
1955     Offset = 0;
1956   }
1957 
1958   MachineBasicBlock *MBB = MI.getParent();
1959   const DebugLoc &DL = MI.getDebugLoc();
1960   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1961 
1962   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1963     .addReg(PtrBase);
1964   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1965     return false;
1966 
1967   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1968     .addImm(Offset)
1969     .addImm(IsGDS ? -1 : 0)
1970     .cloneMemRefs(MI);
1971   MI.eraseFromParent();
1972   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1973 }
1974 
1975 bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
1976   MachineFunction *MF = MI.getParent()->getParent();
1977   SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1978 
1979   MFInfo->setInitWholeWave();
1980   return selectImpl(MI, *CoverageInfo);
1981 }
1982 
1983 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1984   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1985   if (TM.getOptLevel() > CodeGenOptLevel::None) {
1986     unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1987     if (WGSize <= STI.getWavefrontSize()) {
1988       // If the workgroup fits in a wave, remove s_barrier_signal and lower
1989       // s_barrier/s_barrier_wait to wave_barrier.
1990       if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
1991           IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
1992         MachineBasicBlock *MBB = MI.getParent();
1993         const DebugLoc &DL = MI.getDebugLoc();
1994         BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1995       }
1996       MI.eraseFromParent();
1997       return true;
1998     }
1999   }
2000 
2001   if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
2002     // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
2003     MachineBasicBlock *MBB = MI.getParent();
2004     const DebugLoc &DL = MI.getDebugLoc();
2005     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
2006         .addImm(AMDGPU::Barrier::WORKGROUP);
2007     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
2008         .addImm(AMDGPU::Barrier::WORKGROUP);
2009     MI.eraseFromParent();
2010     return true;
2011   }
2012 
2013   return selectImpl(MI, *CoverageInfo);
2014 }
2015 
2016 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2017                          bool &IsTexFail) {
2018   if (TexFailCtrl)
2019     IsTexFail = true;
2020 
2021   TFE = TexFailCtrl & 0x1;
2022   TexFailCtrl &= ~(uint64_t)0x1;
2023   LWE = TexFailCtrl & 0x2;
2024   TexFailCtrl &= ~(uint64_t)0x2;
2025 
2026   return TexFailCtrl == 0;
2027 }
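// For example, TexFailCtrl = 3 sets both TFE and LWE and parses cleanly,
// while TexFailCtrl = 4 leaves an unknown bit set and fails the parse.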
2028 
2029 bool AMDGPUInstructionSelector::selectImageIntrinsic(
2030   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2031   MachineBasicBlock *MBB = MI.getParent();
2032   const DebugLoc &DL = MI.getDebugLoc();
2033 
2034   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2035     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
2036 
2037   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2038   unsigned IntrOpcode = Intr->BaseOpcode;
2039   const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2040   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2041   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2042 
2043   const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2044 
2045   Register VDataIn, VDataOut;
2046   LLT VDataTy;
2047   int NumVDataDwords = -1;
2048   bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2049                MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2050 
2051   bool Unorm;
2052   if (!BaseOpcode->Sampler)
2053     Unorm = true;
2054   else
2055     Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2056 
2057   bool TFE;
2058   bool LWE;
2059   bool IsTexFail = false;
2060   if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2061                     TFE, LWE, IsTexFail))
2062     return false;
2063 
2064   const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2065   const bool IsA16 = (Flags & 1) != 0;
2066   const bool IsG16 = (Flags & 2) != 0;
2067 
2068   // A16 implies 16-bit gradients if the subtarget doesn't support G16.
2069   if (IsA16 && !STI.hasG16() && !IsG16)
2070     return false;
2071 
2072   unsigned DMask = 0;
2073   unsigned DMaskLanes = 0;
2074 
2075   if (BaseOpcode->Atomic) {
2076     VDataOut = MI.getOperand(0).getReg();
2077     VDataIn = MI.getOperand(2).getReg();
2078     LLT Ty = MRI->getType(VDataIn);
2079 
2080     // Be careful to allow atomic swap on 16-bit element vectors.
2081     const bool Is64Bit = BaseOpcode->AtomicX2 ?
2082       Ty.getSizeInBits() == 128 :
2083       Ty.getSizeInBits() == 64;
2084 
2085     if (BaseOpcode->AtomicX2) {
2086       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2087 
2088       DMask = Is64Bit ? 0xf : 0x3;
2089       NumVDataDwords = Is64Bit ? 4 : 2;
2090     } else {
2091       DMask = Is64Bit ? 0x3 : 0x1;
2092       NumVDataDwords = Is64Bit ? 2 : 1;
2093     }
2094   } else {
2095     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2096     DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2097 
2098     if (BaseOpcode->Store) {
2099       VDataIn = MI.getOperand(1).getReg();
2100       VDataTy = MRI->getType(VDataIn);
2101       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2102     } else if (BaseOpcode->NoReturn) {
2103       NumVDataDwords = 0;
2104     } else {
2105       VDataOut = MI.getOperand(0).getReg();
2106       VDataTy = MRI->getType(VDataOut);
2107       NumVDataDwords = DMaskLanes;
2108 
2109       if (IsD16 && !STI.hasUnpackedD16VMem())
2110         NumVDataDwords = (DMaskLanes + 1) / 2;
2111     }
2112   }
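  // Example: a non-atomic d16 load with DMask = 0b0111 has DMaskLanes = 3;
  // with packed D16 memory the three halves fit in
  // NumVDataDwords = (3 + 1) / 2 = 2.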
2113 
2114   // Set G16 opcode
2115   if (Subtarget->hasG16() && IsG16) {
2116     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2117         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2118     assert(G16MappingInfo);
2119     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2120   }
2121 
2122   // TODO: Check this in verifier.
2123   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2124 
2125   unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2126   if (BaseOpcode->Atomic)
2127     CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
2128   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2129                AMDGPU::CPol::VOLATILE))
2130     return false;
2131 
2132   int NumVAddrRegs = 0;
2133   int NumVAddrDwords = 0;
2134   for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2135     // Skip the $noregs and 0s inserted during legalization.
2136     MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2137     if (!AddrOp.isReg())
2138       continue; // XXX - Break?
2139 
2140     Register Addr = AddrOp.getReg();
2141     if (!Addr)
2142       break;
2143 
2144     ++NumVAddrRegs;
2145     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2146   }
2147 
2148   // The legalizer preprocessed the intrinsic arguments. If we aren't using
2149   // NSA, these should have been packed into a single value in the first
2150   // address register.
2151   const bool UseNSA =
2152       NumVAddrRegs != 1 &&
2153       (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2154                                    : NumVAddrDwords == NumVAddrRegs);
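  // E.g. three 32-bit address registers give NumVAddrRegs == NumVAddrDwords ==
  // 3 and select the NSA form, while a single packed 96-bit register gives
  // NumVAddrRegs == 1 and uses the default encoding.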
2155   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2156     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2157     return false;
2158   }
2159 
2160   if (IsTexFail)
2161     ++NumVDataDwords;
2162 
2163   int Opcode = -1;
2164   if (IsGFX12Plus) {
2165     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2166                                    NumVDataDwords, NumVAddrDwords);
2167   } else if (IsGFX11Plus) {
2168     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2169                                    UseNSA ? AMDGPU::MIMGEncGfx11NSA
2170                                           : AMDGPU::MIMGEncGfx11Default,
2171                                    NumVDataDwords, NumVAddrDwords);
2172   } else if (IsGFX10Plus) {
2173     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2174                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
2175                                           : AMDGPU::MIMGEncGfx10Default,
2176                                    NumVDataDwords, NumVAddrDwords);
2177   } else {
2178     if (Subtarget->hasGFX90AInsts()) {
2179       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2180                                      NumVDataDwords, NumVAddrDwords);
2181       if (Opcode == -1) {
2182         LLVM_DEBUG(
2183             dbgs()
2184             << "requested image instruction is not supported on this GPU\n");
2185         return false;
2186       }
2187     }
2188     if (Opcode == -1 &&
2189         STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2190       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2191                                      NumVDataDwords, NumVAddrDwords);
2192     if (Opcode == -1)
2193       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2194                                      NumVDataDwords, NumVAddrDwords);
2195   }
2196   if (Opcode == -1)
2197     return false;
2198 
2199   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2200     .cloneMemRefs(MI);
2201 
2202   if (VDataOut) {
2203     if (BaseOpcode->AtomicX2) {
2204       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2205 
2206       Register TmpReg = MRI->createVirtualRegister(
2207         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2208       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2209 
2210       MIB.addDef(TmpReg);
2211       if (!MRI->use_empty(VDataOut)) {
2212         BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2213             .addReg(TmpReg, RegState::Kill, SubReg);
2214       }
2215 
2216     } else {
2217       MIB.addDef(VDataOut); // vdata output
2218     }
2219   }
2220 
2221   if (VDataIn)
2222     MIB.addReg(VDataIn); // vdata input
2223 
2224   for (int I = 0; I != NumVAddrRegs; ++I) {
2225     MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2226     if (SrcOp.isReg()) {
2227       assert(SrcOp.getReg() != 0);
2228       MIB.addReg(SrcOp.getReg());
2229     }
2230   }
2231 
2232   MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2233   if (BaseOpcode->Sampler)
2234     MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2235 
2236   MIB.addImm(DMask); // dmask
2237 
2238   if (IsGFX10Plus)
2239     MIB.addImm(DimInfo->Encoding);
2240   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2241     MIB.addImm(Unorm);
2242 
2243   MIB.addImm(CPol);
2244   MIB.addImm(IsA16 &&  // a16 or r128
2245              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2246   if (IsGFX10Plus)
2247     MIB.addImm(IsA16 ? -1 : 0);
2248 
2249   if (!Subtarget->hasGFX90AInsts()) {
2250     MIB.addImm(TFE); // tfe
2251   } else if (TFE) {
2252     LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2253     return false;
2254   }
2255 
2256   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2257     MIB.addImm(LWE); // lwe
2258   if (!IsGFX10Plus)
2259     MIB.addImm(DimInfo->DA ? -1 : 0);
2260   if (BaseOpcode->HasD16)
2261     MIB.addImm(IsD16 ? -1 : 0);
2262 
2263   MI.eraseFromParent();
2264   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2265   TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2266   return true;
2267 }
2268 
2269 // We need to handle this here because tablegen doesn't support matching
2270 // instructions with multiple outputs.
2271 bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2272     MachineInstr &MI) const {
2273   Register Dst0 = MI.getOperand(0).getReg();
2274   Register Dst1 = MI.getOperand(1).getReg();
2275 
2276   const DebugLoc &DL = MI.getDebugLoc();
2277   MachineBasicBlock *MBB = MI.getParent();
2278 
2279   Register Addr = MI.getOperand(3).getReg();
2280   Register Data0 = MI.getOperand(4).getReg();
2281   Register Data1 = MI.getOperand(5).getReg();
2282   unsigned Offset = MI.getOperand(6).getImm();
2283 
2284   unsigned Opc;
2285   switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2286   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2287   case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2288     Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2289     break;
2290   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2291     Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2292     break;
2293   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2294     Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2295     break;
2296   }
2297 
2298   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2299                  .addDef(Dst1)
2300                  .addUse(Addr)
2301                  .addUse(Data0)
2302                  .addUse(Data1)
2303                  .addImm(Offset)
2304                  .cloneMemRefs(MI);
2305 
2306   MI.eraseFromParent();
2307   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2308 }
2309 
2310 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2311     MachineInstr &I) const {
2312   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2313   switch (IntrinsicID) {
2314   case Intrinsic::amdgcn_end_cf:
2315     return selectEndCfIntrinsic(I);
2316   case Intrinsic::amdgcn_ds_ordered_add:
2317   case Intrinsic::amdgcn_ds_ordered_swap:
2318     return selectDSOrderedIntrinsic(I, IntrinsicID);
2319   case Intrinsic::amdgcn_ds_gws_init:
2320   case Intrinsic::amdgcn_ds_gws_barrier:
2321   case Intrinsic::amdgcn_ds_gws_sema_v:
2322   case Intrinsic::amdgcn_ds_gws_sema_br:
2323   case Intrinsic::amdgcn_ds_gws_sema_p:
2324   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2325     return selectDSGWSIntrinsic(I, IntrinsicID);
2326   case Intrinsic::amdgcn_ds_append:
2327     return selectDSAppendConsume(I, true);
2328   case Intrinsic::amdgcn_ds_consume:
2329     return selectDSAppendConsume(I, false);
2330   case Intrinsic::amdgcn_init_whole_wave:
2331     return selectInitWholeWave(I);
2332   case Intrinsic::amdgcn_s_barrier:
2333   case Intrinsic::amdgcn_s_barrier_signal:
2334   case Intrinsic::amdgcn_s_barrier_wait:
2335     return selectSBarrier(I);
2336   case Intrinsic::amdgcn_raw_buffer_load_lds:
2337   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2338   case Intrinsic::amdgcn_struct_buffer_load_lds:
2339   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2340     return selectBufferLoadLds(I);
2341   // Until we can store both the address space of the global and the LDS
2342   // arguments by having two MachineMemOperands on an intrinsic, we just trust
2343   // that the argument is a global pointer (buffer pointers have been handled by
2344   // an LLVM IR-level lowering).
2345   case Intrinsic::amdgcn_load_to_lds:
2346   case Intrinsic::amdgcn_global_load_lds:
2347     return selectGlobalLoadLds(I);
2348   case Intrinsic::amdgcn_exp_compr:
2349     if (!STI.hasCompressedExport()) {
2350       Function &F = I.getMF()->getFunction();
2351       F.getContext().diagnose(
2352           DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2353                                     I.getDebugLoc(), DS_Error));
2354       return false;
2355     }
2356     break;
2357   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2358   case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2359   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2360   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2361     return selectDSBvhStackIntrinsic(I);
2362   case Intrinsic::amdgcn_s_barrier_signal_var:
2363     return selectNamedBarrierInit(I, IntrinsicID);
2364   case Intrinsic::amdgcn_s_get_named_barrier_state:
2365     return selectNamedBarrierInst(I, IntrinsicID);
2366   case Intrinsic::amdgcn_s_get_barrier_state:
2367     return selectSGetBarrierState(I, IntrinsicID);
2368   case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2369     return selectSBarrierSignalIsfirst(I, IntrinsicID);
2370   }
2371   return selectImpl(I, *CoverageInfo);
2372 }
2373 
2374 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2375   if (selectImpl(I, *CoverageInfo))
2376     return true;
2377 
2378   MachineBasicBlock *BB = I.getParent();
2379   const DebugLoc &DL = I.getDebugLoc();
2380 
2381   Register DstReg = I.getOperand(0).getReg();
2382   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2383   assert(Size <= 32 || Size == 64);
2384   const MachineOperand &CCOp = I.getOperand(1);
2385   Register CCReg = CCOp.getReg();
2386   if (!isVCC(CCReg, *MRI)) {
2387     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2388                                          AMDGPU::S_CSELECT_B32;
2389     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2390             .addReg(CCReg);
2391 
2392     // The generic constrainSelectedInstRegOperands doesn't work for the scc
2393     // register bank, because it does not cover the register class we use to
2394     // represent it. So we need to set the register class manually here.
2395     if (!MRI->getRegClassOrNull(CCReg))
2396         MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2397     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2398             .add(I.getOperand(2))
2399             .add(I.getOperand(3));
2400 
2401     bool Ret = false;
2402     Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2403     Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2404     I.eraseFromParent();
2405     return Ret;
2406   }
2407 
2408   // Wide VGPR select should have been split in RegBankSelect.
2409   if (Size > 32)
2410     return false;
2411 
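  // V_CNDMASK_B32 computes vcc ? src1 : src0, so the G_SELECT false value
  // (operand 3) goes in src0 and the true value (operand 2) in src1.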
2412   MachineInstr *Select =
2413       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2414               .addImm(0)
2415               .add(I.getOperand(3))
2416               .addImm(0)
2417               .add(I.getOperand(2))
2418               .add(I.getOperand(1));
2419 
2420   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2421   I.eraseFromParent();
2422   return Ret;
2423 }
2424 
2425 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2426   Register DstReg = I.getOperand(0).getReg();
2427   Register SrcReg = I.getOperand(1).getReg();
2428   const LLT DstTy = MRI->getType(DstReg);
2429   const LLT SrcTy = MRI->getType(SrcReg);
2430   const LLT S1 = LLT::scalar(1);
2431 
2432   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2433   const RegisterBank *DstRB;
2434   if (DstTy == S1) {
2435     // This is a special case. We don't treat s1 for legalization artifacts as
2436     // vcc booleans.
2437     DstRB = SrcRB;
2438   } else {
2439     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2440     if (SrcRB != DstRB)
2441       return false;
2442   }
2443 
2444   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2445 
2446   unsigned DstSize = DstTy.getSizeInBits();
2447   unsigned SrcSize = SrcTy.getSizeInBits();
2448 
2449   const TargetRegisterClass *SrcRC =
2450       TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2451   const TargetRegisterClass *DstRC =
2452       TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2453   if (!SrcRC || !DstRC)
2454     return false;
2455 
2456   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2457       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2458     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2459     return false;
2460   }
2461 
2462   if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2463     assert(STI.useRealTrue16Insts());
2464     const DebugLoc &DL = I.getDebugLoc();
2465     MachineBasicBlock *MBB = I.getParent();
2466     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2467         .addReg(SrcReg, 0, AMDGPU::lo16);
2468     I.eraseFromParent();
2469     return true;
2470   }
2471 
2472   if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2473     MachineBasicBlock *MBB = I.getParent();
2474     const DebugLoc &DL = I.getDebugLoc();
2475 
2476     Register LoReg = MRI->createVirtualRegister(DstRC);
2477     Register HiReg = MRI->createVirtualRegister(DstRC);
2478     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2479       .addReg(SrcReg, 0, AMDGPU::sub0);
2480     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2481       .addReg(SrcReg, 0, AMDGPU::sub1);
2482 
2483     if (IsVALU && STI.hasSDWA()) {
2484       // Write the low 16-bits of the high element into the high 16-bits of the
2485       // low element.
2486       MachineInstr *MovSDWA =
2487         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2488         .addImm(0)                             // $src0_modifiers
2489         .addReg(HiReg)                         // $src0
2490         .addImm(0)                             // $clamp
2491         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2492         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2493         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2494         .addReg(LoReg, RegState::Implicit);
2495       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
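      // Tying the implicit LoReg use to the def models UNUSED_PRESERVE: the
      // untouched low 16 bits of DstReg are read from LoReg.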
2496     } else {
2497       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2498       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2499       Register ImmReg = MRI->createVirtualRegister(DstRC);
2500       if (IsVALU) {
2501         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2502           .addImm(16)
2503           .addReg(HiReg);
2504       } else {
2505         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2506           .addReg(HiReg)
2507           .addImm(16)
2508           .setOperandDead(3); // Dead scc
2509       }
2510 
2511       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2512       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2513       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2514 
2515       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2516         .addImm(0xffff);
2517       auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2518         .addReg(LoReg)
2519         .addReg(ImmReg);
2520       auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2521         .addReg(TmpReg0)
2522         .addReg(TmpReg1);
2523 
2524       if (!IsVALU) {
2525         And.setOperandDead(3); // Dead scc
2526         Or.setOperandDead(3); // Dead scc
2527       }
2528     }
2529 
2530     I.eraseFromParent();
2531     return true;
2532   }
2533 
2534   if (!DstTy.isScalar())
2535     return false;
2536 
2537   if (SrcSize > 32) {
2538     unsigned SubRegIdx = DstSize < 32
2539                              ? static_cast<unsigned>(AMDGPU::sub0)
2540                              : TRI.getSubRegFromChannel(0, DstSize / 32);
2541     if (SubRegIdx == AMDGPU::NoSubRegister)
2542       return false;
2543 
2544     // Deal with weird cases where the class only partially supports the subreg
2545     // index.
2546     const TargetRegisterClass *SrcWithSubRC
2547       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2548     if (!SrcWithSubRC)
2549       return false;
2550 
2551     if (SrcWithSubRC != SrcRC) {
2552       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2553         return false;
2554     }
2555 
2556     I.getOperand(1).setSubReg(SubRegIdx);
2557   }
2558 
2559   I.setDesc(TII.get(TargetOpcode::COPY));
2560   return true;
2561 }
2562 
2563 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
2564 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2565   Mask = maskTrailingOnes<unsigned>(Size);
2566   int SignedMask = static_cast<int>(Mask);
2567   return SignedMask >= -16 && SignedMask <= 64;
2568 }
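// For example, Size = 4 yields Mask = 0xf (15, an inline immediate), so the
// AND is worthwhile; Size = 16 yields 0xffff (65535), which would need a
// literal constant, so the BFE form is used instead.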
2569 
2570 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2571 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2572   Register Reg, const MachineRegisterInfo &MRI,
2573   const TargetRegisterInfo &TRI) const {
2574   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2575   if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2576     return RB;
2577 
2578   // Ignore the type, since we don't use vcc in artifacts.
2579   if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2580     return &RBI.getRegBankFromRegClass(*RC, LLT());
2581   return nullptr;
2582 }
2583 
2584 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2585   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2586   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2587   const DebugLoc &DL = I.getDebugLoc();
2588   MachineBasicBlock &MBB = *I.getParent();
2589   const Register DstReg = I.getOperand(0).getReg();
2590   const Register SrcReg = I.getOperand(1).getReg();
2591 
2592   const LLT DstTy = MRI->getType(DstReg);
2593   const LLT SrcTy = MRI->getType(SrcReg);
2594   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2595     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2596   const unsigned DstSize = DstTy.getSizeInBits();
2597   if (!DstTy.isScalar())
2598     return false;
2599 
2600   // Artifact casts should never use vcc.
2601   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2602 
2603   // FIXME: This should probably be illegal and split earlier.
2604   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2605     if (DstSize <= 32)
2606       return selectCOPY(I);
2607 
2608     const TargetRegisterClass *SrcRC =
2609         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2610     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2611     const TargetRegisterClass *DstRC =
2612         TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2613 
2614     Register UndefReg = MRI->createVirtualRegister(SrcRC);
2615     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2616     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2617       .addReg(SrcReg)
2618       .addImm(AMDGPU::sub0)
2619       .addReg(UndefReg)
2620       .addImm(AMDGPU::sub1);
2621     I.eraseFromParent();
2622 
2623     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2624            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2625   }
2626 
2627   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2628     // 64-bit should have been split up in RegBankSelect
2629 
2630     // Try to use an and with a mask if it will save code size.
2631     unsigned Mask;
2632     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2633       MachineInstr *ExtI =
2634       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2635         .addImm(Mask)
2636         .addReg(SrcReg);
2637       I.eraseFromParent();
2638       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2639     }
2640 
2641     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2642     MachineInstr *ExtI =
2643       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2644       .addReg(SrcReg)
2645       .addImm(0) // Offset
2646       .addImm(SrcSize); // Width
2647     I.eraseFromParent();
2648     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2649   }
2650 
2651   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2652     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2653       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2654     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2655       return false;
2656 
2657     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2658       const unsigned SextOpc = SrcSize == 8 ?
2659         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2660       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2661         .addReg(SrcReg);
2662       I.eraseFromParent();
2663       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2664     }
2665 
2666     // Using a single 32-bit SALU to calculate the high half is smaller than
2667     // S_BFE with a literal constant operand.
2668     if (DstSize > 32 && SrcSize == 32) {
2669       Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2670       unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2671       if (Signed) {
2672         BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2673           .addReg(SrcReg, 0, SubReg)
2674           .addImm(31)
2675           .setOperandDead(3); // Dead scc
2676       } else {
2677         BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2678           .addImm(0);
2679       }
2680       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2681         .addReg(SrcReg, 0, SubReg)
2682         .addImm(AMDGPU::sub0)
2683         .addReg(HiReg)
2684         .addImm(AMDGPU::sub1);
2685       I.eraseFromParent();
2686       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2687                                           *MRI);
2688     }
2689 
2690     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2691     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2692 
2693     // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
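    // E.g. a sign extension from 16 bits is S_BFE_I32 dst, src, (16 << 16):
    // offset 0 in the low bits, width 16 in bits [22:16].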
2694     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2695       // We need a 64-bit register source, but the high bits don't matter.
2696       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2697       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2698       unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2699 
2700       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2701       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2702         .addReg(SrcReg, 0, SubReg)
2703         .addImm(AMDGPU::sub0)
2704         .addReg(UndefReg)
2705         .addImm(AMDGPU::sub1);
2706 
2707       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2708         .addReg(ExtReg)
2709         .addImm(SrcSize << 16);
2710 
2711       I.eraseFromParent();
2712       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2713     }
2714 
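    // When the mask is cheap to encode (e.g. Mask == 1 for an i1 zero-extend),
    // a single S_AND_B32 is preferred over S_BFE; shouldUseAndMask makes that
    // call.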
2715     unsigned Mask;
2716     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2717       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2718         .addReg(SrcReg)
2719         .addImm(Mask)
2720         .setOperandDead(3); // Dead scc
2721     } else {
2722       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2723         .addReg(SrcReg)
2724         .addImm(SrcSize << 16);
2725     }
2726 
2727     I.eraseFromParent();
2728     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2729   }
2730 
2731   return false;
2732 }
2733 
2734 static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2735   return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2736 }
2737 
2738 static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2739   Register BitcastSrc;
2740   if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2741     Reg = BitcastSrc;
2742   return Reg;
2743 }
2744 
2745 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2746                            Register &Out) {
2747   Register Trunc;
2748   if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2749     return false;
2750 
2751   Register LShlSrc;
2752   Register Cst;
2753   if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2754     Cst = stripCopy(Cst, MRI);
2755     if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2756       Out = stripBitCast(LShlSrc, MRI);
2757       return true;
2758     }
2759   }
2760 
2761   MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2762   if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2763     return false;
2764 
2765   assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2766          LLT::fixed_vector(2, 16));
2767 
2768   ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2769   assert(Mask.size() == 2);
2770 
2771   if (Mask[0] == 1 && Mask[1] <= 1) {
2772     Out = Shuffle->getOperand(0).getReg();
2773     return true;
2774   }
2775 
2776   return false;
2777 }
2778 
2779 bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2780   if (!Subtarget->hasSALUFloatInsts())
2781     return false;
2782 
2783   Register Dst = I.getOperand(0).getReg();
2784   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2785   if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2786     return false;
2787 
2788   Register Src = I.getOperand(1).getReg();
2789 
2790   if (MRI->getType(Dst) == LLT::scalar(32) &&
2791       MRI->getType(Src) == LLT::scalar(16)) {
2792     if (isExtractHiElt(*MRI, Src, Src)) {
2793       MachineBasicBlock *BB = I.getParent();
2794       BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2795           .addUse(Src);
2796       I.eraseFromParent();
2797       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2798     }
2799   }
2800 
2801   return false;
2802 }
2803 
2804 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2805   // Only manually handle the f64 SGPR case.
2806   //
2807   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2808   // the bit ops theoretically have a second result due to the implicit def of
2809   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2810   // that is easy by disabling the check. The result works, but uses a
2811   // nonsensical sreg32orlds_and_sreg_1 regclass.
2812   //
2813   // The DAG emitter is more problematic, and incorrectly adds both results of
2814   // S_XOR_B32 to the variadic REG_SEQUENCE operands.
2815 
2816   Register Dst = MI.getOperand(0).getReg();
2817   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2818   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2819       MRI->getType(Dst) != LLT::scalar(64))
2820     return false;
2821 
2822   Register Src = MI.getOperand(1).getReg();
2823   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2824   if (Fabs)
2825     Src = Fabs->getOperand(1).getReg();
2826 
2827   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2828       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2829     return false;
2830 
2831   MachineBasicBlock *BB = MI.getParent();
2832   const DebugLoc &DL = MI.getDebugLoc();
2833   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2834   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2835   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2836   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2837 
2838   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2839     .addReg(Src, 0, AMDGPU::sub0);
2840   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2841     .addReg(Src, 0, AMDGPU::sub1);
2842   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2843     .addImm(0x80000000);
2844 
2845   // Set or toggle sign bit.
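  // i.e. fneg(x) == x ^ (1 << 63) on the full 64 bits, and with a folded fabs,
  // fneg(fabs(x)) == x | (1 << 63); only the high half needs to be touched.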
2846   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2847   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2848     .addReg(HiReg)
2849     .addReg(ConstReg)
2850     .setOperandDead(3); // Dead scc
2851   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2852     .addReg(LoReg)
2853     .addImm(AMDGPU::sub0)
2854     .addReg(OpReg)
2855     .addImm(AMDGPU::sub1);
2856   MI.eraseFromParent();
2857   return true;
2858 }
2859 
2860 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2861 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2862   Register Dst = MI.getOperand(0).getReg();
2863   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2864   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2865       MRI->getType(Dst) != LLT::scalar(64))
2866     return false;
2867 
2868   Register Src = MI.getOperand(1).getReg();
2869   MachineBasicBlock *BB = MI.getParent();
2870   const DebugLoc &DL = MI.getDebugLoc();
2871   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2872   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2873   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2874   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2875 
2876   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2877       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2878     return false;
2879 
2880   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2881     .addReg(Src, 0, AMDGPU::sub0);
2882   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2883     .addReg(Src, 0, AMDGPU::sub1);
2884   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2885     .addImm(0x7fffffff);
2886 
2887   // Clear sign bit.
2888   // TODO: Should this use S_BITSET0_*?
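  // i.e. fabs(x) == x & ~(1 << 63), implemented as HiReg & 0x7fffffff on the
  // high half only.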
2889   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2890     .addReg(HiReg)
2891     .addReg(ConstReg)
2892     .setOperandDead(3); // Dead scc
2893   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2894     .addReg(LoReg)
2895     .addImm(AMDGPU::sub0)
2896     .addReg(OpReg)
2897     .addImm(AMDGPU::sub1);
2898 
2899   MI.eraseFromParent();
2900   return true;
2901 }
2902 
2903 static bool isConstant(const MachineInstr &MI) {
2904   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2905 }
2906 
2907 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2908     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2909 
2910   unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2911   const MachineInstr *PtrMI =
2912       MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2913 
2914   assert(PtrMI);
2915 
2916   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2917     return;
2918 
2919   GEPInfo GEPInfo;
2920 
2921   for (unsigned i = 1; i != 3; ++i) {
2922     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2923     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2924     assert(OpDef);
2925     if (i == 2 && isConstant(*OpDef)) {
2926       // TODO: Could handle constant base + variable offset, but a combine
2927       // probably should have commuted it.
2928       assert(GEPInfo.Imm == 0);
2929       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2930       continue;
2931     }
2932     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2933     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2934       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2935     else
2936       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2937   }
2938 
2939   AddrInfo.push_back(GEPInfo);
2940   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2941 }
2942 
2943 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2944   return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2945 }
2946 
2947 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2948   if (!MI.hasOneMemOperand())
2949     return false;
2950 
2951   const MachineMemOperand *MMO = *MI.memoperands_begin();
2952   const Value *Ptr = MMO->getValue();
2953 
2954   // UndefValue means this is a load of a kernel input.  These are uniform.
2955   // Sometimes LDS instructions have constant pointers.
2956   // If Ptr is null, then that means this mem operand contains a
2957   // PseudoSourceValue like GOT.
2958   if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Ptr))
2959     return true;
2960 
2961   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2962     return true;
2963 
2964   if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2965     return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2966            AMDGPU::SGPRRegBankID;
2967 
2968   const Instruction *I = dyn_cast<Instruction>(Ptr);
2969   return I && I->getMetadata("amdgpu.uniform");
2970 }
2971 
2972 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2973   for (const GEPInfo &GEPInfo : AddrInfo) {
2974     if (!GEPInfo.VgprParts.empty())
2975       return true;
2976   }
2977   return false;
2978 }
2979 
2980 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2981   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2982   unsigned AS = PtrTy.getAddressSpace();
2983   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2984       STI.ldsRequiresM0Init()) {
2985     MachineBasicBlock *BB = I.getParent();
2986 
2987     // If DS instructions require M0 initialization, insert it before selecting.
2988     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2989       .addImm(-1);
2990   }
2991 }
2992 
2993 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2994   MachineInstr &I) const {
2995   initM0(I);
2996   return selectImpl(I, *CoverageInfo);
2997 }
2998 
2999 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3000   if (Reg.isPhysical())
3001     return false;
3002 
3003   MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
3004   const unsigned Opcode = MI.getOpcode();
3005 
3006   if (Opcode == AMDGPU::COPY)
3007     return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3008 
3009   if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3010       Opcode == AMDGPU::G_XOR)
3011     return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3012            isVCmpResult(MI.getOperand(2).getReg(), MRI);
3013 
3014   if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3015     return GI->is(Intrinsic::amdgcn_class);
3016 
3017   return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3018 }
3019 
3020 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3021   MachineBasicBlock *BB = I.getParent();
3022   MachineOperand &CondOp = I.getOperand(0);
3023   Register CondReg = CondOp.getReg();
3024   const DebugLoc &DL = I.getDebugLoc();
3025 
3026   unsigned BrOpcode;
3027   Register CondPhysReg;
3028   const TargetRegisterClass *ConstrainRC;
3029 
3030   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3031   // whether the branch is uniform when selecting the instruction. In
3032   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3033   // RegBankSelect knows what it's doing if the branch condition is scc, even
3034   // though it currently does not.
3035   if (!isVCC(CondReg, *MRI)) {
3036     if (MRI->getType(CondReg) != LLT::scalar(32))
3037       return false;
3038 
3039     CondPhysReg = AMDGPU::SCC;
3040     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3041     ConstrainRC = &AMDGPU::SReg_32RegClass;
3042   } else {
3043     // FIXME: Should scc->vcc copies be ANDed with exec?
3044 
3045     // Unless the value of CondReg is the result of a V_CMP* instruction, we
3046     // need to insert an AND with exec.
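    // For example, a value produced by a plain G_TRUNC may hold garbage in the
    // bits corresponding to inactive lanes; masking with exec clears them so
    // S_CBRANCH_VCCNZ only observes active lanes.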
3047     if (!isVCmpResult(CondReg, *MRI)) {
3048       const bool Is64 = STI.isWave64();
3049       const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3050       const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3051 
3052       Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3053       BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3054           .addReg(CondReg)
3055           .addReg(Exec)
3056           .setOperandDead(3); // Dead scc
3057       CondReg = TmpReg;
3058     }
3059 
3060     CondPhysReg = TRI.getVCC();
3061     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3062     ConstrainRC = TRI.getBoolRC();
3063   }
3064 
3065   if (!MRI->getRegClassOrNull(CondReg))
3066     MRI->setRegClass(CondReg, ConstrainRC);
3067 
3068   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3069     .addReg(CondReg);
3070   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3071     .addMBB(I.getOperand(1).getMBB());
3072 
3073   I.eraseFromParent();
3074   return true;
3075 }
3076 
3077 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3078   MachineInstr &I) const {
3079   Register DstReg = I.getOperand(0).getReg();
3080   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3081   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3082   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3083   if (IsVGPR)
3084     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3085 
3086   return RBI.constrainGenericRegister(
3087     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3088 }
3089 
3090 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3091   Register DstReg = I.getOperand(0).getReg();
3092   Register SrcReg = I.getOperand(1).getReg();
3093   Register MaskReg = I.getOperand(2).getReg();
3094   LLT Ty = MRI->getType(DstReg);
3095   LLT MaskTy = MRI->getType(MaskReg);
3096   MachineBasicBlock *BB = I.getParent();
3097   const DebugLoc &DL = I.getDebugLoc();
3098 
3099   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3100   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3101   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3102   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3103   if (DstRB != SrcRB) // Should only happen for hand-written MIR.
3104     return false;
3105 
3106   // Try to avoid emitting a bit operation when we only need to touch half of
3107   // the 64-bit pointer.
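  // For example, with an alignment mask like 0xffffffff'fffff000 the high 32
  // bits are all known ones, so the high half can simply be copied and only
  // the low half needs an AND.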
3108   APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3109   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3110   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3111 
3112   const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3113   const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3114 
3115   if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3116       !CanCopyLow32 && !CanCopyHi32) {
3117     auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3118       .addReg(SrcReg)
3119       .addReg(MaskReg)
3120       .setOperandDead(3); // Dead scc
3121     I.eraseFromParent();
3122     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3123   }
3124 
3125   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3126   const TargetRegisterClass &RegRC
3127     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3128 
3129   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3130   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3131   const TargetRegisterClass *MaskRC =
3132       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3133 
3134   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3135       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3136       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3137     return false;
3138 
3139   if (Ty.getSizeInBits() == 32) {
3140     assert(MaskTy.getSizeInBits() == 32 &&
3141            "ptrmask should have been narrowed during legalize");
3142 
3143     auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3144       .addReg(SrcReg)
3145       .addReg(MaskReg);
3146 
3147     if (!IsVGPR)
3148       NewOp.setOperandDead(3); // Dead scc
3149     I.eraseFromParent();
3150     return true;
3151   }
3152 
3153   Register HiReg = MRI->createVirtualRegister(&RegRC);
3154   Register LoReg = MRI->createVirtualRegister(&RegRC);
3155 
3156   // Extract the subregisters from the source pointer.
3157   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3158     .addReg(SrcReg, 0, AMDGPU::sub0);
3159   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3160     .addReg(SrcReg, 0, AMDGPU::sub1);
3161 
3162   Register MaskedLo, MaskedHi;
3163 
3164   if (CanCopyLow32) {
3165     // If all the bits in the low half are 1, we only need a copy for it.
3166     MaskedLo = LoReg;
3167   } else {
3168     // Extract the mask subregister and apply the and.
3169     Register MaskLo = MRI->createVirtualRegister(&RegRC);
3170     MaskedLo = MRI->createVirtualRegister(&RegRC);
3171 
3172     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3173       .addReg(MaskReg, 0, AMDGPU::sub0);
3174     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3175       .addReg(LoReg)
3176       .addReg(MaskLo);
3177   }
3178 
3179   if (CanCopyHi32) {
3180     // If all the bits in the high half are 1, we only need a copy for it.
3181     MaskedHi = HiReg;
3182   } else {
3183     Register MaskHi = MRI->createVirtualRegister(&RegRC);
3184     MaskedHi = MRI->createVirtualRegister(&RegRC);
3185 
3186     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3187       .addReg(MaskReg, 0, AMDGPU::sub1);
3188     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3189       .addReg(HiReg)
3190       .addReg(MaskHi);
3191   }
3192 
3193   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3194     .addReg(MaskedLo)
3195     .addImm(AMDGPU::sub0)
3196     .addReg(MaskedHi)
3197     .addImm(AMDGPU::sub1);
3198   I.eraseFromParent();
3199   return true;
3200 }
3201 
3202 /// Return the register to use for the index value, and the subregister to use
3203 /// for the indirectly accessed register.
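/// For example, indexing 32-bit elements of a 128-bit register at a constant
/// offset of 2 from some base index yields the pair {base, sub2}.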
3204 static std::pair<Register, unsigned>
3205 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3206                         const TargetRegisterClass *SuperRC, Register IdxReg,
3207                         unsigned EltSize, GISelValueTracking &ValueTracking) {
3208   Register IdxBaseReg;
3209   int Offset;
3210 
3211   std::tie(IdxBaseReg, Offset) =
3212       AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3213   if (IdxBaseReg == AMDGPU::NoRegister) {
3214     // This will happen if the index is a known constant. This should ordinarily
3215     // be legalized out, but handle it as a register just in case.
3216     assert(Offset == 0);
3217     IdxBaseReg = IdxReg;
3218   }
3219 
3220   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3221 
3222   // Skip out of bounds offsets, or else we would end up using an undefined
3223   // register.
3224   if (static_cast<unsigned>(Offset) >= SubRegs.size())
3225     return std::pair(IdxReg, SubRegs[0]);
3226   return std::pair(IdxBaseReg, SubRegs[Offset]);
3227 }
3228 
3229 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3230   MachineInstr &MI) const {
3231   Register DstReg = MI.getOperand(0).getReg();
3232   Register SrcReg = MI.getOperand(1).getReg();
3233   Register IdxReg = MI.getOperand(2).getReg();
3234 
3235   LLT DstTy = MRI->getType(DstReg);
3236   LLT SrcTy = MRI->getType(SrcReg);
3237 
3238   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3239   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3240   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3241 
3242   // The index must be scalar. If it wasn't, RegBankSelect should have moved
3243   // this into a waterfall loop.
3244   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3245     return false;
3246 
3247   const TargetRegisterClass *SrcRC =
3248       TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3249   const TargetRegisterClass *DstRC =
3250       TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3251   if (!SrcRC || !DstRC)
3252     return false;
3253   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3254       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3255       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3256     return false;
3257 
3258   MachineBasicBlock *BB = MI.getParent();
3259   const DebugLoc &DL = MI.getDebugLoc();
3260   const bool Is64 = DstTy.getSizeInBits() == 64;
3261 
3262   unsigned SubReg;
3263   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3264       *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3265 
3266   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3267     if (DstTy.getSizeInBits() != 32 && !Is64)
3268       return false;
3269 
3270     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3271       .addReg(IdxReg);
3272 
3273     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3274     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3275       .addReg(SrcReg, 0, SubReg)
3276       .addReg(SrcReg, RegState::Implicit);
3277     MI.eraseFromParent();
3278     return true;
3279   }
3280 
3281   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3282     return false;
3283 
3284   if (!STI.useVGPRIndexMode()) {
3285     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3286       .addReg(IdxReg);
3287     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3288       .addReg(SrcReg, 0, SubReg)
3289       .addReg(SrcReg, RegState::Implicit);
3290     MI.eraseFromParent();
3291     return true;
3292   }
3293 
3294   const MCInstrDesc &GPRIDXDesc =
3295       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3296   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3297       .addReg(SrcReg)
3298       .addReg(IdxReg)
3299       .addImm(SubReg);
3300 
3301   MI.eraseFromParent();
3302   return true;
3303 }
3304 
3305 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3306 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3307   MachineInstr &MI) const {
3308   Register DstReg = MI.getOperand(0).getReg();
3309   Register VecReg = MI.getOperand(1).getReg();
3310   Register ValReg = MI.getOperand(2).getReg();
3311   Register IdxReg = MI.getOperand(3).getReg();
3312 
3313   LLT VecTy = MRI->getType(DstReg);
3314   LLT ValTy = MRI->getType(ValReg);
3315   unsigned VecSize = VecTy.getSizeInBits();
3316   unsigned ValSize = ValTy.getSizeInBits();
3317 
3318   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3319   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3320   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3321 
3322   assert(VecTy.getElementType() == ValTy);
3323 
3324   // The index must be scalar. If it wasn't, RegBankSelect should have moved
3325   // this into a waterfall loop.
3326   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3327     return false;
3328 
3329   const TargetRegisterClass *VecRC =
3330       TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3331   const TargetRegisterClass *ValRC =
3332       TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3333 
3334   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3335       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3336       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3337       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3338     return false;
3339 
3340   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3341     return false;
3342 
3343   unsigned SubReg;
3344   std::tie(IdxReg, SubReg) =
3345       computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3346 
3347   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3348                          STI.useVGPRIndexMode();
3349 
3350   MachineBasicBlock *BB = MI.getParent();
3351   const DebugLoc &DL = MI.getDebugLoc();
3352 
3353   if (!IndexMode) {
3354     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3355       .addReg(IdxReg);
3356 
3357     const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3358         VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3359     BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3360         .addReg(VecReg)
3361         .addReg(ValReg)
3362         .addImm(SubReg);
3363     MI.eraseFromParent();
3364     return true;
3365   }
3366 
3367   const MCInstrDesc &GPRIDXDesc =
3368       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3369   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3370       .addReg(VecReg)
3371       .addReg(ValReg)
3372       .addReg(IdxReg)
3373       .addImm(SubReg);
3374 
3375   MI.eraseFromParent();
3376   return true;
3377 }
3378 
3379 bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3380   if (!Subtarget->hasVMemToLDSLoad())
3381     return false;
3382   unsigned Opc;
3383   unsigned Size = MI.getOperand(3).getImm();
3384 
3385   // The struct intrinsic variants add one additional operand over raw.
3386   const bool HasVIndex = MI.getNumOperands() == 9;
3387   Register VIndex;
3388   int OpOffset = 0;
3389   if (HasVIndex) {
3390     VIndex = MI.getOperand(4).getReg();
3391     OpOffset = 1;
3392   }
3393 
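  // VOffset is treated as present unless it is a known-zero constant; a
  // nonzero constant offset still requires the OFFEN/BOTHEN form.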
3394   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3395   std::optional<ValueAndVReg> MaybeVOffset =
3396       getIConstantVRegValWithLookThrough(VOffset, *MRI);
3397   const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3398 
3399   switch (Size) {
3400   default:
3401     return false;
3402   case 1:
3403     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3404                                  : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3405                     : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3406                                  : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3407     break;
3408   case 2:
3409     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3410                                  : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3411                     : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3412                                  : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3413     break;
3414   case 4:
3415     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3416                                  : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3417                     : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3418                                  : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3419     break;
3420   case 12:
3421     if (!Subtarget->hasLDSLoadB96_B128())
3422       return false;
3423 
3424     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3425                                  : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3426                     : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3427                                  : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3428     break;
3429   case 16:
3430     if (!Subtarget->hasLDSLoadB96_B128())
3431       return false;
3432 
3433     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3434                                  : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3435                     : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3436                                  : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3437     break;
3438   }
3439 
3440   MachineBasicBlock *MBB = MI.getParent();
3441   const DebugLoc &DL = MI.getDebugLoc();
3442   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3443     .add(MI.getOperand(2));
3444 
3445   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3446 
3447   if (HasVIndex && HasVOffset) {
3448     Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3449     BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3450       .addReg(VIndex)
3451       .addImm(AMDGPU::sub0)
3452       .addReg(VOffset)
3453       .addImm(AMDGPU::sub1);
3454 
3455     MIB.addReg(IdxReg);
3456   } else if (HasVIndex) {
3457     MIB.addReg(VIndex);
3458   } else if (HasVOffset) {
3459     MIB.addReg(VOffset);
3460   }
3461 
3462   MIB.add(MI.getOperand(1));            // rsrc
3463   MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3464   MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3465   bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3466   unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3467   MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3468                                 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3469   MIB.addImm(
3470       Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3471           ? 1
3472           : 0); // swz
3473 
3474   MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3475   MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3476   LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3477   MachinePointerInfo StorePtrI = LoadPtrI;
3478   StorePtrI.V = nullptr;
3479   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3480 
3481   auto F = LoadMMO->getFlags() &
3482            ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3483   LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3484                                      Size, LoadMMO->getBaseAlign());
3485 
3486   MachineMemOperand *StoreMMO =
3487       MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3488                                sizeof(int32_t), LoadMMO->getBaseAlign());
3489 
3490   MIB.setMemRefs({LoadMMO, StoreMMO});
3491 
3492   MI.eraseFromParent();
3493   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3494 }
3495 
3496 /// Match a zero extend from a 32-bit value to 64 bits.
3497 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3498   Register ZExtSrc;
3499   if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3500     return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3501 
3502   // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3503   const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3504   if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3505     return Register();
3506 
3507   assert(Def->getNumOperands() == 3 &&
3508          MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3509   if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3510     return Def->getOperand(1).getReg();
3511   }
3512 
3513   return Register();
3514 }
3515 
3516 bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3517   if (!Subtarget->hasVMemToLDSLoad())
3518     return false;
3519 
3520   unsigned Opc;
3521   unsigned Size = MI.getOperand(3).getImm();
3522 
3523   switch (Size) {
3524   default:
3525     return false;
3526   case 1:
3527     Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3528     break;
3529   case 2:
3530     Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3531     break;
3532   case 4:
3533     Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3534     break;
3535   case 12:
3536     if (!Subtarget->hasLDSLoadB96_B128())
3537       return false;
3538     Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3539     break;
3540   case 16:
3541     if (!Subtarget->hasLDSLoadB96_B128())
3542       return false;
3543     Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3544     break;
3545   }
3546 
3547   MachineBasicBlock *MBB = MI.getParent();
3548   const DebugLoc &DL = MI.getDebugLoc();
3549   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3550     .add(MI.getOperand(2));
3551 
3552   Register Addr = MI.getOperand(1).getReg();
3553   Register VOffset;
3554   // Try to split SAddr and VOffset. Global and LDS pointers share the same
3555   // immediate offset, so we cannot use a regular SelectGlobalSAddr().
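  // For example, (G_PTR_ADD sgpr_base, (G_ZEXT vgpr_off)) is selected below as
  // SAddr = sgpr_base with VOffset = vgpr_off.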
3556   if (!isSGPR(Addr)) {
3557     auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3558     if (isSGPR(AddrDef->Reg)) {
3559       Addr = AddrDef->Reg;
3560     } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3561       Register SAddr =
3562           getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3563       if (isSGPR(SAddr)) {
3564         Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3565         if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3566           Addr = SAddr;
3567           VOffset = Off;
3568         }
3569       }
3570     }
3571   }
3572 
3573   if (isSGPR(Addr)) {
3574     Opc = AMDGPU::getGlobalSaddrOp(Opc);
3575     if (!VOffset) {
3576       VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3577       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3578         .addImm(0);
3579     }
3580   }
3581 
3582   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3583     .addReg(Addr);
3584 
3585   if (isSGPR(Addr))
3586     MIB.addReg(VOffset);
3587 
3588   MIB.add(MI.getOperand(4))  // offset
3589      .add(MI.getOperand(5)); // cpol
3590 
3591   MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3592   MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3593   LoadPtrI.Offset = MI.getOperand(4).getImm();
3594   MachinePointerInfo StorePtrI = LoadPtrI;
3595   LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3596   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3597   auto F = LoadMMO->getFlags() &
3598            ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3599   LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3600                                      Size, LoadMMO->getBaseAlign());
3601   MachineMemOperand *StoreMMO =
3602       MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3603                                sizeof(int32_t), Align(4));
3604 
3605   MIB.setMemRefs({LoadMMO, StoreMMO});
3606 
3607   MI.eraseFromParent();
3608   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3609 }
3610 
3611 bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3612     MachineInstr &MI) const {
3613   unsigned OpcodeOpIdx =
3614       MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3615   MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3616   MI.removeOperand(OpcodeOpIdx);
3617   MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3618   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3619 }
3620 
3621 // FIXME: This should be removed and let the patterns select. We just need the
3622 // AGPR/VGPR combination versions.
3623 bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3624   unsigned Opc;
3625   switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3626   case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3627     Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3628     break;
3629   case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3630     Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3631     break;
3632   case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3633     Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3634     break;
3635   case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3636     Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3637     break;
3638   case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3639     Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3640     break;
3641   case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3642     Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3643     break;
3644   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3645     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3646     break;
3647   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3648     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3649     break;
3650   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3651     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3652     break;
3653   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3654     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3655     break;
3656   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3657     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3658     break;
3659   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3660     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3661     break;
3662   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3663     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3664     break;
3665   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3666     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3667     break;
3668   case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3669     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3670     break;
3671   case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3672     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3673     break;
3674   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3675     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3676     break;
3677   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3678     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3679     break;
3680   case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3681     Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3682     break;
3683   case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3684     Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3685     break;
3686   case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3687     Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3688     break;
3689   case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3690     Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3691     break;
3692   case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3693     Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3694     break;
3695   case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3696     Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3697     break;
3698   case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3699     Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3700     break;
3701   case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3702     Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3703     break;
3704   case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3705     Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3706     break;
3707   case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3708     Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3709     break;
3710   default:
3711     llvm_unreachable("unhandled smfmac intrinsic");
3712   }
3713 
3714   auto VDst_In = MI.getOperand(4);
3715 
3716   MI.setDesc(TII.get(Opc));
3717   MI.removeOperand(4); // VDst_In
3718   MI.removeOperand(1); // Intrinsic ID
3719   MI.addOperand(VDst_In); // Re-add VDst_In to the end
3720   MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3721   return true;
3722 }
3723 
3724 bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3725     MachineInstr &MI, Intrinsic::ID IntrID) const {
3726   if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3727       !Subtarget->hasPermlane16Swap())
3728     return false;
3729   if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3730       !Subtarget->hasPermlane32Swap())
3731     return false;
3732 
3733   unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3734                         ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3735                         : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3736 
3737   MI.removeOperand(2);
3738   MI.setDesc(TII.get(Opcode));
3739   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3740 
3741   MachineOperand &FI = MI.getOperand(4);
3742   FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
3743 
3744   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3745 }
3746 
3747 bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3748   Register DstReg = MI.getOperand(0).getReg();
3749   Register SrcReg = MI.getOperand(1).getReg();
3750   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3751   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3752   MachineBasicBlock *MBB = MI.getParent();
3753   const DebugLoc &DL = MI.getDebugLoc();
3754 
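  // The wave-level address is the per-lane scratch address shifted right by
  // the wavefront size, e.g. SrcReg >> 6 on a wave64 target.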
3755   if (IsVALU) {
3756     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3757       .addImm(Subtarget->getWavefrontSizeLog2())
3758       .addReg(SrcReg);
3759   } else {
3760     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3761       .addReg(SrcReg)
3762       .addImm(Subtarget->getWavefrontSizeLog2())
3763       .setOperandDead(3); // Dead scc
3764   }
3765 
3766   const TargetRegisterClass &RC =
3767       IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3768   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3769     return false;
3770 
3771   MI.eraseFromParent();
3772   return true;
3773 }
3774 
3775 // Match a BITOP3 operation and return the number of matched instructions
3776 // plus the truth table.
3777 static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3778                                               SmallVectorImpl<Register> &Src,
3779                                               const MachineRegisterInfo &MRI) {
3780   unsigned NumOpcodes = 0;
3781   uint8_t LHSBits, RHSBits;
3782 
3783   auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3784     // Define truth table given Src0, Src1, Src2 bits permutations:
3785     //                          0     0     0
3786     //                          0     0     1
3787     //                          0     1     0
3788     //                          0     1     1
3789     //                          1     0     0
3790     //                          1     0     1
3791     //                          1     1     0
3792     //                          1     1     1
3793     const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
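    // For example, matching ((Src0 & Src1) | Src2) evaluates to
    // TTbl = (0xf0 & 0xcc) | 0xaa = 0xea.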
3794 
3795     if (mi_match(Op, MRI, m_AllOnesInt())) {
3796       Bits = 0xff;
3797       return true;
3798     }
3799     if (mi_match(Op, MRI, m_ZeroInt())) {
3800       Bits = 0;
3801       return true;
3802     }
3803 
3804     for (unsigned I = 0; I < Src.size(); ++I) {
3805       // Try to find existing reused operand
3806       if (Src[I] == Op) {
3807         Bits = SrcBits[I];
3808         return true;
3809       }
3810       // Try to replace parent operator
3811       if (Src[I] == R) {
3812         Bits = SrcBits[I];
3813         Src[I] = Op;
3814         return true;
3815       }
3816     }
3817 
3818     if (Src.size() == 3) {
3819       // No room left for operands. Try one last time, there can be a 'not' of
3820       // one of our source operands. In this case we can compute the bits
3821       // without growing Src vector.
3822       Register LHS;
3823       if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3824         LHS = getSrcRegIgnoringCopies(LHS, MRI);
3825         for (unsigned I = 0; I < Src.size(); ++I) {
3826           if (Src[I] == LHS) {
3827             Bits = ~SrcBits[I];
3828             return true;
3829           }
3830         }
3831       }
3832 
3833       return false;
3834     }
3835 
3836     Bits = SrcBits[Src.size()];
3837     Src.push_back(Op);
3838     return true;
3839   };
3840 
3841   MachineInstr *MI = MRI.getVRegDef(R);
3842   switch (MI->getOpcode()) {
3843   case TargetOpcode::G_AND:
3844   case TargetOpcode::G_OR:
3845   case TargetOpcode::G_XOR: {
3846     Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3847     Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3848 
3849     SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3850     if (!getOperandBits(LHS, LHSBits) ||
3851         !getOperandBits(RHS, RHSBits)) {
3852       Src = Backup;
3853       return std::make_pair(0, 0);
3854     }
3855 
3856     // Recursion is naturally limited by the size of the operand vector.
3857     auto Op = BitOp3_Op(LHS, Src, MRI);
3858     if (Op.first) {
3859       NumOpcodes += Op.first;
3860       LHSBits = Op.second;
3861     }
3862 
3863     Op = BitOp3_Op(RHS, Src, MRI);
3864     if (Op.first) {
3865       NumOpcodes += Op.first;
3866       RHSBits = Op.second;
3867     }
3868     break;
3869   }
3870   default:
3871     return std::make_pair(0, 0);
3872   }
3873 
3874   uint8_t TTbl;
3875   switch (MI->getOpcode()) {
3876   case TargetOpcode::G_AND:
3877     TTbl = LHSBits & RHSBits;
3878     break;
3879   case TargetOpcode::G_OR:
3880     TTbl = LHSBits | RHSBits;
3881     break;
3882   case TargetOpcode::G_XOR:
3883     TTbl = LHSBits ^ RHSBits;
3884     break;
3885   default:
3886     break;
3887   }
3888 
3889   return std::make_pair(NumOpcodes + 1, TTbl);
3890 }
3891 
3892 bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
3893   if (!Subtarget->hasBitOp3Insts())
3894     return false;
3895 
3896   Register DstReg = MI.getOperand(0).getReg();
3897   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3898   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3899   if (!IsVALU)
3900     return false;
3901 
3902   SmallVector<Register, 3> Src;
3903   uint8_t TTbl;
3904   unsigned NumOpcodes;
3905 
3906   std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
3907 
3908   // The Src.empty() case can happen if the operands are all zeros or all ones.
3909   // Normally this should have been optimized out before reaching this point.
3910   if (NumOpcodes < 2 || Src.empty())
3911     return false;
3912 
3913   const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
3914   if (NumOpcodes == 2 && IsB32) {
3915     // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3916     // asm more readable. This cannot be modeled with AddedComplexity because
3917     // the selector does not know how many operations we matched.
3918     if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
3919         mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
3920         mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
3921       return false;
3922   } else if (NumOpcodes < 4) {
3923     // For the uniform case the threshold should be higher to account for moves
3924     // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two can
3925     // be in SGPRs, with a readfirstlane after.
3926     return false;
3927   }
3928 
3929   unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
3930   unsigned CBL = STI.getConstantBusLimit(Opc);
3931   MachineBasicBlock *MBB = MI.getParent();
3932   const DebugLoc &DL = MI.getDebugLoc();
3933 
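  // BITOP3 is a VALU instruction, so only a limited number of its SGPR sources
  // may be read directly (the constant bus limit); copy the rest to VGPRs.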
3934   for (unsigned I = 0; I < Src.size(); ++I) {
3935     const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
3936     if (RB->getID() != AMDGPU::SGPRRegBankID)
3937       continue;
3938     if (CBL > 0) {
3939       --CBL;
3940       continue;
3941     }
3942     Register NewReg =  MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3943     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
3944         .addReg(Src[I]);
3945     Src[I] = NewReg;
3946   }
3947 
3948   // The last operand can be ignored, turning a ternary operation into a binary one.
3949   // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3950   // 'c' with 'a' here without changing the answer. In some pathological
3951   // cases it should be possible to get an operation with a single operand
3952     // too if the optimizer did not catch it.
3953   while (Src.size() < 3)
3954     Src.push_back(Src[0]);
3955 
3956   auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
3957   if (!IsB32)
3958     MIB.addImm(0); // src_mod0
3959   MIB.addReg(Src[0]);
3960   if (!IsB32)
3961     MIB.addImm(0); // src_mod1
3962   MIB.addReg(Src[1]);
3963   if (!IsB32)
3964     MIB.addImm(0); // src_mod2
3965   MIB.addReg(Src[2])
3966      .addImm(TTbl);
3967   if (!IsB32)
3968     MIB.addImm(0); // op_sel
3969 
3970   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3971   MI.eraseFromParent();
3972 
3973   return true;
3974 }
3975 
3976 bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3977   Register SrcReg = MI.getOperand(0).getReg();
3978   if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3979     return false;
3980 
3981   MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3982   Register SP =
3983       Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3984   Register WaveAddr = getWaveAddress(DefMI);
3985   MachineBasicBlock *MBB = MI.getParent();
3986   const DebugLoc &DL = MI.getDebugLoc();
3987 
3988   if (!WaveAddr) {
3989     WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3990     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3991       .addReg(SrcReg)
3992       .addImm(Subtarget->getWavefrontSizeLog2())
3993       .setOperandDead(3); // Dead scc
3994   }
3995 
3996   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3997     .addReg(WaveAddr);
3998 
3999   MI.eraseFromParent();
4000   return true;
4001 }
4002 
4003 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4004 
4005   if (!I.isPreISelOpcode()) {
4006     if (I.isCopy())
4007       return selectCOPY(I);
4008     return true;
4009   }
4010 
4011   switch (I.getOpcode()) {
4012   case TargetOpcode::G_AND:
4013   case TargetOpcode::G_OR:
4014   case TargetOpcode::G_XOR:
4015     if (selectBITOP3(I))
4016       return true;
4017     if (selectImpl(I, *CoverageInfo))
4018       return true;
4019     return selectG_AND_OR_XOR(I);
4020   case TargetOpcode::G_ADD:
4021   case TargetOpcode::G_SUB:
4022   case TargetOpcode::G_PTR_ADD:
4023     if (selectImpl(I, *CoverageInfo))
4024       return true;
4025     return selectG_ADD_SUB(I);
4026   case TargetOpcode::G_UADDO:
4027   case TargetOpcode::G_USUBO:
4028   case TargetOpcode::G_UADDE:
4029   case TargetOpcode::G_USUBE:
4030     return selectG_UADDO_USUBO_UADDE_USUBE(I);
4031   case AMDGPU::G_AMDGPU_MAD_U64_U32:
4032   case AMDGPU::G_AMDGPU_MAD_I64_I32:
4033     return selectG_AMDGPU_MAD_64_32(I);
4034   case TargetOpcode::G_INTTOPTR:
4035   case TargetOpcode::G_BITCAST:
4036   case TargetOpcode::G_PTRTOINT:
4037   case TargetOpcode::G_FREEZE:
4038     return selectCOPY(I);
4039   case TargetOpcode::G_FNEG:
4040     if (selectImpl(I, *CoverageInfo))
4041       return true;
4042     return selectG_FNEG(I);
4043   case TargetOpcode::G_FABS:
4044     if (selectImpl(I, *CoverageInfo))
4045       return true;
4046     return selectG_FABS(I);
4047   case TargetOpcode::G_EXTRACT:
4048     return selectG_EXTRACT(I);
4049   case TargetOpcode::G_MERGE_VALUES:
4050   case TargetOpcode::G_CONCAT_VECTORS:
4051     return selectG_MERGE_VALUES(I);
4052   case TargetOpcode::G_UNMERGE_VALUES:
4053     return selectG_UNMERGE_VALUES(I);
4054   case TargetOpcode::G_BUILD_VECTOR:
4055   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4056     return selectG_BUILD_VECTOR(I);
4057   case TargetOpcode::G_IMPLICIT_DEF:
4058     return selectG_IMPLICIT_DEF(I);
4059   case TargetOpcode::G_INSERT:
4060     return selectG_INSERT(I);
4061   case TargetOpcode::G_INTRINSIC:
4062   case TargetOpcode::G_INTRINSIC_CONVERGENT:
4063     return selectG_INTRINSIC(I);
4064   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4065   case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4066     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4067   case TargetOpcode::G_ICMP:
4068   case TargetOpcode::G_FCMP:
4069     if (selectG_ICMP_or_FCMP(I))
4070       return true;
4071     return selectImpl(I, *CoverageInfo);
4072   case TargetOpcode::G_LOAD:
4073   case TargetOpcode::G_ZEXTLOAD:
4074   case TargetOpcode::G_SEXTLOAD:
4075   case TargetOpcode::G_STORE:
4076   case TargetOpcode::G_ATOMIC_CMPXCHG:
4077   case TargetOpcode::G_ATOMICRMW_XCHG:
4078   case TargetOpcode::G_ATOMICRMW_ADD:
4079   case TargetOpcode::G_ATOMICRMW_SUB:
4080   case TargetOpcode::G_ATOMICRMW_AND:
4081   case TargetOpcode::G_ATOMICRMW_OR:
4082   case TargetOpcode::G_ATOMICRMW_XOR:
4083   case TargetOpcode::G_ATOMICRMW_MIN:
4084   case TargetOpcode::G_ATOMICRMW_MAX:
4085   case TargetOpcode::G_ATOMICRMW_UMIN:
4086   case TargetOpcode::G_ATOMICRMW_UMAX:
4087   case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4088   case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4089   case TargetOpcode::G_ATOMICRMW_FADD:
4090   case TargetOpcode::G_ATOMICRMW_FMIN:
4091   case TargetOpcode::G_ATOMICRMW_FMAX:
4092     return selectG_LOAD_STORE_ATOMICRMW(I);
4093   case TargetOpcode::G_SELECT:
4094     return selectG_SELECT(I);
4095   case TargetOpcode::G_TRUNC:
4096     return selectG_TRUNC(I);
4097   case TargetOpcode::G_SEXT:
4098   case TargetOpcode::G_ZEXT:
4099   case TargetOpcode::G_ANYEXT:
4100   case TargetOpcode::G_SEXT_INREG:
4101     // This is a workaround. For extension from type i1, `selectImpl()` uses
4102     // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
4103     // type i1 can only be held in an SGPR class.
4104     if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4105         selectImpl(I, *CoverageInfo))
4106       return true;
4107     return selectG_SZA_EXT(I);
4108   case TargetOpcode::G_FPEXT:
4109     if (selectG_FPEXT(I))
4110       return true;
4111     return selectImpl(I, *CoverageInfo);
4112   case TargetOpcode::G_BRCOND:
4113     return selectG_BRCOND(I);
4114   case TargetOpcode::G_GLOBAL_VALUE:
4115     return selectG_GLOBAL_VALUE(I);
4116   case TargetOpcode::G_PTRMASK:
4117     return selectG_PTRMASK(I);
4118   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4119     return selectG_EXTRACT_VECTOR_ELT(I);
4120   case TargetOpcode::G_INSERT_VECTOR_ELT:
4121     return selectG_INSERT_VECTOR_ELT(I);
4122   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4123   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4124   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4125   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4126   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4127     const AMDGPU::ImageDimIntrinsicInfo *Intr =
4128         AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4129     assert(Intr && "not an image intrinsic with image pseudo");
4130     return selectImageIntrinsic(I, Intr);
4131   }
4132   case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4133   case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4134   case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4135     return selectBVHIntersectRayIntrinsic(I);
4136   case AMDGPU::G_SBFX:
4137   case AMDGPU::G_UBFX:
4138     return selectG_SBFX_UBFX(I);
4139   case AMDGPU::G_SI_CALL:
4140     I.setDesc(TII.get(AMDGPU::SI_CALL));
4141     return true;
4142   case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4143     return selectWaveAddress(I);
4144   case AMDGPU::G_STACKRESTORE:
4145     return selectStackRestore(I);
4146   case AMDGPU::G_PHI:
4147     return selectPHI(I);
4148   case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4149     return selectCOPY_SCC_VCC(I);
4150   case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4151     return selectCOPY_VCC_SCC(I);
4152   case AMDGPU::G_AMDGPU_READANYLANE:
4153     return selectReadAnyLane(I);
4154   case TargetOpcode::G_CONSTANT:
4155   case TargetOpcode::G_FCONSTANT:
4156   default:
4157     return selectImpl(I, *CoverageInfo);
4158   }
4159   return false;
4160 }
4161 
4162 InstructionSelector::ComplexRendererFns
4163 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4164   return {{
4165       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4166   }};
4167 
4168 }
4169 
4170 std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4171     Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4172   unsigned Mods = 0;
4173   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4174 
4175   if (MI->getOpcode() == AMDGPU::G_FNEG) {
4176     Src = MI->getOperand(1).getReg();
4177     Mods |= SISrcMods::NEG;
4178     MI = getDefIgnoringCopies(Src, *MRI);
4179   } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4180     // Fold fsub [+-]0 into fneg. This may not have been folded depending on the
4181     // denormal mode, but we're implicitly canonicalizing in a source operand.
4182     const ConstantFP *LHS =
4183         getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4184     if (LHS && LHS->isZero()) {
4185       Mods |= SISrcMods::NEG;
4186       Src = MI->getOperand(2).getReg();
4187     }
4188   }
4189 
4190   if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4191     Src = MI->getOperand(1).getReg();
4192     Mods |= SISrcMods::ABS;
4193   }
4194 
4195   if (OpSel)
4196     Mods |= SISrcMods::OP_SEL_0;
4197 
4198   return std::pair(Src, Mods);
4199 }
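// A minimal worked example (vreg names hypothetical): for
//   %a:vgpr(s32) = G_FABS %x
//   %b:vgpr(s32) = G_FNEG %a
// selectVOP3ModsImpl(%b) peels the G_FNEG (adding SISrcMods::NEG), then sees
// the G_FABS (adding SISrcMods::ABS) and returns {%x, NEG | ABS}, so -|x| is
// encoded entirely in the instruction's source modifiers.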
4200 
4201 Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4202     Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4203     bool ForceVGPR) const {
4204   if ((Mods != 0 || ForceVGPR) &&
4205       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4206 
4207     // If we looked through copies to find source modifiers on an SGPR operand,
4208     // we now have an SGPR register source. To avoid potentially violating the
4209     // constant bus restriction, we need to insert a copy to a VGPR.
4210     Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4211     BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4212             TII.get(AMDGPU::COPY), VGPRSrc)
4213         .addReg(Src);
4214     Src = VGPRSrc;
4215   }
4216 
4217   return Src;
4218 }
4219 
4220 ///
4221 /// This will select either an SGPR or VGPR operand and will save us from
4222 /// having to write an extra tablegen pattern.
4223 InstructionSelector::ComplexRendererFns
4224 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4225   return {{
4226       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4227   }};
4228 }
4229 
4230 InstructionSelector::ComplexRendererFns
4231 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4232   Register Src;
4233   unsigned Mods;
4234   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4235 
4236   return {{
4237       [=](MachineInstrBuilder &MIB) {
4238         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4239       },
4240       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4241       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
4242       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
4243   }};
4244 }
4245 
4246 InstructionSelector::ComplexRendererFns
4247 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4248   Register Src;
4249   unsigned Mods;
4250   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4251                                            /*IsCanonicalizing=*/true,
4252                                            /*AllowAbs=*/false);
4253 
4254   return {{
4255       [=](MachineInstrBuilder &MIB) {
4256         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4257       },
4258       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4259       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
4260       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
4261   }};
4262 }
4263 
4264 InstructionSelector::ComplexRendererFns
4265 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4266   return {{
4267       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4268       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4269       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
4270   }};
4271 }
4272 
4273 InstructionSelector::ComplexRendererFns
4274 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4275   Register Src;
4276   unsigned Mods;
4277   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4278 
4279   return {{
4280       [=](MachineInstrBuilder &MIB) {
4281         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4282       },
4283       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4284   }};
4285 }
4286 
4287 InstructionSelector::ComplexRendererFns
4288 AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4289     MachineOperand &Root) const {
4290   Register Src;
4291   unsigned Mods;
4292   std::tie(Src, Mods) =
4293       selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4294 
4295   return {{
4296       [=](MachineInstrBuilder &MIB) {
4297         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4298       },
4299       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4300   }};
4301 }
4302 
4303 InstructionSelector::ComplexRendererFns
4304 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4305   Register Src;
4306   unsigned Mods;
4307   std::tie(Src, Mods) =
4308       selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4309                          /*AllowAbs=*/false);
4310 
4311   return {{
4312       [=](MachineInstrBuilder &MIB) {
4313         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4314       },
4315       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4316   }};
4317 }
4318 
4319 InstructionSelector::ComplexRendererFns
4320 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4321   Register Reg = Root.getReg();
4322   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4323   if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4324     return {};
4325   return {{
4326       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4327   }};
4328 }
4329 
4330 enum class SrcStatus {
4331   IS_SAME,
4332   IS_UPPER_HALF,
4333   IS_LOWER_HALF,
4334   IS_UPPER_HALF_NEG,
4335   // This means current op = [op_upper, op_lower] and src = -op_lower.
4336   IS_LOWER_HALF_NEG,
4337   IS_HI_NEG,
4338   // This means current op = [op_upper, op_lower] and src = [op_upper,
4339   // -op_lower].
4340   IS_LO_NEG,
4341   IS_BOTH_NEG,
4342   INVALID,
4343   NEG_START = IS_UPPER_HALF_NEG,
4344   NEG_END = IS_BOTH_NEG,
4345   HALF_START = IS_UPPER_HALF,
4346   HALF_END = IS_LOWER_HALF_NEG
4347 };
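// A minimal worked example of the statuses (vregs hypothetical), for a 32-bit
// value %x viewed as [Hi16, Lo16]:
//   %l:_(s16) = G_TRUNC %x:_(s32)   -> %l is IS_LOWER_HALF of %x
//   %s:_(s32) = G_LSHR %x:_(s32), 16
//   %h:_(s16) = G_TRUNC %s:_(s32)   -> %h is IS_UPPER_HALF of %x
//   %n:_(<2 x s16>) = G_FNEG %v     -> %n is IS_BOTH_NEG of %v
// INVALID marks a chain that cannot be tracked further.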
4348 /// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4349 static bool isTruncHalf(const MachineInstr *MI,
4350                         const MachineRegisterInfo &MRI) {
4351   if (MI->getOpcode() != AMDGPU::G_TRUNC)
4352     return false;
4353 
4354   unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4355   unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4356   return DstSize * 2 == SrcSize;
4357 }
4358 
4359 /// Test if the MI is a logical shift right by half the bit width,
4360 /// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`
4361 static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4362   if (MI->getOpcode() != AMDGPU::G_LSHR)
4363     return false;
4364 
4365   Register ShiftSrc;
4366   std::optional<ValueAndVReg> ShiftAmt;
4367   if (mi_match(MI->getOperand(0).getReg(), MRI,
4368                m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4369     unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4370     unsigned Shift = ShiftAmt->Value.getZExtValue();
4371     return Shift * 2 == SrcSize;
4372   }
4373   return false;
4374 }
4375 
4376 /// Test if the MI is a shift left by half the bit width,
4377 /// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`
4378 static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4379   if (MI->getOpcode() != AMDGPU::G_SHL)
4380     return false;
4381 
4382   Register ShiftSrc;
4383   std::optional<ValueAndVReg> ShiftAmt;
4384   if (mi_match(MI->getOperand(0).getReg(), MRI,
4385                m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4386     unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4387     unsigned Shift = ShiftAmt->Value.getZExtValue();
4388     return Shift * 2 == SrcSize;
4389   }
4390   return false;
4391 }
4392 
4393 /// Test if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4394 static bool isUnmergeHalf(const MachineInstr *MI,
4395                           const MachineRegisterInfo &MRI) {
4396   if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4397     return false;
4398   return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4399          MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4400 }
4401 
4402 enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED };
4403 
4404 static TypeClass isVectorOfTwoOrScalar(Register Reg,
4405                                        const MachineRegisterInfo &MRI) {
4406   LLT OpTy = MRI.getType(Reg);
4407   if (OpTy.isScalar())
4408     return TypeClass::SCALAR;
4409   if (OpTy.isVector() && OpTy.getNumElements() == 2)
4410     return TypeClass::VECTOR_OF_TWO;
4411   return TypeClass::NONE_OF_LISTED;
4412 }
4413 
4414 static SrcStatus getNegStatus(Register Reg, SrcStatus S,
4415                               const MachineRegisterInfo &MRI) {
4416   TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4417   if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4418     return SrcStatus::INVALID;
4419 
4420   switch (S) {
4421   case SrcStatus::IS_SAME:
4422     if (NegType == TypeClass::VECTOR_OF_TWO) {
4423       // Vector of 2:
4424       // [SrcHi, SrcLo]   = [CurrHi, CurrLo]
4425       // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4426       // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4427       // [SrcHi, SrcLo]   = [-OpHi, -OpLo]
4428       return SrcStatus::IS_BOTH_NEG;
4429     }
4430     if (NegType == TypeClass::SCALAR) {
4431       // Scalar:
4432       // [SrcHi, SrcLo]   = [CurrHi, CurrLo]
4433       // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4434       // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4435       // [SrcHi, SrcLo]   = [-OpHi, OpLo]
4436       return SrcStatus::IS_HI_NEG;
4437     }
4438     break;
4439   case SrcStatus::IS_HI_NEG:
4440     if (NegType == TypeClass::VECTOR_OF_TWO) {
4441       // Vector of 2:
4442       // [SrcHi, SrcLo]   = [-CurrHi, CurrLo]
4443       // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4444       // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4445       // [SrcHi, SrcLo]   = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4446       return SrcStatus::IS_LO_NEG;
4447     }
4448     if (NegType == TypeClass::SCALAR) {
4449       // Scalar:
4450       // [SrcHi, SrcLo]   = [-CurrHi, CurrLo]
4451       // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4452       // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4453       // [SrcHi, SrcLo]   = [-(-OpHi), OpLo] = [OpHi, OpLo]
4454       return SrcStatus::IS_SAME;
4455     }
4456     break;
4457   case SrcStatus::IS_LO_NEG:
4458     if (NegType == TypeClass::VECTOR_OF_TWO) {
4459       // Vector of 2:
4460       // [SrcHi, SrcLo]   = [CurrHi, -CurrLo]
4461       // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4462       // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4463       // [SrcHi, SrcLo]   = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4464       return SrcStatus::IS_HI_NEG;
4465     }
4466     if (NegType == TypeClass::SCALAR) {
4467       // Scalar:
4468       // [SrcHi, SrcLo]   = [CurrHi, -CurrLo]
4469       // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4470       // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4471       // [SrcHi, SrcLo]   = [-OpHi, -OpLo]
4472       return SrcStatus::IS_BOTH_NEG;
4473     }
4474     break;
4475   case SrcStatus::IS_BOTH_NEG:
4476     if (NegType == TypeClass::VECTOR_OF_TWO) {
4477       // Vector of 2:
4478       // [SrcHi, SrcLo]   = [-CurrHi, -CurrLo]
4479       // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4480       // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4481       // [SrcHi, SrcLo]   = [OpHi, OpLo]
4482       return SrcStatus::IS_SAME;
4483     }
4484     if (NegType == TypeClass::SCALAR) {
4485       // Scalar:
4486       // [SrcHi, SrcLo]   = [-CurrHi, -CurrLo]
4487       // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4488       // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4489       // [SrcHi, SrcLo]   = [OpHi, -OpLo]
4490       return SrcStatus::IS_LO_NEG;
4491     }
4492     break;
4493   case SrcStatus::IS_UPPER_HALF:
4494     // Vector of 2:
4495     // Src = CurrUpper
4496     // Curr = [CurrUpper, CurrLower]
4497     // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4498     // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4499     // Src = -OpUpper
4500     //
4501     // Scalar:
4502     // Src = CurrUpper
4503     // Curr = [CurrUpper, CurrLower]
4504     // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4505     // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4506     // Src = -OpUpper
4507     return SrcStatus::IS_UPPER_HALF_NEG;
4508   case SrcStatus::IS_LOWER_HALF:
4509     if (NegType == TypeClass::VECTOR_OF_TWO) {
4510       // Vector of 2:
4511       // Src = CurrLower
4512       // Curr = [CurrUpper, CurrLower]
4513       // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4514       // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4515       // Src = -OpLower
4516       return SrcStatus::IS_LOWER_HALF_NEG;
4517     }
4518     if (NegType == TypeClass::SCALAR) {
4519       // Scalar:
4520       // Src = CurrLower
4521       // Curr = [CurrUpper, CurrLower]
4522       // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4523       // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4524       // Src = OpLower
4525       return SrcStatus::IS_LOWER_HALF;
4526     }
4527     break;
4528   case SrcStatus::IS_UPPER_HALF_NEG:
4529     // Vector of 2:
4530     // Src = -CurrUpper
4531     // Curr = [CurrUpper, CurrLower]
4532     // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4533     // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4534     // Src = -(-OpUpper) = OpUpper
4535     //
4536     // Scalar:
4537     // Src = -CurrUpper
4538     // Curr = [CurrUpper, CurrLower]
4539     // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4540     // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4541     // Src = -(-OpUpper) = OpUpper
4542     return SrcStatus::IS_UPPER_HALF;
4543   case SrcStatus::IS_LOWER_HALF_NEG:
4544     if (NegType == TypeClass::VECTOR_OF_TWO) {
4545       // Vector of 2:
4546       // Src = -CurrLower
4547       // Curr = [CurrUpper, CurrLower]
4548       // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4549       // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4550       // Src = -(-OpLower) = OpLower
4551       return SrcStatus::IS_LOWER_HALF;
4552     }
4553     if (NegType == TypeClass::SCALAR) {
4554       // Scalar:
4555       // Src = -CurrLower
4556       // Curr = [CurrUpper, CurrLower]
4557       // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4558       // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4559       // Src = -OpLower
4560       return SrcStatus::IS_LOWER_HALF_NEG;
4561     }
4562     break;
4563   default:
4564     break;
4565   }
4566   llvm_unreachable("unexpected SrcStatus & NegType combination");
4567 }
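// The scalar/vector asymmetry above in one sentence: fneg of a scalar flips
// only the sign bit of the whole value (the high half), so IS_SAME maps to
// IS_HI_NEG, while fneg of <2 x Type> flips the sign of each lane, so IS_SAME
// maps to IS_BOTH_NEG.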
4568 
4569 static std::optional<std::pair<Register, SrcStatus>>
4570 calcNextStatus(std::pair<Register, SrcStatus> Curr,
4571                const MachineRegisterInfo &MRI) {
4572   const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4573 
4574   unsigned Opc = MI->getOpcode();
4575 
4576   // Handle general Opc cases.
4577   switch (Opc) {
4578   case AMDGPU::G_BITCAST:
4579     return std::optional<std::pair<Register, SrcStatus>>(
4580         {MI->getOperand(1).getReg(), Curr.second});
4581   case AMDGPU::COPY:
4582     if (MI->getOperand(1).getReg().isPhysical())
4583       return std::nullopt;
4584     return std::optional<std::pair<Register, SrcStatus>>(
4585         {MI->getOperand(1).getReg(), Curr.second});
4586   case AMDGPU::G_FNEG: {
4587     SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4588     if (Stat == SrcStatus::INVALID)
4589       return std::nullopt;
4590     return std::optional<std::pair<Register, SrcStatus>>(
4591         {MI->getOperand(1).getReg(), Stat});
4592   }
4593   default:
4594     break;
4595   }
4596 
4597   // Calc next Stat from current Stat.
4598   switch (Curr.second) {
4599   case SrcStatus::IS_SAME:
4600     if (isTruncHalf(MI, MRI))
4601       return std::optional<std::pair<Register, SrcStatus>>(
4602           {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4603     else if (isUnmergeHalf(MI, MRI)) {
4604       if (Curr.first == MI->getOperand(0).getReg())
4605         return std::optional<std::pair<Register, SrcStatus>>(
4606             {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4607       return std::optional<std::pair<Register, SrcStatus>>(
4608           {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4609     }
4610     break;
4611   case SrcStatus::IS_HI_NEG:
4612     if (isTruncHalf(MI, MRI)) {
4613       // [SrcHi, SrcLo]   = [-CurrHi, CurrLo]
4614       // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4615       //                  = [OpLowerHi, OpLowerLo]
4616       // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4617       //     = [-OpLowerHi, OpLowerLo]
4618       //     = -OpLower
4619       return std::optional<std::pair<Register, SrcStatus>>(
4620           {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4621     }
4622     if (isUnmergeHalf(MI, MRI)) {
4623       if (Curr.first == MI->getOperand(0).getReg())
4624         return std::optional<std::pair<Register, SrcStatus>>(
4625             {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4626       return std::optional<std::pair<Register, SrcStatus>>(
4627           {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4628     }
4629     break;
4630   case SrcStatus::IS_UPPER_HALF:
4631     if (isShlHalf(MI, MRI))
4632       return std::optional<std::pair<Register, SrcStatus>>(
4633           {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4634     break;
4635   case SrcStatus::IS_LOWER_HALF:
4636     if (isLshrHalf(MI, MRI))
4637       return std::optional<std::pair<Register, SrcStatus>>(
4638           {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
4639     break;
4640   case SrcStatus::IS_UPPER_HALF_NEG:
4641     if (isShlHalf(MI, MRI))
4642       return std::optional<std::pair<Register, SrcStatus>>(
4643           {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4644     break;
4645   case SrcStatus::IS_LOWER_HALF_NEG:
4646     if (isLshrHalf(MI, MRI))
4647       return std::optional<std::pair<Register, SrcStatus>>(
4648           {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4649     break;
4650   default:
4651     break;
4652   }
4653   return std::nullopt;
4654 }
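// A sketch of one walk (vregs hypothetical):
//   %w:_(<2 x s16>) = ...
//   %v:_(<2 x s16>) = G_FNEG %w
//   %c:_(<2 x s16>) = COPY %v
// Starting from {%c, IS_SAME}: the COPY is looked through unchanged, the
// G_FNEG maps IS_SAME to IS_BOTH_NEG, and the walk ends with {%w, IS_BOTH_NEG}
// once %w's definition matches none of the cases above.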
4655 
4656 /// This is used to control which statuses the current MI supports. For
4657 /// example, a non-floating-point intrinsic such as @llvm.amdgcn.sdot2 does
4658 /// not support the NEG bit on VOP3P.
4659 /// The class can be further extended to recognize SEL, NEG, and ABS bit
4660 /// support for different MIs on different architectures.
4661 class SearchOptions {
4662 private:
4663   bool HasNeg = false;
4664   // Assume all VOP3P complex patterns have opsel.
4665   bool HasOpsel = true;
4666 
4667 public:
4668   SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
4669     const MachineInstr *MI = MRI.getVRegDef(Reg);
4670     unsigned Opc = MI->getOpcode();
4671 
4672     if (Opc < TargetOpcode::GENERIC_OP_END) {
4673       // Generic ops support the NEG modifier.
4674       HasNeg = true;
4675     } else if (Opc == TargetOpcode::G_INTRINSIC) {
4676       Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
4677       // Only floating-point intrinsics have neg & neg_hi bits.
4678       if (IntrinsicID == Intrinsic::amdgcn_fdot2)
4679         HasNeg = true;
4680     }
4681   }
4682   bool checkOptions(SrcStatus Stat) const {
4683     if (!HasNeg &&
4684         (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
4685       return false;
4686     }
4687     if (!HasOpsel &&
4688         (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
4689       return false;
4690     }
4691     return true;
4692   }
4693 };
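// Intended use (a sketch): build a SearchOptions for the root operand, then
// filter every status found while walking its definition chain:
//   SearchOptions SO(RootReg, MRI);
//   if (SO.checkOptions(Stat))
//     ; // Stat is representable on this instruction.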
4694 
4695 static SmallVector<std::pair<Register, SrcStatus>>
4696 getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
4697             int MaxDepth = 3) {
4698   int Depth = 0;
4699   auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
4700   SmallVector<std::pair<Register, SrcStatus>> Statlist;
4701 
4702   while (Depth <= MaxDepth && Curr.has_value()) {
4703     Depth++;
4704     if (SO.checkOptions(Curr.value().second))
4705       Statlist.push_back(Curr.value());
4706     Curr = calcNextStatus(Curr.value(), MRI);
4707   }
4708 
4709   return Statlist;
4710 }
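// The depth bound keeps the walk cheap: with the default MaxDepth = 3, the
// loop body runs at most four times, so at most four statuses are recorded
// per operand regardless of how deep the definition chain is.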
4711 
4712 static std::pair<Register, SrcStatus>
4713 getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
4714                  int MaxDepth = 3) {
4715   int Depth = 0;
4716   std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
4717   auto Curr = calcNextStatus(LastSameOrNeg, MRI);
4718 
4719   while (Depth <= MaxDepth && Curr.has_value()) {
4720     Depth++;
4721     SrcStatus Stat = Curr.value().second;
4722     if (SO.checkOptions(Stat)) {
4723       if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
4724           Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
4725         LastSameOrNeg = Curr.value();
4726     }
4727     Curr = calcNextStatus(Curr.value(), MRI);
4728   }
4729 
4730   return LastSameOrNeg;
4731 }
4732 
4733 static bool isSameBitWidth(Register Reg1, Register Reg2,
4734                            const MachineRegisterInfo &MRI) {
4735   unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
4736   unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
4737   return Width1 == Width2;
4738 }
4739 
4740 static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
4741   // SrcStatus::IS_LOWER_HALF leaves the mods at 0.
4742   if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
4743     Mods ^= SISrcMods::NEG_HI;
4744     Mods |= SISrcMods::OP_SEL_1;
4745   } else if (HiStat == SrcStatus::IS_UPPER_HALF)
4746     Mods |= SISrcMods::OP_SEL_1;
4747   else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
4748     Mods ^= SISrcMods::NEG_HI;
4749   else if (HiStat == SrcStatus::IS_HI_NEG)
4750     Mods ^= SISrcMods::NEG_HI;
4751 
4752   if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
4753     Mods ^= SISrcMods::NEG;
4754     Mods |= SISrcMods::OP_SEL_0;
4755   } else if (LoStat == SrcStatus::IS_UPPER_HALF)
4756     Mods |= SISrcMods::OP_SEL_0;
4757   else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
4758     Mods |= SISrcMods::NEG;
4759   else if (LoStat == SrcStatus::IS_HI_NEG)
4760     Mods ^= SISrcMods::NEG;
4761 
4762   return Mods;
4763 }
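// A worked example: HiStat = IS_UPPER_HALF with LoStat = IS_LOWER_HALF means
// the packed source is exactly [OpHi, OpLo] of a single register, so only
// OP_SEL_1 is set (take the upper half for the hi lane); the lo lane defaults
// to the lower half and no NEG bits are touched.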
4764 
4765 static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
4766                           Register RootReg, const SIInstrInfo &TII,
4767                           const MachineRegisterInfo &MRI) {
4768   auto IsHalfState = [](SrcStatus S) {
4769     return S == SrcStatus::IS_UPPER_HALF || S == SrcStatus::IS_UPPER_HALF_NEG ||
4770            S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
4771   };
4772   return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
4773          IsHalfState(HiStat);
4774 }
4775 
4776 std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
4777     Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
4778   unsigned Mods = 0;
4779   // No modification if the Root type is not of the form <2 x Type>.
4780   if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
4781     Mods |= SISrcMods::OP_SEL_1;
4782     return {RootReg, Mods};
4783   }
4784 
4785   SearchOptions SO(RootReg, MRI);
4786 
4787   std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
4788 
4789   if (Stat.second == SrcStatus::IS_BOTH_NEG)
4790     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4791   else if (Stat.second == SrcStatus::IS_HI_NEG)
4792     Mods ^= SISrcMods::NEG_HI;
4793   else if (Stat.second == SrcStatus::IS_LO_NEG)
4794     Mods ^= SISrcMods::NEG;
4795 
4796   MachineInstr *MI = MRI.getVRegDef(Stat.first);
4797 
4798   if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
4799       (IsDOT && Subtarget->hasDOTOpSelHazard())) {
4800     Mods |= SISrcMods::OP_SEL_1;
4801     return {Stat.first, Mods};
4802   }
4803 
4804   SmallVector<std::pair<Register, SrcStatus>> StatlistHi =
4805       getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
4806 
4807   if (StatlistHi.empty()) {
4808     Mods |= SISrcMods::OP_SEL_1;
4809     return {Stat.first, Mods};
4810   }
4811 
4812   SmallVector<std::pair<Register, SrcStatus>> StatlistLo =
4813       getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
4814 
4815   if (StatlistLo.empty()) {
4816     Mods |= SISrcMods::OP_SEL_1;
4817     return {Stat.first, Mods};
4818   }
4819 
4820   for (int I = StatlistHi.size() - 1; I >= 0; I--) {
4821     for (int J = StatlistLo.size() - 1; J >= 0; J--) {
4822       if (StatlistHi[I].first == StatlistLo[J].first &&
4823           isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
4824                         StatlistHi[I].first, RootReg, TII, MRI))
4825         return {StatlistHi[I].first,
4826                 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
4827     }
4828   }
4829   // Packed instructions do not have abs modifiers.
4830   Mods |= SISrcMods::OP_SEL_1;
4831 
4832   return {Stat.first, Mods};
4833 }
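// An end-to-end sketch of what the pairing loop can fold (vregs hypothetical):
//   %lo:_(s16) = G_TRUNC %x:_(s32)
//   %sh:_(s32) = G_LSHR %x:_(s32), 16
//   %hi:_(s16) = G_TRUNC %sh:_(s32)
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %lo:_(s16), %hi:_(s16)
// Both element chains reach %x (LoStat = IS_LOWER_HALF, HiStat =
// IS_UPPER_HALF), so the whole G_BUILD_VECTOR folds to %x with
// Mods = OP_SEL_1 and no packing instructions are emitted.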
4834 
4836 
4837 static bool checkRB(Register Reg, unsigned int RBNo,
4838                     const AMDGPURegisterBankInfo &RBI,
4839                     const MachineRegisterInfo &MRI,
4840                     const TargetRegisterInfo &TRI) {
4841   const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
4842   return RB->getID() == RBNo;
4843 }
4844 
4845 // This function is used to get the correct register bank for the returned reg.
4846 // Assume:
4847 // 1. VOP3P is always legal for VGPR.
4848 // 2. RootOp's regbank is legal.
4849 // Thus
4850 // 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
4851 // 2. If RootOp is VGPR, then NewOp must be VGPR.
4852 static Register getLegalRegBank(Register NewReg, Register RootReg,
4853                                 const AMDGPURegisterBankInfo &RBI,
4854                                 MachineRegisterInfo &MRI,
4855                                 const TargetRegisterInfo &TRI,
4856                                 const SIInstrInfo &TII) {
4857   // RootOp can only be VGPR or SGPR (some hand-written cases such as
4858   // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
4859   if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
4860       checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
4861     return NewReg;
4862 
4863   MachineInstr *MI = MRI.getVRegDef(RootReg);
4864   if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
4865     // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
4866     return RootReg;
4867   }
4868 
4869   MachineBasicBlock *BB = MI->getParent();
4870   Register DstReg = MRI.cloneVirtualRegister(RootReg);
4871 
4872   MachineInstrBuilder MIB =
4873       BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
4874           .addReg(NewReg);
4875 
4876   // Only accept VGPR.
4877   return MIB->getOperand(0).getReg();
4878 }
4879 
4880 InstructionSelector::ComplexRendererFns
4881 AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
4882                                                 bool IsDOT) const {
4883   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
4884   Register Reg;
4885   unsigned Mods;
4886   std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
4887 
4888   Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
4889   return {{
4890       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4891       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4892   }};
4893 }
4894 
4895 InstructionSelector::ComplexRendererFns
4896 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
4897 
4898   return selectVOP3PRetHelper(Root);
4899 }
4900 
4901 InstructionSelector::ComplexRendererFns
4902 AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
4903 
4904   return selectVOP3PRetHelper(Root, true);
4905 }
4906 
4907 InstructionSelector::ComplexRendererFns
4908 AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
4909   // Literal i1 value set in the intrinsic; represents SrcMods for the next
4910   // operand. The value is in the Imm operand as an i1 sign-extended to
4911   // int64_t. 1 (i.e. -1) promotes packed values to signed, 0 as unsigned.
4912   assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4913          "expected i1 value");
4914   unsigned Mods = SISrcMods::OP_SEL_1;
4915   if (Root.getImm() == -1)
4916     Mods ^= SISrcMods::NEG;
4917   return {{
4918       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4919   }};
4920 }
4921 
4922 InstructionSelector::ComplexRendererFns
4923 AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4924     MachineOperand &Root) const {
4925   assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4926          "expected i1 value");
4927   unsigned Mods = SISrcMods::OP_SEL_1;
4928   if (Root.getImm() != 0)
4929     Mods |= SISrcMods::OP_SEL_0;
4930 
4931   return {{
4932       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4933   }};
4934 }
4935 
4936 static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
4937                                  MachineInstr *InsertPt,
4938                                  MachineRegisterInfo &MRI) {
4939   const TargetRegisterClass *DstRegClass;
4940   switch (Elts.size()) {
4941   case 8:
4942     DstRegClass = &AMDGPU::VReg_256RegClass;
4943     break;
4944   case 4:
4945     DstRegClass = &AMDGPU::VReg_128RegClass;
4946     break;
4947   case 2:
4948     DstRegClass = &AMDGPU::VReg_64RegClass;
4949     break;
4950   default:
4951     llvm_unreachable("unhandled Reg sequence size");
4952   }
4953 
4954   MachineIRBuilder B(*InsertPt);
4955   auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
4956                  .addDef(MRI.createVirtualRegister(DstRegClass));
4957   for (unsigned i = 0; i < Elts.size(); ++i) {
4958     MIB.addReg(Elts[i]);
4959     MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
4960   }
4961   return MIB->getOperand(0).getReg();
4962 }
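// For example (a sketch), four VGPR_32 elements produce:
//   %seq:vreg_128 = REG_SEQUENCE %e0, %subreg.sub0, %e1, %subreg.sub1,
//                                %e2, %subreg.sub2, %e3, %subreg.sub3
// where getSubRegFromChannel(i) supplies the subregister index for channel i.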
4963 
4964 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
4965                                  SmallVectorImpl<Register> &Elts, Register &Src,
4966                                  MachineInstr *InsertPt,
4967                                  MachineRegisterInfo &MRI) {
4968   if (ModOpcode == TargetOpcode::G_FNEG) {
4969     Mods |= SISrcMods::NEG;
4970     // Check if all elements also have abs modifier
4971     SmallVector<Register, 8> NegAbsElts;
4972     for (auto El : Elts) {
4973       Register FabsSrc;
4974       if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
4975         break;
4976       NegAbsElts.push_back(FabsSrc);
4977     }
4978     if (Elts.size() != NegAbsElts.size()) {
4979       // Neg
4980       Src = buildRegSequence(Elts, InsertPt, MRI);
4981     } else {
4982       // Neg and Abs
4983       Mods |= SISrcMods::NEG_HI;
4984       Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
4985     }
4986   } else {
4987     assert(ModOpcode == TargetOpcode::G_FABS);
4988     // Abs
4989     Mods |= SISrcMods::NEG_HI;
4990     Src = buildRegSequence(Elts, InsertPt, MRI);
4991   }
4992 }
4993 
4994 InstructionSelector::ComplexRendererFns
4995 AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
4996   Register Src = Root.getReg();
4997   unsigned Mods = SISrcMods::OP_SEL_1;
4998   SmallVector<Register, 8> EltsF32;
4999 
5000   if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5001     assert(BV->getNumSources() > 0);
5002     // Based on first element decide which mod we match, neg or abs
5003     MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5004     unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5005                              ? AMDGPU::G_FNEG
5006                              : AMDGPU::G_FABS;
5007     for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5008       ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5009       if (ElF32->getOpcode() != ModOpcode)
5010         break;
5011       EltsF32.push_back(ElF32->getOperand(1).getReg());
5012     }
5013 
5014     // All elements had ModOpcode modifier
5015     if (BV->getNumSources() == EltsF32.size()) {
5016       selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5017                            *MRI);
5018     }
5019   }
5020 
5021   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5022            [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5023 }
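// For instance (hypothetical MIR): if every G_BUILD_VECTOR source is a G_FNEG
// of a G_FABS, the vector folds to a REG_SEQUENCE of the inner fabs sources
// with Mods = OP_SEL_1 | NEG | NEG_HI, applying -|v| lane-wise in the WMMA
// instruction itself.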
5024 
5025 InstructionSelector::ComplexRendererFns
5026 AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5027   Register Src = Root.getReg();
5028   unsigned Mods = SISrcMods::OP_SEL_1;
5029   SmallVector<Register, 8> EltsV2F16;
5030 
5031   if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5032     for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5033       Register FNegSrc;
5034       if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5035         break;
5036       EltsV2F16.push_back(FNegSrc);
5037     }
5038 
5039     // All elements had ModOpcode modifier
5040     if (CV->getNumSources() == EltsV2F16.size()) {
5041       Mods |= SISrcMods::NEG;
5042       Mods |= SISrcMods::NEG_HI;
5043       Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5044     }
5045   }
5046 
5047   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5048            [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5049 }
5050 
5051 InstructionSelector::ComplexRendererFns
5052 AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5053   Register Src = Root.getReg();
5054   unsigned Mods = SISrcMods::OP_SEL_1;
5055   SmallVector<Register, 8> EltsV2F16;
5056 
5057   if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5058     assert(CV->getNumSources() > 0);
5059     MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5060     // Based on first element decide which mod we match, neg or abs
5061     unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5062                              ? AMDGPU::G_FNEG
5063                              : AMDGPU::G_FABS;
5064 
5065     for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5066       ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5067       if (ElV2F16->getOpcode() != ModOpcode)
5068         break;
5069       EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5070     }
5071 
5072     // All elements had ModOpcode modifier
5073     if (CV->getNumSources() == EltsV2F16.size()) {
5074       MachineIRBuilder B(*Root.getParent());
5075       selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5076                            *MRI);
5077     }
5078   }
5079 
5080   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5081            [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5082 }
5083 
5084 InstructionSelector::ComplexRendererFns
5085 AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5086   std::optional<FPValueAndVReg> FPValReg;
5087   if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5088     if (TII.isInlineConstant(FPValReg->Value)) {
5089       return {{[=](MachineInstrBuilder &MIB) {
5090         MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5091       }}};
5092     }
5093     // Non-inlineable splat floats should not fall through to the integer
5094     // immediate checks.
5095     return {};
5096   }
5097 
5098   APInt ICst;
5099   if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5100     if (TII.isInlineConstant(ICst)) {
5101       return {
5102           {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5103     }
5104   }
5105 
5106   return {};
5107 }
5108 
5109 InstructionSelector::ComplexRendererFns
5110 AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5111   Register Src =
5112       getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5113   unsigned Key = 0;
5114 
5115   Register ShiftSrc;
5116   std::optional<ValueAndVReg> ShiftAmt;
5117   if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5118       MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5119       ShiftAmt->Value.getZExtValue() % 8 == 0) {
5120     Key = ShiftAmt->Value.getZExtValue() / 8;
5121     Src = ShiftSrc;
5122   }
5123 
5124   return {{
5125       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5126       [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5127   }};
5128 }
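// E.g. (a sketch): for `%idx:_(s32) = G_LSHR %packed:_(s32), 16`, the shift
// folds away and this renders {%packed, index_key = 2}, selecting byte 2 of
// the packed index register.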
5129 
5130 InstructionSelector::ComplexRendererFns
5131 AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5132 
5133   Register Src =
5134       getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5135   unsigned Key = 0;
5136 
5137   Register ShiftSrc;
5138   std::optional<ValueAndVReg> ShiftAmt;
5139   if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5140       MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5141       ShiftAmt->Value.getZExtValue() == 16) {
5142     Src = ShiftSrc;
5143     Key = 1;
5144   }
5145 
5146   return {{
5147       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5148       [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5149   }};
5150 }
5151 
5152 InstructionSelector::ComplexRendererFns
5153 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5154   Register Src;
5155   unsigned Mods;
5156   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5157 
5158   // FIXME: Handle op_sel
5159   return {{
5160       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5161       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5162   }};
5163 }
5164 
5165 // FIXME-TRUE16 remove when fake16 is removed
5166 InstructionSelector::ComplexRendererFns
5167 AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5168   Register Src;
5169   unsigned Mods;
5170   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5171                                            /*IsCanonicalizing=*/true,
5172                                            /*AllowAbs=*/false,
5173                                            /*OpSel=*/false);
5174 
5175   return {{
5176       [=](MachineInstrBuilder &MIB) {
5177         MIB.addReg(
5178             copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5179       },
5180       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5181   }};
5182 }
5183 
5184 InstructionSelector::ComplexRendererFns
5185 AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5186   Register Src;
5187   unsigned Mods;
5188   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5189                                            /*IsCanonicalizing=*/true,
5190                                            /*AllowAbs=*/false,
5191                                            /*OpSel=*/true);
5192 
5193   return {{
5194       [=](MachineInstrBuilder &MIB) {
5195         MIB.addReg(
5196             copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5197       },
5198       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5199   }};
5200 }
5201 
5202 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5203                                                  Register &Base,
5204                                                  Register *SOffset,
5205                                                  int64_t *Offset) const {
5206   MachineInstr *MI = Root.getParent();
5207   MachineBasicBlock *MBB = MI->getParent();
5208 
5209   // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits;
5210   // then we can select all ptr + 32-bit offsets.
5211   SmallVector<GEPInfo, 4> AddrInfo;
5212   getAddrModeInfo(*MI, *MRI, AddrInfo);
5213 
5214   if (AddrInfo.empty())
5215     return false;
5216 
5217   const GEPInfo &GEPI = AddrInfo[0];
5218   std::optional<int64_t> EncodedImm;
5219 
5220   if (SOffset && Offset) {
5221     EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5222                                               /*HasSOffset=*/true);
5223     if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5224         AddrInfo.size() > 1) {
5225       const GEPInfo &GEPI2 = AddrInfo[1];
5226       if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5227         if (Register OffsetReg =
5228                 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
5229           Base = GEPI2.SgprParts[0];
5230           *SOffset = OffsetReg;
5231           *Offset = *EncodedImm;
5232           if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5233             return true;
5234 
5235           // For unbuffered smem loads, it is illegal for the Immediate Offset
5236           // to be negative if the resulting (Offset + (M0 or SOffset or zero))
5237           // is negative. Handle the case where the Immediate Offset + SOffset
5238           // is negative.
5239           auto SKnown = VT->getKnownBits(*SOffset);
5240           if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5241             return false;
5242 
5243           return true;
5244         }
5245       }
5246     }
5247     return false;
5248   }
5249 
5250   EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5251                                             /*HasSOffset=*/false);
5252   if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5253     Base = GEPI.SgprParts[0];
5254     *Offset = *EncodedImm;
5255     return true;
5256   }
5257 
5258   // SGPR offset is unsigned.
5259   if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5260       GEPI.Imm != 0) {
5261     // If we make it this far we have a load with a 32-bit immediate offset.
5262     // It is OK to select this using a sgpr offset, because we have already
5263     // failed trying to select this load into one of the _IMM variants since
5264     // the _IMM Patterns are considered before the _SGPR patterns.
5265     Base = GEPI.SgprParts[0];
5266     *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5267     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5268         .addImm(GEPI.Imm);
5269     return true;
5270   }
5271 
5272   if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5273     if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
5274       Base = GEPI.SgprParts[0];
5275       *SOffset = OffsetReg;
5276       return true;
5277     }
5278   }
5279 
5280   return false;
5281 }
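// The three callers below exercise the three modes of selectSmrdOffset:
// selectSmrdImm passes only Offset (the _IMM forms), selectSmrdSgpr passes
// only SOffset (the _SGPR forms), and selectSmrdSgprImm passes both (the
// _SGPR_IMM forms).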
5282 
5283 InstructionSelector::ComplexRendererFns
5284 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5285   Register Base;
5286   int64_t Offset;
5287   if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
5288     return std::nullopt;
5289 
5290   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5291            [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5292 }
5293 
5294 InstructionSelector::ComplexRendererFns
5295 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5296   SmallVector<GEPInfo, 4> AddrInfo;
5297   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5298 
5299   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5300     return std::nullopt;
5301 
5302   const GEPInfo &GEPInfo = AddrInfo[0];
5303   Register PtrReg = GEPInfo.SgprParts[0];
5304   std::optional<int64_t> EncodedImm =
5305       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5306   if (!EncodedImm)
5307     return std::nullopt;
5308 
5309   return {{
5310     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5311     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5312   }};
5313 }
5314 
5315 InstructionSelector::ComplexRendererFns
5316 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5317   Register Base, SOffset;
5318   if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
5319     return std::nullopt;
5320 
5321   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5322            [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5323 }
5324 
5325 InstructionSelector::ComplexRendererFns
5326 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5327   Register Base, SOffset;
5328   int64_t Offset;
5329   if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
5330     return std::nullopt;
5331 
5332   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5333            [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5334            [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5335 }
5336 
5337 std::pair<Register, int>
5338 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5339                                                 uint64_t FlatVariant) const {
5340   MachineInstr *MI = Root.getParent();
5341 
5342   auto Default = std::pair(Root.getReg(), 0);
5343 
5344   if (!STI.hasFlatInstOffsets())
5345     return Default;
5346 
5347   Register PtrBase;
5348   int64_t ConstOffset;
5349   std::tie(PtrBase, ConstOffset) =
5350       getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5351 
5352   if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
5353                            !isFlatScratchBaseLegal(Root.getReg())))
5354     return Default;
5355 
5356   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5357   if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5358     return Default;
5359 
5360   return std::pair(PtrBase, ConstOffset);
5361 }
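// A sketch of the fold this enables: for an access at %base + N where N is a
// legal FLAT offset for the subtarget, the returned pair is {%base, N} and
// the addend lands in the instruction's immediate field instead of a separate
// VALU add.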
5362 
5363 InstructionSelector::ComplexRendererFns
5364 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5365   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5366 
5367   return {{
5368       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5369       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5370     }};
5371 }
5372 
5373 InstructionSelector::ComplexRendererFns
5374 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5375   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5376 
5377   return {{
5378       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5379       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5380   }};
5381 }
5382 
5383 InstructionSelector::ComplexRendererFns
5384 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5385   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5386 
5387   return {{
5388       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5389       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5390     }};
5391 }
5392 
5393 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5394 InstructionSelector::ComplexRendererFns
5395 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
5396   Register Addr = Root.getReg();
5397   Register PtrBase;
5398   int64_t ConstOffset;
5399   int64_t ImmOffset = 0;
5400 
5401   // Match the immediate offset first, which canonically is moved as low as
5402   // possible.
5403   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
5404 
5405   if (ConstOffset != 0) {
5406     if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5407                               SIInstrFlags::FlatGlobal)) {
5408       Addr = PtrBase;
5409       ImmOffset = ConstOffset;
5410     } else {
5411       auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5412       if (isSGPR(PtrBaseDef->Reg)) {
5413         if (ConstOffset > 0) {
5414           // Offset is too large.
5415           //
5416           // saddr + large_offset -> saddr +
5417           //                         (voffset = large_offset & ~MaxOffset) +
5418           //                         (large_offset & MaxOffset);
5419           int64_t SplitImmOffset, RemainderOffset;
5420           std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
5421               ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
5422 
5423           if (isUInt<32>(RemainderOffset)) {
5424             MachineInstr *MI = Root.getParent();
5425             MachineBasicBlock *MBB = MI->getParent();
5426             Register HighBits =
5427                 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5428 
5429             BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5430                     HighBits)
5431                 .addImm(RemainderOffset);
5432 
5433             return {{
5434                 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5435                 [=](MachineInstrBuilder &MIB) {
5436                   MIB.addReg(HighBits);
5437                 }, // voffset
5438                 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5439             }};
5440           }
5441         }
5442 
5443         // We are adding a 64-bit SGPR and a constant. If the constant bus limit
5444         // is 1 we would need to perform 1 or 2 extra moves for each half of
5445         // the constant and it is better to do a scalar add and then issue a
5446         // single VALU instruction to materialize zero. Otherwise it takes fewer
5447         // instructions to perform VALU adds with immediates or inline literals.
5448         unsigned NumLiterals =
5449             !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5450             !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5451         if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5452           return std::nullopt;
5453       }
5454     }
5455   }
5456 
5457   // Match the variable offset.
5458   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5459   if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5460     // Look through the SGPR->VGPR copy.
5461     Register SAddr =
5462         getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
5463 
5464     if (isSGPR(SAddr)) {
5465       Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
5466 
5467       // It's possible voffset is an SGPR here, but the copy to VGPR will be
5468       // inserted later.
5469       if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
5470         return {{[=](MachineInstrBuilder &MIB) { // saddr
5471                    MIB.addReg(SAddr);
5472                  },
5473                  [=](MachineInstrBuilder &MIB) { // voffset
5474                    MIB.addReg(VOffset);
5475                  },
5476                  [=](MachineInstrBuilder &MIB) { // offset
5477                    MIB.addImm(ImmOffset);
5478                  }}};
5479       }
5480     }
5481   }
5482 
5483   // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
5484   // drop this.
5485   if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
5486       AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
5487     return std::nullopt;
5488 
5489   // It's cheaper to materialize a single 32-bit zero for vaddr than the two
5490   // moves required to copy a 64-bit SGPR to VGPR.
5491   MachineInstr *MI = Root.getParent();
5492   MachineBasicBlock *MBB = MI->getParent();
5493   Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5494 
5495   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
5496       .addImm(0);
5497 
5498   return {{
5499       [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5500       [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },      // voffset
5501       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }     // offset
5502   }};
5503 }
5504 
5505 InstructionSelector::ComplexRendererFns
5506 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
5507   Register Addr = Root.getReg();
5508   Register PtrBase;
5509   int64_t ConstOffset;
5510   int64_t ImmOffset = 0;
5511 
5512   // Match the immediate offset first, which canonically is moved as low as
5513   // possible.
5514   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
5515 
5516   if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
5517       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5518                             SIInstrFlags::FlatScratch)) {
5519     Addr = PtrBase;
5520     ImmOffset = ConstOffset;
5521   }
5522 
5523   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5524   if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5525     int FI = AddrDef->MI->getOperand(1).getIndex();
5526     return {{
5527         [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5528         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5529     }};
5530   }
5531 
5532   Register SAddr = AddrDef->Reg;
5533 
5534   if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5535     Register LHS = AddrDef->MI->getOperand(1).getReg();
5536     Register RHS = AddrDef->MI->getOperand(2).getReg();
5537     auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5538     auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
5539 
5540     if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
5541         isSGPR(RHSDef->Reg)) {
5542       int FI = LHSDef->MI->getOperand(1).getIndex();
5543       MachineInstr &I = *Root.getParent();
5544       MachineBasicBlock *BB = I.getParent();
5545       const DebugLoc &DL = I.getDebugLoc();
5546       SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5547 
5548       BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
5549           .addFrameIndex(FI)
5550           .addReg(RHSDef->Reg)
5551           .setOperandDead(3); // Dead scc
5552     }
5553   }
5554 
5555   if (!isSGPR(SAddr))
5556     return std::nullopt;
5557 
5558   return {{
5559       [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
5560       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5561   }};
5562 }
5563 
5564 // Check whether the flat scratch SVS swizzle bug affects this access.
5565 bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
5566     Register VAddr, Register SAddr, uint64_t ImmOffset) const {
5567   if (!Subtarget->hasFlatScratchSVSSwizzleBug())
5568     return false;
5569 
5570   // The bug affects the swizzling of SVS accesses if there is any carry out
5571   // from the two low order bits (i.e. from bit 1 into bit 2) when adding
5572   // voffset to (soffset + inst_offset).
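  //
  // For illustration: if (VMax & 3) == 3 and (SMax & 3) == 2, the low two
  // bits may sum to 5 >= 4, so a carry into bit 2 cannot be ruled out and the
  // check below conservatively reports the bug as applicable.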
5573   auto VKnown = VT->getKnownBits(VAddr);
5574   auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
5575                                KnownBits::makeConstant(APInt(32, ImmOffset)));
5576   uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5577   uint64_t SMax = SKnown.getMaxValue().getZExtValue();
5578   return (VMax & 3) + (SMax & 3) >= 4;
5579 }
5580 
5581 InstructionSelector::ComplexRendererFns
5582 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
5583   Register Addr = Root.getReg();
5584   Register PtrBase;
5585   int64_t ConstOffset;
5586   int64_t ImmOffset = 0;
5587 
5588   // Match the immediate offset first, which canonically is moved as low as
5589   // possible.
5590   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
5591 
5592   Register OrigAddr = Addr;
5593   if (ConstOffset != 0 &&
5594       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5595                             SIInstrFlags::FlatScratch)) {
5596     Addr = PtrBase;
5597     ImmOffset = ConstOffset;
5598   }
5599 
5600   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5601   if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
5602     return std::nullopt;
5603 
5604   Register RHS = AddrDef->MI->getOperand(2).getReg();
5605   if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
5606     return std::nullopt;
5607 
5608   Register LHS = AddrDef->MI->getOperand(1).getReg();
5609   auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5610 
5611   if (OrigAddr != Addr) {
5612     if (!isFlatScratchBaseLegalSVImm(OrigAddr))
5613       return std::nullopt;
5614   } else {
5615     if (!isFlatScratchBaseLegalSV(OrigAddr))
5616       return std::nullopt;
5617   }
5618 
5619   if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
5620     return std::nullopt;
5621 
5622   if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5623     int FI = LHSDef->MI->getOperand(1).getIndex();
5624     return {{
5625         [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5626         [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5627         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5628     }};
5629   }
5630 
5631   if (!isSGPR(LHS))
5632     return std::nullopt;
5633 
5634   return {{
5635       [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5636       [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
5637       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5638   }};
5639 }
5640 
5641 InstructionSelector::ComplexRendererFns
5642 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
5643   MachineInstr *MI = Root.getParent();
5644   MachineBasicBlock *MBB = MI->getParent();
5645   MachineFunction *MF = MBB->getParent();
5646   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5647 
5648   int64_t Offset = 0;
5649   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
5650       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
5651     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5652 
5653     // TODO: Should this be inside the render function? The iterator seems to
5654     // move.
5655     const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
5656     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5657             HighBits)
5658         .addImm(Offset & ~MaxOffset);
5659 
5660     return {{[=](MachineInstrBuilder &MIB) { // rsrc
5661                MIB.addReg(Info->getScratchRSrcReg());
5662              },
5663              [=](MachineInstrBuilder &MIB) { // vaddr
5664                MIB.addReg(HighBits);
5665              },
5666              [=](MachineInstrBuilder &MIB) { // soffset
5667                // Use constant zero for soffset and rely on eliminateFrameIndex
5668                // to choose the appropriate frame register if need be.
5669                MIB.addImm(0);
5670              },
5671              [=](MachineInstrBuilder &MIB) { // offset
5672                MIB.addImm(Offset & MaxOffset);
5673              }}};
5674   }
5675 
5676   assert(Offset == 0 || Offset == -1);
5677 
5678   // Try to fold a frame index directly into the MUBUF vaddr field, and any
5679   // offsets.
5680   std::optional<int> FI;
5681   Register VAddr = Root.getReg();
5682 
5683   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5684   Register PtrBase;
5685   int64_t ConstOffset;
5686   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
5687   if (ConstOffset != 0) {
5688     if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
5689         (!STI.privateMemoryResourceIsRangeChecked() ||
5690          VT->signBitIsZero(PtrBase))) {
5691       const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
5692       if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
5693         FI = PtrBaseDef->getOperand(1).getIndex();
5694       else
5695         VAddr = PtrBase;
5696       Offset = ConstOffset;
5697     }
5698   } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5699     FI = RootDef->getOperand(1).getIndex();
5700   }
5701 
5702   return {{[=](MachineInstrBuilder &MIB) { // rsrc
5703              MIB.addReg(Info->getScratchRSrcReg());
5704            },
5705            [=](MachineInstrBuilder &MIB) { // vaddr
5706              if (FI)
5707                MIB.addFrameIndex(*FI);
5708              else
5709                MIB.addReg(VAddr);
5710            },
5711            [=](MachineInstrBuilder &MIB) { // soffset
5712              // Use constant zero for soffset and rely on eliminateFrameIndex
5713              // to choose the appropriate frame register if need be.
5714              MIB.addImm(0);
5715            },
5716            [=](MachineInstrBuilder &MIB) { // offset
5717              MIB.addImm(Offset);
5718            }}};
5719 }
5720 
5721 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
5722                                                 int64_t Offset) const {
5723   if (!isUInt<16>(Offset))
5724     return false;
5725 
5726   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5727     return true;
5728 
5729   // On Southern Islands, instructions with a negative base value and an
5730   // offset don't seem to work.
5731   return VT->signBitIsZero(Base);
5732 }
5733 
5734 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
5735                                                  int64_t Offset1,
5736                                                  unsigned Size) const {
5737   if (Offset0 % Size != 0 || Offset1 % Size != 0)
5738     return false;
5739   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
5740     return false;
5741 
5742   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5743     return true;
5744 
5745   // On Southern Islands, instructions with a negative base value and an
5746   // offset don't seem to work.
5747   return VT->signBitIsZero(Base);
5748 }
5749 
5750 // Return whether the operation has NoUnsignedWrap property.
5751 static bool isNoUnsignedWrap(MachineInstr *Addr) {
5752   return Addr->getOpcode() == TargetOpcode::G_OR ||
5753          (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
5754           Addr->getFlag(MachineInstr::NoUWrap));
5755 }
5756 
5757 // Check that the base address of a flat scratch load/store, in the form of
5758 // `base + offset`, is legal to be put in an SGPR/VGPR (i.e. unsigned per the
5759 // hardware requirement). We always treat the first operand as the base here.
5760 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
5761   MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5762 
5763   if (isNoUnsignedWrap(AddrMI))
5764     return true;
5765 
5766   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5767   // values.
5768   if (STI.hasSignedScratchOffsets())
5769     return true;
5770 
5771   Register LHS = AddrMI->getOperand(1).getReg();
5772   Register RHS = AddrMI->getOperand(2).getReg();
5773 
5774   if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
5775     std::optional<ValueAndVReg> RhsValReg =
5776         getIConstantVRegValWithLookThrough(RHS, *MRI);
5777     // If the immediate offset is negative and within certain range, the base
5778     // address cannot also be negative. If the base is also negative, the sum
5779     // would be either negative or much larger than the valid range of scratch
5780     // memory a thread can access.
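    //
    // For illustration: with an immediate offset of, say, -16 (which is
    // greater than -0x40000000), a legal scratch access needs base >= 16, so
    // the base is provably non-negative and no sign-bit check is required.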
5781     if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
5782         RhsValReg->Value.getSExtValue() > -0x40000000)
5783       return true;
5784   }
5785 
5786   return VT->signBitIsZero(LHS);
5787 }
5788 
5789 // Check that the address values in the SGPR/VGPR are legal for flat scratch
5790 // in the form: SGPR + VGPR.
5791 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
5792   MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5793 
5794   if (isNoUnsignedWrap(AddrMI))
5795     return true;
5796 
5797   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5798   // values.
5799   if (STI.hasSignedScratchOffsets())
5800     return true;
5801 
5802   Register LHS = AddrMI->getOperand(1).getReg();
5803   Register RHS = AddrMI->getOperand(2).getReg();
5804   return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
5805 }
5806 
5807 // Check that the address values in the SGPR/VGPR are legal for flat scratch
5808 // in the form: SGPR + VGPR + Imm.
5809 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
5810     Register Addr) const {
5811   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5812   // values.
5813   if (STI.hasSignedScratchOffsets())
5814     return true;
5815 
5816   MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5817   Register Base = AddrMI->getOperand(1).getReg();
5818   std::optional<DefinitionAndSourceRegister> BaseDef =
5819       getDefSrcRegIgnoringCopies(Base, *MRI);
5820   std::optional<ValueAndVReg> RHSOffset =
5821       getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
5822   assert(RHSOffset);
5823 
5824   // If the immediate offset is negative and within certain range, the base
5825   // address cannot also be negative. If the base is also negative, the sum
5826   // would be either negative or much larger than the valid range of scratch
5827   // memory a thread can access.
5828   if (isNoUnsignedWrap(BaseDef->MI) &&
5829       (isNoUnsignedWrap(AddrMI) ||
5830        (RHSOffset->Value.getSExtValue() < 0 &&
5831         RHSOffset->Value.getSExtValue() > -0x40000000)))
5832     return true;
5833 
5834   Register LHS = BaseDef->MI->getOperand(1).getReg();
5835   Register RHS = BaseDef->MI->getOperand(2).getReg();
5836   return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
5837 }
5838 
5839 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
5840                                                     unsigned ShAmtBits) const {
5841   assert(MI.getOpcode() == TargetOpcode::G_AND);
5842 
5843   std::optional<APInt> RHS =
5844       getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
5845   if (!RHS)
5846     return false;
5847 
5848   if (RHS->countr_one() >= ShAmtBits)
5849     return true;
5850 
5851   const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
5852   return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
5853 }
5854 
5855 InstructionSelector::ComplexRendererFns
5856 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
5857     MachineOperand &Root) const {
5858   Register Reg = Root.getReg();
5859   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5860 
5861   std::optional<DefinitionAndSourceRegister> Def =
5862     getDefSrcRegIgnoringCopies(Reg, *MRI);
5863   assert(Def && "this shouldn't be an optional result");
5864   Reg = Def->Reg;
5865 
5866   if (Register WaveBase = getWaveAddress(Def->MI)) {
5867     return {{
5868         [=](MachineInstrBuilder &MIB) { // rsrc
5869           MIB.addReg(Info->getScratchRSrcReg());
5870         },
5871         [=](MachineInstrBuilder &MIB) { // soffset
5872           MIB.addReg(WaveBase);
5873         },
5874         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
5875     }};
5876   }
5877 
5878   int64_t Offset = 0;
5879 
5880   // FIXME: Copy check is a hack
5881   Register BasePtr;
5882   if (mi_match(Reg, *MRI,
5883                m_GPtrAdd(m_Reg(BasePtr),
5884                          m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
5885     if (!TII.isLegalMUBUFImmOffset(Offset))
5886       return {};
5887     MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
5888     Register WaveBase = getWaveAddress(BasePtrDef);
5889     if (!WaveBase)
5890       return {};
5891 
5892     return {{
5893         [=](MachineInstrBuilder &MIB) { // rsrc
5894           MIB.addReg(Info->getScratchRSrcReg());
5895         },
5896         [=](MachineInstrBuilder &MIB) { // soffset
5897           MIB.addReg(WaveBase);
5898         },
5899         [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5900     }};
5901   }
5902 
5903   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
5904       !TII.isLegalMUBUFImmOffset(Offset))
5905     return {};
5906 
5907   return {{
5908       [=](MachineInstrBuilder &MIB) { // rsrc
5909         MIB.addReg(Info->getScratchRSrcReg());
5910       },
5911       [=](MachineInstrBuilder &MIB) { // soffset
5912         MIB.addImm(0);
5913       },
5914       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5915   }};
5916 }
5917 
5918 std::pair<Register, unsigned>
5919 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
5920   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5921   int64_t ConstAddr = 0;
5922 
5923   Register PtrBase;
5924   int64_t Offset;
5925   std::tie(PtrBase, Offset) =
5926     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5927 
5928   if (Offset) {
5929     if (isDSOffsetLegal(PtrBase, Offset)) {
5930       // (add n0, c0)
5931       return std::pair(PtrBase, Offset);
5932     }
5933   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5934     // TODO
5935 
5936 
5937   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5938     // TODO
5939 
5940   }
5941 
5942   return std::pair(Root.getReg(), 0);
5943 }
5944 
5945 InstructionSelector::ComplexRendererFns
5946 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
5947   Register Reg;
5948   unsigned Offset;
5949   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
5950   return {{
5951       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5952       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
5953     }};
5954 }
5955 
5956 InstructionSelector::ComplexRendererFns
5957 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
5958   return selectDSReadWrite2(Root, 4);
5959 }
5960 
5961 InstructionSelector::ComplexRendererFns
5962 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
5963   return selectDSReadWrite2(Root, 8);
5964 }
5965 
5966 InstructionSelector::ComplexRendererFns
5967 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
5968                                               unsigned Size) const {
5969   Register Reg;
5970   unsigned Offset;
5971   std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
5972   return {{
5973       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5974       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5975       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
5976     }};
5977 }
5978 
5979 std::pair<Register, unsigned>
5980 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
5981                                                   unsigned Size) const {
5982   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5983   int64_t ConstAddr = 0;
5984 
5985   Register PtrBase;
5986   int64_t Offset;
5987   std::tie(PtrBase, Offset) =
5988     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5989 
5990   if (Offset) {
5991     int64_t OffsetValue0 = Offset;
5992     int64_t OffsetValue1 = Offset + Size;
5993     if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
5994       // (add n0, c0)
5995       return std::pair(PtrBase, OffsetValue0 / Size);
5996     }
5997   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5998     // TODO
5999 
6000   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6001     // TODO
6002 
6003   }
6004 
6005   return std::pair(Root.getReg(), 0);
6006 }
6007 
6008 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6009 /// the base value with the constant offset. There may be intervening copies
6010 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
6011 /// not match the pattern.
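///
/// For example, given MIR along the lines of (a hypothetical snippet):
///   %c:_(s64) = G_CONSTANT i64 16
///   %p:_(p1) = G_PTR_ADD %base, %c
/// a query on %p returns {%base, 16}, even if copies intervene.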
6012 std::pair<Register, int64_t>
6013 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6014   Register Root, const MachineRegisterInfo &MRI) const {
6015   MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6016   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6017     return {Root, 0};
6018 
6019   MachineOperand &RHS = RootI->getOperand(2);
6020   std::optional<ValueAndVReg> MaybeOffset =
6021       getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
6022   if (!MaybeOffset)
6023     return {Root, 0};
6024   return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
6025 }
6026 
6027 static void addZeroImm(MachineInstrBuilder &MIB) {
6028   MIB.addImm(0);
6029 }
6030 
6031 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6032 /// BasePtr is not valid, a null base pointer will be used.
6033 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6034                           uint32_t FormatLo, uint32_t FormatHi,
6035                           Register BasePtr) {
6036   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6037   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6038   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6039   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6040 
6041   B.buildInstr(AMDGPU::S_MOV_B32)
6042     .addDef(RSrc2)
6043     .addImm(FormatLo);
6044   B.buildInstr(AMDGPU::S_MOV_B32)
6045     .addDef(RSrc3)
6046     .addImm(FormatHi);
6047 
6048   // Build the register half containing the constants before building the
6049   // full 128-bit register. If we are building multiple resource descriptors,
6050   // this will allow CSEing of the 2-component register.
6051   B.buildInstr(AMDGPU::REG_SEQUENCE)
6052     .addDef(RSrcHi)
6053     .addReg(RSrc2)
6054     .addImm(AMDGPU::sub0)
6055     .addReg(RSrc3)
6056     .addImm(AMDGPU::sub1);
6057 
6058   Register RSrcLo = BasePtr;
6059   if (!BasePtr) {
6060     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6061     B.buildInstr(AMDGPU::S_MOV_B64)
6062       .addDef(RSrcLo)
6063       .addImm(0);
6064   }
6065 
6066   B.buildInstr(AMDGPU::REG_SEQUENCE)
6067     .addDef(RSrc)
6068     .addReg(RSrcLo)
6069     .addImm(AMDGPU::sub0_sub1)
6070     .addReg(RSrcHi)
6071     .addImm(AMDGPU::sub2_sub3);
6072 
6073   return RSrc;
6074 }
6075 
6076 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6077                                 const SIInstrInfo &TII, Register BasePtr) {
6078   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6079 
6080   // FIXME: Why are half the "default" bits ignored based on the addressing
6081   // mode?
6082   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6083 }
6084 
6085 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6086                                const SIInstrInfo &TII, Register BasePtr) {
6087   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6088 
6089   // FIXME: Why are half the "default" bits ignored based on the addressing
6090   // mode?
6091   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6092 }
6093 
6094 AMDGPUInstructionSelector::MUBUFAddressData
6095 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6096   MUBUFAddressData Data;
6097   Data.N0 = Src;
6098 
6099   Register PtrBase;
6100   int64_t Offset;
6101 
6102   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
6103   if (isUInt<32>(Offset)) {
6104     Data.N0 = PtrBase;
6105     Data.Offset = Offset;
6106   }
6107 
6108   if (MachineInstr *InputAdd
6109       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6110     Data.N2 = InputAdd->getOperand(1).getReg();
6111     Data.N3 = InputAdd->getOperand(2).getReg();
6112 
6113     // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted.
6114     // FIXME: We don't know that this was defined by operand 0.
6115     //
6116     // TODO: Remove this when we have copy folding optimizations after
6117     // RegBankSelect.
6118     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6119     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6120   }
6121 
6122   return Data;
6123 }
6124 
6125 /// Return whether the addr64 mubuf mode should be used for the given address.
6126 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6127   // (ptr_add N2, N3) -> addr64, or
6128   // (ptr_add (ptr_add N2, N3), C1) -> addr64
6129   if (Addr.N2)
6130     return true;
6131 
6132   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6133   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6134 }
6135 
6136 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
6137 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6138 /// component.
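///
/// For illustration only, with a hypothetical out-of-range value: an ImmOffset
/// of 0x100000 that fails TII.isLegalMUBUFImmOffset is moved wholesale into a
/// fresh SGPR via S_MOV_B32 (becoming SOffset), and ImmOffset is reset to 0.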
6139 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6140   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6141   if (TII.isLegalMUBUFImmOffset(ImmOffset))
6142     return;
6143 
6144   // Illegal offset, store it in soffset.
6145   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6146   B.buildInstr(AMDGPU::S_MOV_B32)
6147     .addDef(SOffset)
6148     .addImm(ImmOffset);
6149   ImmOffset = 0;
6150 }
6151 
6152 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6153   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6154   Register &SOffset, int64_t &Offset) const {
6155   // FIXME: Predicates should stop this from reaching here.
6156   // The addr64 bit was removed for Volcanic Islands.
6157   if (!STI.hasAddr64() || STI.useFlatForGlobal())
6158     return false;
6159 
6160   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6161   if (!shouldUseAddr64(AddrData))
6162     return false;
6163 
6164   Register N0 = AddrData.N0;
6165   Register N2 = AddrData.N2;
6166   Register N3 = AddrData.N3;
6167   Offset = AddrData.Offset;
6168 
6169   // Base pointer for the SRD.
6170   Register SRDPtr;
6171 
6172   if (N2) {
6173     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6174       assert(N3);
6175       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6176         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6177         // addr64, and construct the default resource from a 0 address.
6178         VAddr = N0;
6179       } else {
6180         SRDPtr = N3;
6181         VAddr = N2;
6182       }
6183     } else {
6184       // N2 is not divergent.
6185       SRDPtr = N2;
6186       VAddr = N3;
6187     }
6188   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6189     // Use the default null pointer in the resource
6190     VAddr = N0;
6191   } else {
6192     // N0 -> offset, or
6193     // (N0 + C1) -> offset
6194     SRDPtr = N0;
6195   }
6196 
6197   MachineIRBuilder B(*Root.getParent());
6198   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6199   splitIllegalMUBUFOffset(B, SOffset, Offset);
6200   return true;
6201 }
6202 
6203 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6204   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6205   int64_t &Offset) const {
6206 
6207   // FIXME: Pattern should not reach here.
6208   if (STI.useFlatForGlobal())
6209     return false;
6210 
6211   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6212   if (shouldUseAddr64(AddrData))
6213     return false;
6214 
6215   // N0 -> offset, or
6216   // (N0 + C1) -> offset
6217   Register SRDPtr = AddrData.N0;
6218   Offset = AddrData.Offset;
6219 
6220   // TODO: Look through extensions for 32-bit soffset.
6221   MachineIRBuilder B(*Root.getParent());
6222 
6223   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6224   splitIllegalMUBUFOffset(B, SOffset, Offset);
6225   return true;
6226 }
6227 
6228 InstructionSelector::ComplexRendererFns
6229 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6230   Register VAddr;
6231   Register RSrcReg;
6232   Register SOffset;
6233   int64_t Offset = 0;
6234 
6235   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6236     return {};
6237 
6238   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6239   // pattern.
6240   return {{
6241       [=](MachineInstrBuilder &MIB) {  // rsrc
6242         MIB.addReg(RSrcReg);
6243       },
6244       [=](MachineInstrBuilder &MIB) { // vaddr
6245         MIB.addReg(VAddr);
6246       },
6247       [=](MachineInstrBuilder &MIB) { // soffset
6248         if (SOffset)
6249           MIB.addReg(SOffset);
6250         else if (STI.hasRestrictedSOffset())
6251           MIB.addReg(AMDGPU::SGPR_NULL);
6252         else
6253           MIB.addImm(0);
6254       },
6255       [=](MachineInstrBuilder &MIB) { // offset
6256         MIB.addImm(Offset);
6257       },
6258       addZeroImm, //  cpol
6259       addZeroImm, //  tfe
6260       addZeroImm  //  swz
6261     }};
6262 }
6263 
6264 InstructionSelector::ComplexRendererFns
6265 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6266   Register RSrcReg;
6267   Register SOffset;
6268   int64_t Offset = 0;
6269 
6270   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6271     return {};
6272 
6273   return {{
6274       [=](MachineInstrBuilder &MIB) {  // rsrc
6275         MIB.addReg(RSrcReg);
6276       },
6277       [=](MachineInstrBuilder &MIB) { // soffset
6278         if (SOffset)
6279           MIB.addReg(SOffset);
6280         else if (STI.hasRestrictedSOffset())
6281           MIB.addReg(AMDGPU::SGPR_NULL);
6282         else
6283           MIB.addImm(0);
6284       },
6285       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6286       addZeroImm, //  cpol
6287       addZeroImm, //  tfe
6288       addZeroImm, //  swz
6289     }};
6290 }
6291 
6292 InstructionSelector::ComplexRendererFns
6293 AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6294 
6295   Register SOffset = Root.getReg();
6296 
6297   if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6298     SOffset = AMDGPU::SGPR_NULL;
6299 
6300   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
6301 }
6302 
6303 /// Get an immediate that must be 32 bits, and is treated as zero-extended.
6304 static std::optional<uint64_t>
6305 getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
6306   // getIConstantVRegVal sexts any values, so see if that matters.
6307   // getIConstantVRegVal sign-extends values, so check whether that matters.
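  //
  // For example, a G_CONSTANT holding 32 bits of all-ones comes back as -1;
  // isInt<32>(-1) holds, and Lo_32(-1) recovers the zero-extended value
  // 0xFFFFFFFF.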
6308   if (!OffsetVal || !isInt<32>(*OffsetVal))
6309     return std::nullopt;
6310   return Lo_32(*OffsetVal);
6311 }
6312 
6313 InstructionSelector::ComplexRendererFns
6314 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6315   std::optional<uint64_t> OffsetVal =
6316       Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
6317   if (!OffsetVal)
6318     return {};
6319 
6320   std::optional<int64_t> EncodedImm =
6321       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
6322   if (!EncodedImm)
6323     return {};
6324 
6325   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
6326 }
6327 
6328 InstructionSelector::ComplexRendererFns
6329 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6330   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6331 
6332   std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
6333   if (!OffsetVal)
6334     return {};
6335 
6336   std::optional<int64_t> EncodedImm =
6337       AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
6338   if (!EncodedImm)
6339     return {};
6340 
6341   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
6342 }
6343 
6344 InstructionSelector::ComplexRendererFns
6345 AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6346   // Match the (soffset + offset) pair as a 32-bit register base and
6347   // an immediate offset.
6348   Register SOffset;
6349   unsigned Offset;
6350   std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
6351       *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
6352   if (!SOffset)
6353     return std::nullopt;
6354 
6355   std::optional<int64_t> EncodedOffset =
6356       AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
6357   if (!EncodedOffset)
6358     return std::nullopt;
6359 
6360   assert(MRI->getType(SOffset) == LLT::scalar(32));
6361   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6362            [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
6363 }
6364 
6365 std::pair<Register, unsigned>
6366 AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6367                                                      bool &Matched) const {
6368   Matched = false;
6369 
6370   Register Src;
6371   unsigned Mods;
6372   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
6373 
6374   if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
6375     assert(MRI->getType(Src) == LLT::scalar(16));
6376 
6377     // Only change Src if a source modifier could be gained. In such cases the
6378     // new Src could be an SGPR, but this does not violate the constant bus
6379     // restriction for the instruction being selected.
6380     Src = stripBitCast(Src, *MRI);
6381 
6382     const auto CheckAbsNeg = [&]() {
6383       // Be careful about folding modifiers if we already have an abs. fneg is
6384       // applied last, so we don't want to apply an earlier fneg.
6385       if ((Mods & SISrcMods::ABS) == 0) {
6386         unsigned ModsTmp;
6387         std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
6388 
6389         if ((ModsTmp & SISrcMods::NEG) != 0)
6390           Mods ^= SISrcMods::NEG;
6391 
6392         if ((ModsTmp & SISrcMods::ABS) != 0)
6393           Mods |= SISrcMods::ABS;
6394       }
6395     };
6396 
6397     CheckAbsNeg();
6398 
6399     // op_sel/op_sel_hi decide the source type and source.
6400     // If the source's op_sel_hi is set, it indicates that a conversion from
6401     // fp16 should be done. If the source's op_sel is set, it picks the high
6402     // half of the source register.
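    //
    // For illustration: a mad-mix source that is a G_FPEXT of the high half
    // of a 32-bit register ends up with both OP_SEL_1 (convert from fp16) and
    // OP_SEL_0 (take the high half) set, which is what the code below does.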
6403 
6404     Mods |= SISrcMods::OP_SEL_1;
6405 
6406     if (isExtractHiElt(*MRI, Src, Src)) {
6407       Mods |= SISrcMods::OP_SEL_0;
6408       CheckAbsNeg();
6409     }
6410 
6411     Matched = true;
6412   }
6413 
6414   return {Src, Mods};
6415 }
6416 
6417 InstructionSelector::ComplexRendererFns
6418 AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
6419     MachineOperand &Root) const {
6420   Register Src;
6421   unsigned Mods;
6422   bool Matched;
6423   std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6424   if (!Matched)
6425     return {};
6426 
6427   return {{
6428       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6429       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6430   }};
6431 }
6432 
6433 InstructionSelector::ComplexRendererFns
6434 AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
6435   Register Src;
6436   unsigned Mods;
6437   bool Matched;
6438   std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6439 
6440   return {{
6441       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6442       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6443   }};
6444 }
6445 
6446 bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
6447     MachineInstr &I, Intrinsic::ID IntrID) const {
6448   MachineBasicBlock *MBB = I.getParent();
6449   const DebugLoc &DL = I.getDebugLoc();
6450   Register CCReg = I.getOperand(0).getReg();
6451 
6452   // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6453   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
6454 
6455   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
6456       .addImm(I.getOperand(2).getImm());
6457 
6458   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
6459 
6460   I.eraseFromParent();
6461   return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
6462                                       *MRI);
6463 }
6464 
6465 bool AMDGPUInstructionSelector::selectSGetBarrierState(
6466     MachineInstr &I, Intrinsic::ID IntrID) const {
6467   MachineBasicBlock *MBB = I.getParent();
6468   const DebugLoc &DL = I.getDebugLoc();
6469   MachineOperand BarOp = I.getOperand(2);
6470   std::optional<int64_t> BarValImm =
6471       getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6472 
6473   if (!BarValImm) {
6474     auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6475                        .addReg(BarOp.getReg());
6476     constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6477   }
6478   MachineInstrBuilder MIB;
6479   unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
6480                            : AMDGPU::S_GET_BARRIER_STATE_M0;
6481   MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6482 
6483   auto DstReg = I.getOperand(0).getReg();
6484   const TargetRegisterClass *DstRC =
6485       TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6486   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6487     return false;
6488   MIB.addDef(DstReg);
6489   if (BarValImm) {
6490     MIB.addImm(*BarValImm);
6491   }
6492   I.eraseFromParent();
6493   return true;
6494 }
6495 
6496 unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
6497   if (HasInlineConst) {
6498     switch (IntrID) {
6499     default:
6500       llvm_unreachable("not a named barrier op");
6501     case Intrinsic::amdgcn_s_get_named_barrier_state:
6502       return AMDGPU::S_GET_BARRIER_STATE_IMM;
6503     }
6504   } else {
6505     switch (IntrID) {
6506     default:
6507       llvm_unreachable("not a named barrier op");
6508     case Intrinsic::amdgcn_s_get_named_barrier_state:
6509       return AMDGPU::S_GET_BARRIER_STATE_M0;
6510     }
6511   }
6512 }
6513 
6514 bool AMDGPUInstructionSelector::selectNamedBarrierInit(
6515     MachineInstr &I, Intrinsic::ID IntrID) const {
6516   MachineBasicBlock *MBB = I.getParent();
6517   const DebugLoc &DL = I.getDebugLoc();
6518   MachineOperand BarOp = I.getOperand(1);
6519   MachineOperand CntOp = I.getOperand(2);
6520 
6521   // BarID = (BarOp >> 4) & 0x3F
6522   Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6523   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6524       .add(BarOp)
6525       .addImm(4u)
6526       .setOperandDead(3); // Dead scc
6527 
6528   Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6529   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6530       .addReg(TmpReg0)
6531       .addImm(0x3F)
6532       .setOperandDead(3); // Dead scc
6533 
6534   // MO = ((CntOp & 0x3F) << ShAmt) | BarID
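  //
  // For illustration only, with hypothetical values: BarID = 5 and
  // CntOp = 10 give MO = (10 << 16) | 5 = 0xA0005.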
6535   Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6536   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
6537       .add(CntOp)
6538       .addImm(0x3F)
6539       .setOperandDead(3); // Dead scc
6540 
6541   Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6542   constexpr unsigned ShAmt = 16;
6543   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
6544       .addReg(TmpReg2)
6545       .addImm(ShAmt)
6546       .setOperandDead(3); // Dead scc
6547 
6548   Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6549   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
6550       .addReg(TmpReg1)
6551       .addReg(TmpReg3)
6552       .setOperandDead(3); // Dead scc
6553 
6554   auto CopyMIB =
6555       BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
6556   constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6557 
6558   MachineInstrBuilder MIB;
6559   MIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_M0));
6560 
6561   I.eraseFromParent();
6562   return true;
6563 }
6564 
6565 bool AMDGPUInstructionSelector::selectNamedBarrierInst(
6566     MachineInstr &I, Intrinsic::ID IntrID) const {
6567   MachineBasicBlock *MBB = I.getParent();
6568   const DebugLoc &DL = I.getDebugLoc();
6569   MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
6570                              ? I.getOperand(2)
6571                              : I.getOperand(1);
6572   std::optional<int64_t> BarValImm =
6573       getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6574 
6575   if (!BarValImm) {
6576     // BarID = (BarOp >> 4) & 0x3F
6577     Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6578     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6579         .addReg(BarOp.getReg())
6580         .addImm(4u)
6581         .setOperandDead(3); // Dead scc
6582 
6583     Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6584     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6585         .addReg(TmpReg0)
6586         .addImm(0x3F)
6587         .setOperandDead(3); // Dead scc
6588 
6589     auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6590                        .addReg(TmpReg1);
6591     constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6592   }
6593 
6594   MachineInstrBuilder MIB;
6595   unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
6596   MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6597 
6598   if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
6599     auto DstReg = I.getOperand(0).getReg();
6600     const TargetRegisterClass *DstRC =
6601         TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6602     if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6603       return false;
6604     MIB.addDef(DstReg);
6605   }
6606 
6607   if (BarValImm) {
6608     auto BarId = ((*BarValImm) >> 4) & 0x3F;
6609     MIB.addImm(BarId);
6610   }
6611 
6612   I.eraseFromParent();
6613   return true;
6614 }
6615 
6616 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
6617                                                  const MachineInstr &MI,
6618                                                  int OpIdx) const {
6619   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6620          "Expected G_CONSTANT");
6621   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
6622 }
6623 
6624 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
6625                                                 const MachineInstr &MI,
6626                                                 int OpIdx) const {
6627   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6628          "Expected G_CONSTANT");
6629   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
6630 }
6631 
6632 void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
6633                                                    const MachineInstr &MI,
6634                                                    int OpIdx) const {
6635   const MachineOperand &Op = MI.getOperand(1);
6636   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
6637   MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
6638 }
6639 
6640 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
6641                                                 const MachineInstr &MI,
6642                                                 int OpIdx) const {
6643   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6644          "Expected G_CONSTANT");
6645   MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
6646 }
6647 
6648 /// This only really exists to satisfy the DAG type-checking machinery, so it
6649 /// is a no-op here.
6650 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
6651                                                 const MachineInstr &MI,
6652                                                 int OpIdx) const {
6653   const MachineOperand &Op = MI.getOperand(OpIdx);
6654   int64_t Imm;
6655   if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
6656     MIB.addImm(Imm);
6657   else
6658     MIB.addImm(Op.getImm());
6659 }
6660 
6661 void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
6662                                                    const MachineInstr &MI,
6663                                                    int OpIdx) const {
6664   MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
6665 }
6666 
6667 void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
6668                                                 const MachineInstr &MI,
6669                                                 int OpIdx) const {
6670   assert(OpIdx >= 0 && "expected to match an immediate operand");
6671   MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6672 }
6673 
6674 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
6675     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6676   assert(OpIdx >= 0 && "expected to match an immediate operand");
6677   MIB.addImm(
6678       (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6679 }
6680 
6681 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
6682     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6683   assert(OpIdx >= 0 && "expected to match an immediate operand");
6684   MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
6685                  ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
6686                  : (int64_t)SISrcMods::DST_OP_SEL);
6687 }
6688 
6689 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
6690     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6691   assert(OpIdx >= 0 && "expected to match an immediate operand");
6692   MIB.addImm(
6693       (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6694 }
6695 
6696 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
6697     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6698   assert(OpIdx >= 0 && "expected to match an immediate operand");
6699   MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
6700                  ? (int64_t)(SISrcMods::OP_SEL_0)
6701                  : 0);
6702 }
6703 
6704 void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
6705     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6706   assert(OpIdx >= 0 && "expected to match an immediate operand");
6707   MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
6708                                            : 0);
6709 }
6710 
6711 void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
6712     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6713   assert(OpIdx >= 0 && "expected to match an immediate operand");
6714   MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
6715                                            : 0);
6716 }
6717 
6718 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
6719     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6720   assert(OpIdx >= 0 && "expected to match an immediate operand");
6721   MIB.addImm(
6722       (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6723 }
6724 
6725 void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
6726     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6727   assert(OpIdx >= 0 && "expected to match an immediate operand");
6728   MIB.addImm(
6729       (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL  : 0);
6730 }
6731 
6732 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
6733                                                   const MachineInstr &MI,
6734                                                   int OpIdx) const {
6735   assert(OpIdx >= 0 && "expected to match an immediate operand");
6736   MIB.addImm(MI.getOperand(OpIdx).getImm() &
6737              (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
6738                                        : AMDGPU::CPol::ALL_pregfx12));
6739 }
6740 
6741 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
6742                                                  const MachineInstr &MI,
6743                                                  int OpIdx) const {
6744   assert(OpIdx >= 0 && "expected to match an immediate operand");
6745   const bool Swizzle = MI.getOperand(OpIdx).getImm() &
6746                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
6747                                                  : AMDGPU::CPol::SWZ_pregfx12);
6748   MIB.addImm(Swizzle);
6749 }
6750 
6751 void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
6752     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6753   assert(OpIdx >= 0 && "expected to match an immediate operand");
6754   const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
6755                         (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
6756                                                   : AMDGPU::CPol::ALL_pregfx12);
6757   MIB.addImm(Cpol | AMDGPU::CPol::GLC);
6758 }
6759 
6760 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
6761                                                  const MachineInstr &MI,
6762                                                  int OpIdx) const {
6763   MIB.addFrameIndex(MI.getOperand(1).getIndex());
6764 }
6765 
6766 void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
6767                                                        const MachineInstr &MI,
6768                                                        int OpIdx) const {
6769   const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
6770   int ExpVal = APF.getExactLog2Abs();
6771   assert(ExpVal != INT_MIN);
6772   MIB.addImm(ExpVal);
6773 }
6774 
6775 void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
6776                                                 const MachineInstr &MI,
6777                                                 int OpIdx) const {
6778   // "round.towardzero" -> TowardZero 0        -> FP_ROUND_ROUND_TO_ZERO 3
6779   // "round.tonearest"  -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
6780   // "round.upward"     -> TowardPositive 2    -> FP_ROUND_ROUND_TO_INF 1
6781   // "round.downward"   -> TowardNegative 3    -> FP_ROUND_ROUND_TO_NEGINF 2
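  //
  // Sanity check of the mapping: "round.towardzero" is TowardZero (0), and
  // (0 + 3) % 4 == 3 == FP_ROUND_ROUND_TO_ZERO; "round.tonearest" is
  // NearestTiesToEven (1), and (1 + 3) % 4 == 0 == FP_ROUND_ROUND_TO_NEAREST.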
6782   MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
6783 }
6784 
6785 /// Convert from 2-bit value to enum values used for op_sel* source modifiers.
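/// For illustration, the resulting mapping is: 0 -> 0, 1 -> OP_SEL_0,
/// 2 -> OP_SEL_1, 3 -> OP_SEL_0 | OP_SEL_1.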
6786 void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
6787     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6788   unsigned Val = MI.getOperand(OpIdx).getImm();
6789   unsigned New = 0;
6790   if (Val & 0x1)
6791     New |= SISrcMods::OP_SEL_0;
6792   if (Val & 0x2)
6793     New |= SISrcMods::OP_SEL_1;
6794   MIB.addImm(New);
6795 }
6796 
6797 bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
6798   return TII.isInlineConstant(Imm);
6799 }
6800 
6801 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
6802   return TII.isInlineConstant(Imm);
6803 }
6804