xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (revision 7fdf597e96a02165cfe22ff357b857d5fa15ed8a)
1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
23 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
24 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/MachineFrameInfo.h"
28 #include "llvm/IR/DiagnosticInfo.h"
29 #include "llvm/IR/IntrinsicsAMDGPU.h"
30 #include <optional>
31 
32 #define DEBUG_TYPE "amdgpu-isel"
33 
34 using namespace llvm;
35 using namespace MIPatternMatch;
36 
37 #define GET_GLOBALISEL_IMPL
38 #define AMDGPUSubtarget GCNSubtarget
39 #include "AMDGPUGenGlobalISel.inc"
40 #undef GET_GLOBALISEL_IMPL
41 #undef AMDGPUSubtarget
42 
43 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45     const AMDGPUTargetMachine &TM)
46     : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47       STI(STI),
48       EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
49 #define GET_GLOBALISEL_PREDICATES_INIT
50 #include "AMDGPUGenGlobalISel.inc"
51 #undef GET_GLOBALISEL_PREDICATES_INIT
52 #define GET_GLOBALISEL_TEMPORARIES_INIT
53 #include "AMDGPUGenGlobalISel.inc"
54 #undef GET_GLOBALISEL_TEMPORARIES_INIT
55 {
56 }
57 
58 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
59 
60 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
61                                         CodeGenCoverage *CoverageInfo,
62                                         ProfileSummaryInfo *PSI,
63                                         BlockFrequencyInfo *BFI) {
64   MRI = &MF.getRegInfo();
65   Subtarget = &MF.getSubtarget<GCNSubtarget>();
66   Subtarget->checkSubtargetFeatures(MF.getFunction());
67   InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
68 }
69 
70 // Return the wave level SGPR base address if this is a wave address.
71 static Register getWaveAddress(const MachineInstr *Def) {
72   return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73              ? Def->getOperand(1).getReg()
74              : Register();
75 }
76 
77 bool AMDGPUInstructionSelector::isVCC(Register Reg,
78                                       const MachineRegisterInfo &MRI) const {
79   // The verifier is oblivious to s1 being a valid value for wavesize registers.
80   if (Reg.isPhysical())
81     return false;
82 
83   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84   const TargetRegisterClass *RC =
85       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
86   if (RC) {
87     const LLT Ty = MRI.getType(Reg);
88     if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89       return false;
90     // G_TRUNC s1 result is never vcc.
91     return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92            RC->hasSuperClassEq(TRI.getBoolRC());
93   }
94 
95   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
96   return RB->getID() == AMDGPU::VCCRegBankID;
97 }
98 
99 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100                                                         unsigned NewOpc) const {
101   MI.setDesc(TII.get(NewOpc));
102   MI.removeOperand(1); // Remove intrinsic ID.
103   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104 
105   MachineOperand &Dst = MI.getOperand(0);
106   MachineOperand &Src = MI.getOperand(1);
107 
108   // TODO: This should be legalized to s32 if needed
109   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110     return false;
111 
112   const TargetRegisterClass *DstRC
113     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114   const TargetRegisterClass *SrcRC
115     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116   if (!DstRC || DstRC != SrcRC)
117     return false;
118 
119   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
120          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
121 }
122 
123 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
124   const DebugLoc &DL = I.getDebugLoc();
125   MachineBasicBlock *BB = I.getParent();
126   I.setDesc(TII.get(TargetOpcode::COPY));
127 
128   const MachineOperand &Src = I.getOperand(1);
129   MachineOperand &Dst = I.getOperand(0);
130   Register DstReg = Dst.getReg();
131   Register SrcReg = Src.getReg();
132 
133   if (isVCC(DstReg, *MRI)) {
134     if (SrcReg == AMDGPU::SCC) {
135       const TargetRegisterClass *RC
136         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
137       if (!RC)
138         return true;
139       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
140     }
141 
142     if (!isVCC(SrcReg, *MRI)) {
143       // TODO: Should probably leave the copy and let copyPhysReg expand it.
144       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
145         return false;
146 
147       const TargetRegisterClass *SrcRC
148         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
149 
150       std::optional<ValueAndVReg> ConstVal =
151           getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
152       if (ConstVal) {
153         unsigned MovOpc =
154             STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
155         BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
156             .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
157       } else {
158         Register MaskedReg = MRI->createVirtualRegister(SrcRC);
159 
160         // We can't trust the high bits at this point, so clear them.
161 
162         // TODO: Skip masking high bits if def is known boolean.
163 
164         bool IsSGPR = TRI.isSGPRClass(SrcRC);
165         unsigned AndOpc =
166             IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
167         auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
168             .addImm(1)
169             .addReg(SrcReg);
170         if (IsSGPR)
171           And.setOperandDead(3); // Dead scc
172 
173         BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
174             .addImm(0)
175             .addReg(MaskedReg);
176       }
177 
178       if (!MRI->getRegClassOrNull(SrcReg))
179         MRI->setRegClass(SrcReg, SrcRC);
180       I.eraseFromParent();
181       return true;
182     }
183 
184     const TargetRegisterClass *RC =
185       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
186     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
187       return false;
188 
189     return true;
190   }
191 
192   for (const MachineOperand &MO : I.operands()) {
193     if (MO.getReg().isPhysical())
194       continue;
195 
196     const TargetRegisterClass *RC =
197             TRI.getConstrainedRegClassForOperand(MO, *MRI);
198     if (!RC)
199       continue;
200     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
201   }
202   return true;
203 }
204 
205 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
206   const Register DefReg = I.getOperand(0).getReg();
207   const LLT DefTy = MRI->getType(DefReg);
208 
209   // S1 G_PHIs should not be selected in instruction-select, instead:
210   // - divergent S1 G_PHI should go through lane mask merging algorithm
211   //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
212   // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
213   if (DefTy == LLT::scalar(1))
214     return false;
215 
216   // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
217 
218   const RegClassOrRegBank &RegClassOrBank =
219     MRI->getRegClassOrRegBank(DefReg);
220 
221   const TargetRegisterClass *DefRC
222     = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
223   if (!DefRC) {
224     if (!DefTy.isValid()) {
225       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
226       return false;
227     }
228 
229     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
230     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
231     if (!DefRC) {
232       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
233       return false;
234     }
235   }
236 
237   // TODO: Verify that all registers have the same bank
238   I.setDesc(TII.get(TargetOpcode::PHI));
239   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
240 }
241 
242 MachineOperand
243 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
244                                            const TargetRegisterClass &SubRC,
245                                            unsigned SubIdx) const {
246 
247   MachineInstr *MI = MO.getParent();
248   MachineBasicBlock *BB = MO.getParent()->getParent();
249   Register DstReg = MRI->createVirtualRegister(&SubRC);
250 
251   if (MO.isReg()) {
252     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
253     Register Reg = MO.getReg();
254     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
255             .addReg(Reg, 0, ComposedSubIdx);
256 
257     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
258                                      MO.isKill(), MO.isDead(), MO.isUndef(),
259                                      MO.isEarlyClobber(), 0, MO.isDebug(),
260                                      MO.isInternalRead());
261   }
262 
263   assert(MO.isImm());
264 
265   APInt Imm(64, MO.getImm());
266 
267   switch (SubIdx) {
268   default:
269     llvm_unreachable("do not know to split immediate with this sub index.");
270   case AMDGPU::sub0:
271     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
272   case AMDGPU::sub1:
273     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
274   }
275 }
276 
277 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
278   switch (Opc) {
279   case AMDGPU::G_AND:
280     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
281   case AMDGPU::G_OR:
282     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
283   case AMDGPU::G_XOR:
284     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
285   default:
286     llvm_unreachable("not a bit op");
287   }
288 }
289 
290 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
291   Register DstReg = I.getOperand(0).getReg();
292   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
293 
294   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
295   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
296       DstRB->getID() != AMDGPU::VCCRegBankID)
297     return false;
298 
299   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
300                             STI.isWave64());
301   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
302 
303   // Dead implicit-def of scc
304   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
305                                          true, // isImp
306                                          false, // isKill
307                                          true)); // isDead
308   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
309 }
310 
311 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
312   MachineBasicBlock *BB = I.getParent();
313   MachineFunction *MF = BB->getParent();
314   Register DstReg = I.getOperand(0).getReg();
315   const DebugLoc &DL = I.getDebugLoc();
316   LLT Ty = MRI->getType(DstReg);
317   if (Ty.isVector())
318     return false;
319 
320   unsigned Size = Ty.getSizeInBits();
321   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
322   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
323   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
324 
325   if (Size == 32) {
326     if (IsSALU) {
327       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
328       MachineInstr *Add =
329         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
330         .add(I.getOperand(1))
331         .add(I.getOperand(2))
332         .setOperandDead(3); // Dead scc
333       I.eraseFromParent();
334       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
335     }
336 
337     if (STI.hasAddNoCarry()) {
338       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
339       I.setDesc(TII.get(Opc));
340       I.addOperand(*MF, MachineOperand::CreateImm(0));
341       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
342       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
343     }
344 
345     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
346 
347     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
348     MachineInstr *Add
349       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
350       .addDef(UnusedCarry, RegState::Dead)
351       .add(I.getOperand(1))
352       .add(I.getOperand(2))
353       .addImm(0);
354     I.eraseFromParent();
355     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
356   }
357 
358   assert(!Sub && "illegal sub should not reach here");
359 
360   const TargetRegisterClass &RC
361     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
362   const TargetRegisterClass &HalfRC
363     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
364 
365   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
366   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
367   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
368   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
369 
370   Register DstLo = MRI->createVirtualRegister(&HalfRC);
371   Register DstHi = MRI->createVirtualRegister(&HalfRC);
372 
373   if (IsSALU) {
374     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
375       .add(Lo1)
376       .add(Lo2);
377     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
378       .add(Hi1)
379       .add(Hi2)
380       .setOperandDead(3); // Dead scc
381   } else {
382     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
383     Register CarryReg = MRI->createVirtualRegister(CarryRC);
384     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
385       .addDef(CarryReg)
386       .add(Lo1)
387       .add(Lo2)
388       .addImm(0);
389     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
390       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
391       .add(Hi1)
392       .add(Hi2)
393       .addReg(CarryReg, RegState::Kill)
394       .addImm(0);
395 
396     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
397       return false;
398   }
399 
400   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
401     .addReg(DstLo)
402     .addImm(AMDGPU::sub0)
403     .addReg(DstHi)
404     .addImm(AMDGPU::sub1);
405 
406 
407   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
408     return false;
409 
410   I.eraseFromParent();
411   return true;
412 }
413 
414 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
415   MachineInstr &I) const {
416   MachineBasicBlock *BB = I.getParent();
417   MachineFunction *MF = BB->getParent();
418   const DebugLoc &DL = I.getDebugLoc();
419   Register Dst0Reg = I.getOperand(0).getReg();
420   Register Dst1Reg = I.getOperand(1).getReg();
421   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
422                      I.getOpcode() == AMDGPU::G_UADDE;
423   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
424                           I.getOpcode() == AMDGPU::G_USUBE;
425 
426   if (isVCC(Dst1Reg, *MRI)) {
427     unsigned NoCarryOpc =
428         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
429     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
430     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
431     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
432     I.addOperand(*MF, MachineOperand::CreateImm(0));
433     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
434   }
435 
436   Register Src0Reg = I.getOperand(2).getReg();
437   Register Src1Reg = I.getOperand(3).getReg();
438 
439   if (HasCarryIn) {
440     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
441       .addReg(I.getOperand(4).getReg());
442   }
443 
444   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
445   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
446 
447   auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
448     .add(I.getOperand(2))
449     .add(I.getOperand(3));
450 
451   if (MRI->use_nodbg_empty(Dst1Reg)) {
452     CarryInst.setOperandDead(3); // Dead scc
453   } else {
454     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
455       .addReg(AMDGPU::SCC);
456     if (!MRI->getRegClassOrNull(Dst1Reg))
457       MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
458   }
459 
460   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
461       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
462       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
463     return false;
464 
465   if (HasCarryIn &&
466       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
467                                     AMDGPU::SReg_32RegClass, *MRI))
468     return false;
469 
470   I.eraseFromParent();
471   return true;
472 }
473 
474 bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
475     MachineInstr &I) const {
476   MachineBasicBlock *BB = I.getParent();
477   MachineFunction *MF = BB->getParent();
478   const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
479 
480   unsigned Opc;
481   if (Subtarget->hasMADIntraFwdBug())
482     Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
483                      : AMDGPU::V_MAD_I64_I32_gfx11_e64;
484   else
485     Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
486   I.setDesc(TII.get(Opc));
487   I.addOperand(*MF, MachineOperand::CreateImm(0));
488   I.addImplicitDefUseOperands(*MF);
489   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
490 }
491 
492 // TODO: We should probably legalize these to only using 32-bit results.
493 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
494   MachineBasicBlock *BB = I.getParent();
495   Register DstReg = I.getOperand(0).getReg();
496   Register SrcReg = I.getOperand(1).getReg();
497   LLT DstTy = MRI->getType(DstReg);
498   LLT SrcTy = MRI->getType(SrcReg);
499   const unsigned SrcSize = SrcTy.getSizeInBits();
500   unsigned DstSize = DstTy.getSizeInBits();
501 
502   // TODO: Should handle any multiple of 32 offset.
503   unsigned Offset = I.getOperand(2).getImm();
504   if (Offset % 32 != 0 || DstSize > 128)
505     return false;
506 
507   // 16-bit operations really use 32-bit registers.
508   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
509   if (DstSize == 16)
510     DstSize = 32;
511 
512   const TargetRegisterClass *DstRC =
513     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
514   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
515     return false;
516 
517   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
518   const TargetRegisterClass *SrcRC =
519       TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
520   if (!SrcRC)
521     return false;
522   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
523                                                          DstSize / 32);
524   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
525   if (!SrcRC)
526     return false;
527 
528   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
529                                     *SrcRC, I.getOperand(1));
530   const DebugLoc &DL = I.getDebugLoc();
531   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
532     .addReg(SrcReg, 0, SubReg);
533 
534   I.eraseFromParent();
535   return true;
536 }
537 
538 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
539   MachineBasicBlock *BB = MI.getParent();
540   Register DstReg = MI.getOperand(0).getReg();
541   LLT DstTy = MRI->getType(DstReg);
542   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
543 
544   const unsigned SrcSize = SrcTy.getSizeInBits();
545   if (SrcSize < 32)
546     return selectImpl(MI, *CoverageInfo);
547 
548   const DebugLoc &DL = MI.getDebugLoc();
549   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
550   const unsigned DstSize = DstTy.getSizeInBits();
551   const TargetRegisterClass *DstRC =
552       TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
553   if (!DstRC)
554     return false;
555 
556   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
557   MachineInstrBuilder MIB =
558     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
559   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
560     MachineOperand &Src = MI.getOperand(I + 1);
561     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
562     MIB.addImm(SubRegs[I]);
563 
564     const TargetRegisterClass *SrcRC
565       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
566     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
567       return false;
568   }
569 
570   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
571     return false;
572 
573   MI.eraseFromParent();
574   return true;
575 }
576 
577 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
578   MachineBasicBlock *BB = MI.getParent();
579   const int NumDst = MI.getNumOperands() - 1;
580 
581   MachineOperand &Src = MI.getOperand(NumDst);
582 
583   Register SrcReg = Src.getReg();
584   Register DstReg0 = MI.getOperand(0).getReg();
585   LLT DstTy = MRI->getType(DstReg0);
586   LLT SrcTy = MRI->getType(SrcReg);
587 
588   const unsigned DstSize = DstTy.getSizeInBits();
589   const unsigned SrcSize = SrcTy.getSizeInBits();
590   const DebugLoc &DL = MI.getDebugLoc();
591   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
592 
593   const TargetRegisterClass *SrcRC =
594       TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
595   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
596     return false;
597 
598   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
599   // source, and this relies on the fact that the same subregister indices are
600   // used for both.
601   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
602   for (int I = 0, E = NumDst; I != E; ++I) {
603     MachineOperand &Dst = MI.getOperand(I);
604     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
605       .addReg(SrcReg, 0, SubRegs[I]);
606 
607     // Make sure the subregister index is valid for the source register.
608     SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
609     if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
610       return false;
611 
612     const TargetRegisterClass *DstRC =
613       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
614     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
615       return false;
616   }
617 
618   MI.eraseFromParent();
619   return true;
620 }
621 
622 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
623   assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
624          MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
625 
626   Register Src0 = MI.getOperand(1).getReg();
627   Register Src1 = MI.getOperand(2).getReg();
628   LLT SrcTy = MRI->getType(Src0);
629   const unsigned SrcSize = SrcTy.getSizeInBits();
630 
631   // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
632   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
633     return selectG_MERGE_VALUES(MI);
634   }
635 
636   // Selection logic below is for V2S16 only.
637   // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
638   Register Dst = MI.getOperand(0).getReg();
639   if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
640       (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
641        SrcTy != LLT::scalar(32)))
642     return selectImpl(MI, *CoverageInfo);
643 
644   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
645   if (DstBank->getID() == AMDGPU::AGPRRegBankID)
646     return false;
647 
648   assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
649          DstBank->getID() == AMDGPU::VGPRRegBankID);
650   const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
651 
652   const DebugLoc &DL = MI.getDebugLoc();
653   MachineBasicBlock *BB = MI.getParent();
654 
655   // First, before trying TableGen patterns, check if both sources are
656   // constants. In those cases, we can trivially compute the final constant
657   // and emit a simple move.
658   auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
659   if (ConstSrc1) {
660     auto ConstSrc0 =
661         getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
662     if (ConstSrc0) {
663       const int64_t K0 = ConstSrc0->Value.getSExtValue();
664       const int64_t K1 = ConstSrc1->Value.getSExtValue();
665       uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
666       uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
667       uint32_t Imm = Lo16 | (Hi16 << 16);
668 
669       // VALU
670       if (IsVector) {
671         BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
672         MI.eraseFromParent();
673         return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
674       }
675 
676       // SALU
677       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
678       MI.eraseFromParent();
679       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
680     }
681   }
682 
683   // Now try TableGen patterns.
684   if (selectImpl(MI, *CoverageInfo))
685     return true;
686 
687   // TODO: This should probably be a combine somewhere
688   // (build_vector $src0, undef)  -> copy $src0
689   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
690   if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
691     MI.setDesc(TII.get(AMDGPU::COPY));
692     MI.removeOperand(2);
693     const auto &RC =
694         IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
695     return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
696            RBI.constrainGenericRegister(Src0, RC, *MRI);
697   }
698 
699   // TODO: Can be improved?
700   if (IsVector) {
701     Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
702     auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
703                    .addImm(0xFFFF)
704                    .addReg(Src0);
705     if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
706       return false;
707 
708     MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
709               .addReg(Src1)
710               .addImm(16)
711               .addReg(TmpReg);
712     if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
713       return false;
714 
715     MI.eraseFromParent();
716     return true;
717   }
718 
719   Register ShiftSrc0;
720   Register ShiftSrc1;
721 
722   // With multiple uses of the shift, this will duplicate the shift and
723   // increase register pressure.
724   //
725   // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
726   //  => (S_PACK_HH_B32_B16 $src0, $src1)
727   // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
728   //  => (S_PACK_HL_B32_B16 $src0, $src1)
729   // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
730   //  => (S_PACK_LH_B32_B16 $src0, $src1)
731   // (build_vector $src0, $src1)
732   //  => (S_PACK_LL_B32_B16 $src0, $src1)
733 
734   bool Shift0 = mi_match(
735       Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
736 
737   bool Shift1 = mi_match(
738       Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
739 
740   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
741   if (Shift0 && Shift1) {
742     Opc = AMDGPU::S_PACK_HH_B32_B16;
743     MI.getOperand(1).setReg(ShiftSrc0);
744     MI.getOperand(2).setReg(ShiftSrc1);
745   } else if (Shift1) {
746     Opc = AMDGPU::S_PACK_LH_B32_B16;
747     MI.getOperand(2).setReg(ShiftSrc1);
748   } else if (Shift0) {
749     auto ConstSrc1 =
750         getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
751     if (ConstSrc1 && ConstSrc1->Value == 0) {
752       // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
753       auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
754                      .addReg(ShiftSrc0)
755                      .addImm(16)
756                      .setOperandDead(3); // Dead scc
757 
758       MI.eraseFromParent();
759       return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
760     }
761     if (STI.hasSPackHL()) {
762       Opc = AMDGPU::S_PACK_HL_B32_B16;
763       MI.getOperand(1).setReg(ShiftSrc0);
764     }
765   }
766 
767   MI.setDesc(TII.get(Opc));
768   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
769 }
770 
771 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
772   const MachineOperand &MO = I.getOperand(0);
773 
774   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
775   // regbank check here is to know why getConstrainedRegClassForOperand failed.
776   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
777   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
778       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
779     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
780     return true;
781   }
782 
783   return false;
784 }
785 
786 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
787   MachineBasicBlock *BB = I.getParent();
788 
789   Register DstReg = I.getOperand(0).getReg();
790   Register Src0Reg = I.getOperand(1).getReg();
791   Register Src1Reg = I.getOperand(2).getReg();
792   LLT Src1Ty = MRI->getType(Src1Reg);
793 
794   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
795   unsigned InsSize = Src1Ty.getSizeInBits();
796 
797   int64_t Offset = I.getOperand(3).getImm();
798 
799   // FIXME: These cases should have been illegal and unnecessary to check here.
800   if (Offset % 32 != 0 || InsSize % 32 != 0)
801     return false;
802 
803   // Currently not handled by getSubRegFromChannel.
804   if (InsSize > 128)
805     return false;
806 
807   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
808   if (SubReg == AMDGPU::NoSubRegister)
809     return false;
810 
811   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
812   const TargetRegisterClass *DstRC =
813       TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
814   if (!DstRC)
815     return false;
816 
817   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
818   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
819   const TargetRegisterClass *Src0RC =
820       TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
821   const TargetRegisterClass *Src1RC =
822       TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
823 
824   // Deal with weird cases where the class only partially supports the subreg
825   // index.
826   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
827   if (!Src0RC || !Src1RC)
828     return false;
829 
830   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
831       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
832       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
833     return false;
834 
835   const DebugLoc &DL = I.getDebugLoc();
836   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
837     .addReg(Src0Reg)
838     .addReg(Src1Reg)
839     .addImm(SubReg);
840 
841   I.eraseFromParent();
842   return true;
843 }
844 
845 bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
846   Register DstReg = MI.getOperand(0).getReg();
847   Register SrcReg = MI.getOperand(1).getReg();
848   Register OffsetReg = MI.getOperand(2).getReg();
849   Register WidthReg = MI.getOperand(3).getReg();
850 
851   assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
852          "scalar BFX instructions are expanded in regbankselect");
853   assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
854          "64-bit vector BFX instructions are expanded in regbankselect");
855 
856   const DebugLoc &DL = MI.getDebugLoc();
857   MachineBasicBlock *MBB = MI.getParent();
858 
859   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
860   unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
861   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
862                  .addReg(SrcReg)
863                  .addReg(OffsetReg)
864                  .addReg(WidthReg);
865   MI.eraseFromParent();
866   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
867 }
868 
869 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
870   if (STI.getLDSBankCount() != 16)
871     return selectImpl(MI, *CoverageInfo);
872 
873   Register Dst = MI.getOperand(0).getReg();
874   Register Src0 = MI.getOperand(2).getReg();
875   Register M0Val = MI.getOperand(6).getReg();
876   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
877       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
878       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
879     return false;
880 
881   // This requires 2 instructions. It is possible to write a pattern to support
882   // this, but the generated isel emitter doesn't correctly deal with multiple
883   // output instructions using the same physical register input. The copy to m0
884   // is incorrectly placed before the second instruction.
885   //
886   // TODO: Match source modifiers.
887 
888   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
889   const DebugLoc &DL = MI.getDebugLoc();
890   MachineBasicBlock *MBB = MI.getParent();
891 
892   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
893     .addReg(M0Val);
894   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
895     .addImm(2)
896     .addImm(MI.getOperand(4).getImm())  // $attr
897     .addImm(MI.getOperand(3).getImm()); // $attrchan
898 
899   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
900     .addImm(0)                          // $src0_modifiers
901     .addReg(Src0)                       // $src0
902     .addImm(MI.getOperand(4).getImm())  // $attr
903     .addImm(MI.getOperand(3).getImm())  // $attrchan
904     .addImm(0)                          // $src2_modifiers
905     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
906     .addImm(MI.getOperand(5).getImm())  // $high
907     .addImm(0)                          // $clamp
908     .addImm(0);                         // $omod
909 
910   MI.eraseFromParent();
911   return true;
912 }
913 
914 // Writelane is special in that it can use SGPR and M0 (which would normally
915 // count as using the constant bus twice - but in this case it is allowed since
916 // the lane selector doesn't count as a use of the constant bus). However, it is
917 // still required to abide by the 1 SGPR rule. Fix this up if we might have
918 // multiple SGPRs.
919 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
920   // With a constant bus limit of at least 2, there's no issue.
921   if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
922     return selectImpl(MI, *CoverageInfo);
923 
924   MachineBasicBlock *MBB = MI.getParent();
925   const DebugLoc &DL = MI.getDebugLoc();
926   Register VDst = MI.getOperand(0).getReg();
927   Register Val = MI.getOperand(2).getReg();
928   Register LaneSelect = MI.getOperand(3).getReg();
929   Register VDstIn = MI.getOperand(4).getReg();
930 
931   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
932 
933   std::optional<ValueAndVReg> ConstSelect =
934       getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
935   if (ConstSelect) {
936     // The selector has to be an inline immediate, so we can use whatever for
937     // the other operands.
938     MIB.addReg(Val);
939     MIB.addImm(ConstSelect->Value.getSExtValue() &
940                maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
941   } else {
942     std::optional<ValueAndVReg> ConstVal =
943         getIConstantVRegValWithLookThrough(Val, *MRI);
944 
945     // If the value written is an inline immediate, we can get away without a
946     // copy to m0.
947     if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
948                                                  STI.hasInv2PiInlineImm())) {
949       MIB.addImm(ConstVal->Value.getSExtValue());
950       MIB.addReg(LaneSelect);
951     } else {
952       MIB.addReg(Val);
953 
954       // If the lane selector was originally in a VGPR and copied with
955       // readfirstlane, there's a hazard to read the same SGPR from the
956       // VALU. Constrain to a different SGPR to help avoid needing a nop later.
957       RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
958 
959       BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
960         .addReg(LaneSelect);
961       MIB.addReg(AMDGPU::M0);
962     }
963   }
964 
965   MIB.addReg(VDstIn);
966 
967   MI.eraseFromParent();
968   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
969 }
970 
971 // We need to handle this here because tablegen doesn't support matching
972 // instructions with multiple outputs.
973 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
974   Register Dst0 = MI.getOperand(0).getReg();
975   Register Dst1 = MI.getOperand(1).getReg();
976 
977   LLT Ty = MRI->getType(Dst0);
978   unsigned Opc;
979   if (Ty == LLT::scalar(32))
980     Opc = AMDGPU::V_DIV_SCALE_F32_e64;
981   else if (Ty == LLT::scalar(64))
982     Opc = AMDGPU::V_DIV_SCALE_F64_e64;
983   else
984     return false;
985 
986   // TODO: Match source modifiers.
987 
988   const DebugLoc &DL = MI.getDebugLoc();
989   MachineBasicBlock *MBB = MI.getParent();
990 
991   Register Numer = MI.getOperand(3).getReg();
992   Register Denom = MI.getOperand(4).getReg();
993   unsigned ChooseDenom = MI.getOperand(5).getImm();
994 
995   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
996 
997   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
998     .addDef(Dst1)
999     .addImm(0)     // $src0_modifiers
1000     .addUse(Src0)  // $src0
1001     .addImm(0)     // $src1_modifiers
1002     .addUse(Denom) // $src1
1003     .addImm(0)     // $src2_modifiers
1004     .addUse(Numer) // $src2
1005     .addImm(0)     // $clamp
1006     .addImm(0);    // $omod
1007 
1008   MI.eraseFromParent();
1009   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1010 }
1011 
1012 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1013   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1014   switch (IntrinsicID) {
1015   case Intrinsic::amdgcn_if_break: {
1016     MachineBasicBlock *BB = I.getParent();
1017 
1018     // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1019     // SelectionDAG uses for wave32 vs wave64.
1020     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1021       .add(I.getOperand(0))
1022       .add(I.getOperand(2))
1023       .add(I.getOperand(3));
1024 
1025     Register DstReg = I.getOperand(0).getReg();
1026     Register Src0Reg = I.getOperand(2).getReg();
1027     Register Src1Reg = I.getOperand(3).getReg();
1028 
1029     I.eraseFromParent();
1030 
1031     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1032       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1033 
1034     return true;
1035   }
1036   case Intrinsic::amdgcn_interp_p1_f16:
1037     return selectInterpP1F16(I);
1038   case Intrinsic::amdgcn_wqm:
1039     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1040   case Intrinsic::amdgcn_softwqm:
1041     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1042   case Intrinsic::amdgcn_strict_wwm:
1043   case Intrinsic::amdgcn_wwm:
1044     return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1045   case Intrinsic::amdgcn_strict_wqm:
1046     return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1047   case Intrinsic::amdgcn_writelane:
1048     return selectWritelane(I);
1049   case Intrinsic::amdgcn_div_scale:
1050     return selectDivScale(I);
1051   case Intrinsic::amdgcn_icmp:
1052   case Intrinsic::amdgcn_fcmp:
1053     if (selectImpl(I, *CoverageInfo))
1054       return true;
1055     return selectIntrinsicCmp(I);
1056   case Intrinsic::amdgcn_ballot:
1057     return selectBallot(I);
1058   case Intrinsic::amdgcn_reloc_constant:
1059     return selectRelocConstant(I);
1060   case Intrinsic::amdgcn_groupstaticsize:
1061     return selectGroupStaticSize(I);
1062   case Intrinsic::returnaddress:
1063     return selectReturnAddress(I);
1064   case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1065   case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1066   case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1067   case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1068   case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1069   case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1070   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1071   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1072   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1073   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1074   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1075   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1076   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1077   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1078     return selectSMFMACIntrin(I);
1079   default:
1080     return selectImpl(I, *CoverageInfo);
1081   }
1082 }
1083 
1084 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1085                           const GCNSubtarget &ST) {
1086   if (Size != 16 && Size != 32 && Size != 64)
1087     return -1;
1088 
1089   if (Size == 16 && !ST.has16BitInsts())
1090     return -1;
1091 
1092   const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
1093                           unsigned S64Opc) {
1094     if (Size == 16)
1095       return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
1096     if (Size == 32)
1097       return S32Opc;
1098     return S64Opc;
1099   };
1100 
1101   switch (P) {
1102   default:
1103     llvm_unreachable("Unknown condition code!");
1104   case CmpInst::ICMP_NE:
1105     return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1106                   AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
1107   case CmpInst::ICMP_EQ:
1108     return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1109                   AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
1110   case CmpInst::ICMP_SGT:
1111     return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1112                   AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
1113   case CmpInst::ICMP_SGE:
1114     return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1115                   AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
1116   case CmpInst::ICMP_SLT:
1117     return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1118                   AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
1119   case CmpInst::ICMP_SLE:
1120     return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1121                   AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
1122   case CmpInst::ICMP_UGT:
1123     return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1124                   AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
1125   case CmpInst::ICMP_UGE:
1126     return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1127                   AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
1128   case CmpInst::ICMP_ULT:
1129     return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1130                   AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
1131   case CmpInst::ICMP_ULE:
1132     return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1133                   AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);
1134 
1135   case CmpInst::FCMP_OEQ:
1136     return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1137                   AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
1138   case CmpInst::FCMP_OGT:
1139     return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1140                   AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
1141   case CmpInst::FCMP_OGE:
1142     return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1143                   AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
1144   case CmpInst::FCMP_OLT:
1145     return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1146                   AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
1147   case CmpInst::FCMP_OLE:
1148     return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1149                   AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
1150   case CmpInst::FCMP_ONE:
1151     return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1152                   AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1153   case CmpInst::FCMP_ORD:
1154     return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1155                   AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
1156   case CmpInst::FCMP_UNO:
1157     return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1158                   AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
1159   case CmpInst::FCMP_UEQ:
1160     return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1161                   AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
1162   case CmpInst::FCMP_UGT:
1163     return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1164                   AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
1165   case CmpInst::FCMP_UGE:
1166     return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1167                   AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
1168   case CmpInst::FCMP_ULT:
1169     return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1170                   AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
1171   case CmpInst::FCMP_ULE:
1172     return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1173                   AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
1174   case CmpInst::FCMP_UNE:
1175     return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1176                   AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1177   case CmpInst::FCMP_TRUE:
1178     return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1179                   AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
1180   case CmpInst::FCMP_FALSE:
1181     return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1182                   AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
1183   }
1184 }
1185 
1186 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1187                                               unsigned Size) const {
1188   if (Size == 64) {
1189     if (!STI.hasScalarCompareEq64())
1190       return -1;
1191 
1192     switch (P) {
1193     case CmpInst::ICMP_NE:
1194       return AMDGPU::S_CMP_LG_U64;
1195     case CmpInst::ICMP_EQ:
1196       return AMDGPU::S_CMP_EQ_U64;
1197     default:
1198       return -1;
1199     }
1200   }
1201 
1202   if (Size == 32) {
1203     switch (P) {
1204     case CmpInst::ICMP_NE:
1205       return AMDGPU::S_CMP_LG_U32;
1206     case CmpInst::ICMP_EQ:
1207       return AMDGPU::S_CMP_EQ_U32;
1208     case CmpInst::ICMP_SGT:
1209       return AMDGPU::S_CMP_GT_I32;
1210     case CmpInst::ICMP_SGE:
1211       return AMDGPU::S_CMP_GE_I32;
1212     case CmpInst::ICMP_SLT:
1213       return AMDGPU::S_CMP_LT_I32;
1214     case CmpInst::ICMP_SLE:
1215       return AMDGPU::S_CMP_LE_I32;
1216     case CmpInst::ICMP_UGT:
1217       return AMDGPU::S_CMP_GT_U32;
1218     case CmpInst::ICMP_UGE:
1219       return AMDGPU::S_CMP_GE_U32;
1220     case CmpInst::ICMP_ULT:
1221       return AMDGPU::S_CMP_LT_U32;
1222     case CmpInst::ICMP_ULE:
1223       return AMDGPU::S_CMP_LE_U32;
1224     case CmpInst::FCMP_OEQ:
1225       return AMDGPU::S_CMP_EQ_F32;
1226     case CmpInst::FCMP_OGT:
1227       return AMDGPU::S_CMP_GT_F32;
1228     case CmpInst::FCMP_OGE:
1229       return AMDGPU::S_CMP_GE_F32;
1230     case CmpInst::FCMP_OLT:
1231       return AMDGPU::S_CMP_LT_F32;
1232     case CmpInst::FCMP_OLE:
1233       return AMDGPU::S_CMP_LE_F32;
1234     case CmpInst::FCMP_ONE:
1235       return AMDGPU::S_CMP_LG_F32;
1236     case CmpInst::FCMP_ORD:
1237       return AMDGPU::S_CMP_O_F32;
1238     case CmpInst::FCMP_UNO:
1239       return AMDGPU::S_CMP_U_F32;
1240     case CmpInst::FCMP_UEQ:
1241       return AMDGPU::S_CMP_NLG_F32;
1242     case CmpInst::FCMP_UGT:
1243       return AMDGPU::S_CMP_NLE_F32;
1244     case CmpInst::FCMP_UGE:
1245       return AMDGPU::S_CMP_NLT_F32;
1246     case CmpInst::FCMP_ULT:
1247       return AMDGPU::S_CMP_NGE_F32;
1248     case CmpInst::FCMP_ULE:
1249       return AMDGPU::S_CMP_NGT_F32;
1250     case CmpInst::FCMP_UNE:
1251       return AMDGPU::S_CMP_NEQ_F32;
1252     default:
1253       llvm_unreachable("Unknown condition code!");
1254     }
1255   }
1256 
1257   if (Size == 16) {
1258     if (!STI.hasSALUFloatInsts())
1259       return -1;
1260 
1261     switch (P) {
1262     case CmpInst::FCMP_OEQ:
1263       return AMDGPU::S_CMP_EQ_F16;
1264     case CmpInst::FCMP_OGT:
1265       return AMDGPU::S_CMP_GT_F16;
1266     case CmpInst::FCMP_OGE:
1267       return AMDGPU::S_CMP_GE_F16;
1268     case CmpInst::FCMP_OLT:
1269       return AMDGPU::S_CMP_LT_F16;
1270     case CmpInst::FCMP_OLE:
1271       return AMDGPU::S_CMP_LE_F16;
1272     case CmpInst::FCMP_ONE:
1273       return AMDGPU::S_CMP_LG_F16;
1274     case CmpInst::FCMP_ORD:
1275       return AMDGPU::S_CMP_O_F16;
1276     case CmpInst::FCMP_UNO:
1277       return AMDGPU::S_CMP_U_F16;
1278     case CmpInst::FCMP_UEQ:
1279       return AMDGPU::S_CMP_NLG_F16;
1280     case CmpInst::FCMP_UGT:
1281       return AMDGPU::S_CMP_NLE_F16;
1282     case CmpInst::FCMP_UGE:
1283       return AMDGPU::S_CMP_NLT_F16;
1284     case CmpInst::FCMP_ULT:
1285       return AMDGPU::S_CMP_NGE_F16;
1286     case CmpInst::FCMP_ULE:
1287       return AMDGPU::S_CMP_NGT_F16;
1288     case CmpInst::FCMP_UNE:
1289       return AMDGPU::S_CMP_NEQ_F16;
1290     default:
1291       llvm_unreachable("Unknown condition code!");
1292     }
1293   }
1294 
1295   return -1;
1296 }
1297 
1298 bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1299 
1300   MachineBasicBlock *BB = I.getParent();
1301   const DebugLoc &DL = I.getDebugLoc();
1302 
1303   Register SrcReg = I.getOperand(2).getReg();
1304   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1305 
1306   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1307 
1308   Register CCReg = I.getOperand(0).getReg();
1309   if (!isVCC(CCReg, *MRI)) {
1310     int Opcode = getS_CMPOpcode(Pred, Size);
1311     if (Opcode == -1)
1312       return false;
1313     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1314             .add(I.getOperand(2))
1315             .add(I.getOperand(3));
1316     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1317       .addReg(AMDGPU::SCC);
1318     bool Ret =
1319         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1320         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1321     I.eraseFromParent();
1322     return Ret;
1323   }
1324 
1325   if (I.getOpcode() == AMDGPU::G_FCMP)
1326     return false;
1327 
1328   int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1329   if (Opcode == -1)
1330     return false;
1331 
1332   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1333             I.getOperand(0).getReg())
1334             .add(I.getOperand(2))
1335             .add(I.getOperand(3));
1336   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1337                                *TRI.getBoolRC(), *MRI);
1338   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1339   I.eraseFromParent();
1340   return Ret;
1341 }
1342 
1343 bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1344   Register Dst = I.getOperand(0).getReg();
1345   if (isVCC(Dst, *MRI))
1346     return false;
1347 
1348   LLT DstTy = MRI->getType(Dst);
1349   if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1350     return false;
1351 
1352   MachineBasicBlock *BB = I.getParent();
1353   const DebugLoc &DL = I.getDebugLoc();
1354   Register SrcReg = I.getOperand(2).getReg();
1355   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1356 
1357   // i1 inputs are not supported in GlobalISel.
1358   if (Size == 1)
1359     return false;
1360 
1361   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1362   if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1363     BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1364     I.eraseFromParent();
1365     return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1366   }
1367 
1368   const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1369   if (Opcode == -1)
1370     return false;
1371 
1372   MachineInstrBuilder SelectedMI;
1373   MachineOperand &LHS = I.getOperand(2);
1374   MachineOperand &RHS = I.getOperand(3);
1375   auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
1376   auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
1377   Register Src0Reg =
1378       copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1379   Register Src1Reg =
1380       copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1381   SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1382   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1383     SelectedMI.addImm(Src0Mods);
1384   SelectedMI.addReg(Src0Reg);
1385   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1386     SelectedMI.addImm(Src1Mods);
1387   SelectedMI.addReg(Src1Reg);
1388   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1389     SelectedMI.addImm(0); // clamp
1390   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1391     SelectedMI.addImm(0); // op_sel
1392 
1393   RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1394   if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1395     return false;
1396 
1397   I.eraseFromParent();
1398   return true;
1399 }
1400 
1401 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1402   MachineBasicBlock *BB = I.getParent();
1403   const DebugLoc &DL = I.getDebugLoc();
1404   Register DstReg = I.getOperand(0).getReg();
1405   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1406   const bool Is64 = Size == 64;
1407   const bool IsWave32 = (STI.getWavefrontSize() == 32);
1408 
1409   // In the common case, the return type matches the wave size.
1410   // However we also support emitting i64 ballots in wave32 mode.
1411   if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1412     return false;
1413 
1414   std::optional<ValueAndVReg> Arg =
1415       getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1416 
1417   const auto BuildCopy = [&](Register SrcReg) {
1418     if (Size == STI.getWavefrontSize()) {
1419       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1420           .addReg(SrcReg);
1421       return;
1422     }
1423 
1424     // If emitting an i64 ballot in wave32, fill the upper bits with zeroes.
1425     Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1426     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1427     BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1428         .addReg(SrcReg)
1429         .addImm(AMDGPU::sub0)
1430         .addReg(HiReg)
1431         .addImm(AMDGPU::sub1);
1432   };
1433 
1434   if (Arg) {
1435     const int64_t Value = Arg->Value.getSExtValue();
1436     if (Value == 0) {
1437       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1438       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1439     } else if (Value == -1) // all ones
1440       BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1441     else
1442       return false;
1443   } else
1444     BuildCopy(I.getOperand(2).getReg());
1445 
1446   I.eraseFromParent();
1447   return true;
1448 }
1449 
1450 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1451   Register DstReg = I.getOperand(0).getReg();
1452   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1453   const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1454   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1455     return false;
1456 
1457   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1458 
1459   Module *M = MF->getFunction().getParent();
1460   const MDNode *Metadata = I.getOperand(2).getMetadata();
1461   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1462   auto RelocSymbol = cast<GlobalVariable>(
1463     M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1464 
1465   MachineBasicBlock *BB = I.getParent();
1466   BuildMI(*BB, &I, I.getDebugLoc(),
1467           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1468     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1469 
1470   I.eraseFromParent();
1471   return true;
1472 }
1473 
1474 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1475   Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1476 
1477   Register DstReg = I.getOperand(0).getReg();
1478   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1479   unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1480     AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1481 
1482   MachineBasicBlock *MBB = I.getParent();
1483   const DebugLoc &DL = I.getDebugLoc();
1484 
1485   auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1486 
1487   if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1488     const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1489     MIB.addImm(MFI->getLDSSize());
1490   } else {
1491     Module *M = MF->getFunction().getParent();
1492     const GlobalValue *GV
1493       = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1494     MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1495   }
1496 
1497   I.eraseFromParent();
1498   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1499 }
1500 
1501 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1502   MachineBasicBlock *MBB = I.getParent();
1503   MachineFunction &MF = *MBB->getParent();
1504   const DebugLoc &DL = I.getDebugLoc();
1505 
1506   MachineOperand &Dst = I.getOperand(0);
1507   Register DstReg = Dst.getReg();
1508   unsigned Depth = I.getOperand(2).getImm();
1509 
1510   const TargetRegisterClass *RC
1511     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1512   if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1513       !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1514     return false;
1515 
1516   // Check for kernel and shader functions
1517   if (Depth != 0 ||
1518       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1519     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1520       .addImm(0);
1521     I.eraseFromParent();
1522     return true;
1523   }
1524 
1525   MachineFrameInfo &MFI = MF.getFrameInfo();
1526   // There is a call to @llvm.returnaddress in this function
1527   MFI.setReturnAddressIsTaken(true);
1528 
1529   // Get the return address reg and mark it as an implicit live-in
1530   Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1531   Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1532                                              AMDGPU::SReg_64RegClass, DL);
1533   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1534     .addReg(LiveIn);
1535   I.eraseFromParent();
1536   return true;
1537 }
1538 
1539 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1540   // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1541   // SelectionDAG uses for wave32 vs wave64.
1542   MachineBasicBlock *BB = MI.getParent();
1543   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1544       .add(MI.getOperand(1));
1545 
1546   Register Reg = MI.getOperand(1).getReg();
1547   MI.eraseFromParent();
1548 
1549   if (!MRI->getRegClassOrNull(Reg))
1550     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1551   return true;
1552 }
1553 
1554 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1555   MachineInstr &MI, Intrinsic::ID IntrID) const {
1556   MachineBasicBlock *MBB = MI.getParent();
1557   MachineFunction *MF = MBB->getParent();
1558   const DebugLoc &DL = MI.getDebugLoc();
1559 
1560   unsigned IndexOperand = MI.getOperand(7).getImm();
1561   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1562   bool WaveDone = MI.getOperand(9).getImm() != 0;
1563 
1564   if (WaveDone && !WaveRelease)
1565     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1566 
1567   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1568   IndexOperand &= ~0x3f;
1569   unsigned CountDw = 0;
1570 
1571   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1572     CountDw = (IndexOperand >> 24) & 0xf;
1573     IndexOperand &= ~(0xf << 24);
1574 
1575     if (CountDw < 1 || CountDw > 4) {
1576       report_fatal_error(
1577         "ds_ordered_count: dword count must be between 1 and 4");
1578     }
1579   }
1580 
1581   if (IndexOperand)
1582     report_fatal_error("ds_ordered_count: bad index operand");
1583 
1584   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1585   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1586 
1587   unsigned Offset0 = OrderedCountIndex << 2;
1588   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1589 
1590   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1591     Offset1 |= (CountDw - 1) << 6;
1592 
1593   if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1594     Offset1 |= ShaderType << 2;
1595 
1596   unsigned Offset = Offset0 | (Offset1 << 8);
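       // Resulting layout of the packed offset built above:
       //   [7:0]   ordered count index * 4
       //   [8]     wave_release
       //   [9]     wave_done
       //   [11:10] shader type (pre-GFX11 only)
       //   [12]    instruction (0 = ordered add, 1 = ordered swap)
       //   [15:14] dword count - 1 (GFX10+ only)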
1597 
1598   Register M0Val = MI.getOperand(2).getReg();
1599   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1600     .addReg(M0Val);
1601 
1602   Register DstReg = MI.getOperand(0).getReg();
1603   Register ValReg = MI.getOperand(3).getReg();
1604   MachineInstrBuilder DS =
1605     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1606       .addReg(ValReg)
1607       .addImm(Offset)
1608       .cloneMemRefs(MI);
1609 
1610   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1611     return false;
1612 
1613   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1614   MI.eraseFromParent();
1615   return Ret;
1616 }
1617 
1618 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1619   switch (IntrID) {
1620   case Intrinsic::amdgcn_ds_gws_init:
1621     return AMDGPU::DS_GWS_INIT;
1622   case Intrinsic::amdgcn_ds_gws_barrier:
1623     return AMDGPU::DS_GWS_BARRIER;
1624   case Intrinsic::amdgcn_ds_gws_sema_v:
1625     return AMDGPU::DS_GWS_SEMA_V;
1626   case Intrinsic::amdgcn_ds_gws_sema_br:
1627     return AMDGPU::DS_GWS_SEMA_BR;
1628   case Intrinsic::amdgcn_ds_gws_sema_p:
1629     return AMDGPU::DS_GWS_SEMA_P;
1630   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1631     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1632   default:
1633     llvm_unreachable("not a gws intrinsic");
1634   }
1635 }
1636 
1637 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1638                                                      Intrinsic::ID IID) const {
1639   if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1640                         !STI.hasGWSSemaReleaseAll()))
1641     return false;
1642 
1643   // intrinsic ID, vsrc, offset
1644   const bool HasVSrc = MI.getNumOperands() == 3;
1645   assert(HasVSrc || MI.getNumOperands() == 2);
1646 
1647   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1648   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1649   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1650     return false;
1651 
1652   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1653   unsigned ImmOffset;
1654 
1655   MachineBasicBlock *MBB = MI.getParent();
1656   const DebugLoc &DL = MI.getDebugLoc();
1657 
1658   MachineInstr *Readfirstlane = nullptr;
1659 
1660   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1661   // incoming offset, in case there's an add of a constant. We'll have to put it
1662   // back later.
1663   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1664     Readfirstlane = OffsetDef;
1665     BaseOffset = OffsetDef->getOperand(1).getReg();
1666     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1667   }
1668 
1669   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1670     // If we have a constant offset, try to use the 0 in m0 as the base.
1671     // TODO: Look into changing the default m0 initialization value. If the
1672     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1673     // the immediate offset.
1674 
1675     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1676     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1677       .addImm(0);
1678   } else {
1679     std::tie(BaseOffset, ImmOffset) =
1680         AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1681 
1682     if (Readfirstlane) {
1683       // We have the constant offset now, so put the readfirstlane back on the
1684       // variable component.
1685       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1686         return false;
1687 
1688       Readfirstlane->getOperand(1).setReg(BaseOffset);
1689       BaseOffset = Readfirstlane->getOperand(0).getReg();
1690     } else {
1691       if (!RBI.constrainGenericRegister(BaseOffset,
1692                                         AMDGPU::SReg_32RegClass, *MRI))
1693         return false;
1694     }
1695 
1696     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1697     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1698       .addReg(BaseOffset)
1699       .addImm(16)
1700       .setOperandDead(3); // Dead scc
1701 
1702     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1703       .addReg(M0Base);
1704   }
1705 
1706   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1707   // offset field) % 64. Some versions of the programming guide omit the m0
1708   // part, or claim it's from offset 0.
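       // With the code above, a (variable + constant) offset puts the variable
       // part into M0 (shifted into bits [31:16]) and the constant part into the
       // instruction's offset field, matching the formula above.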
1709   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1710 
1711   if (HasVSrc) {
1712     Register VSrc = MI.getOperand(1).getReg();
1713     MIB.addReg(VSrc);
1714 
1715     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1716       return false;
1717   }
1718 
1719   MIB.addImm(ImmOffset)
1720      .cloneMemRefs(MI);
1721 
1722   TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1723 
1724   MI.eraseFromParent();
1725   return true;
1726 }
1727 
1728 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1729                                                       bool IsAppend) const {
1730   Register PtrBase = MI.getOperand(2).getReg();
1731   LLT PtrTy = MRI->getType(PtrBase);
1732   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1733 
1734   unsigned Offset;
1735   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1736 
1737   // TODO: Should this try to look through readfirstlane like GWS?
1738   if (!isDSOffsetLegal(PtrBase, Offset)) {
1739     PtrBase = MI.getOperand(2).getReg();
1740     Offset = 0;
1741   }
1742 
1743   MachineBasicBlock *MBB = MI.getParent();
1744   const DebugLoc &DL = MI.getDebugLoc();
1745   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1746 
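       // The pointer base is passed in m0; the instruction itself only carries
       // the immediate offset selected above.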
1747   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1748     .addReg(PtrBase);
1749   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1750     return false;
1751 
1752   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1753     .addImm(Offset)
1754     .addImm(IsGDS ? -1 : 0)
1755     .cloneMemRefs(MI);
1756   MI.eraseFromParent();
1757   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1758 }
1759 
1760 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1761   if (TM.getOptLevel() > CodeGenOptLevel::None) {
1762     unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
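         // If the maximum workgroup size fits in a single wave, all lanes of the
         // workgroup already execute together, so the barrier can be dropped; a
         // WAVE_BARRIER pseudo is enough to keep the scheduler from moving memory
         // operations across this point.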
1763     if (WGSize <= STI.getWavefrontSize()) {
1764       MachineBasicBlock *MBB = MI.getParent();
1765       const DebugLoc &DL = MI.getDebugLoc();
1766       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1767       MI.eraseFromParent();
1768       return true;
1769     }
1770   }
1771   // On GFX12, lower s_barrier into s_barrier_signal_imm and s_barrier_wait.
1772   // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1773   if (STI.hasSplitBarriers()) {
1774     MachineBasicBlock *MBB = MI.getParent();
1775     const DebugLoc &DL = MI.getDebugLoc();
1776     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1777         .addImm(AMDGPU::Barrier::WORKGROUP);
1778     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1779         .addImm(AMDGPU::Barrier::WORKGROUP);
1780     MI.eraseFromParent();
1781     return true;
1782   }
1783 
1784   return selectImpl(MI, *CoverageInfo);
1785 }
1786 
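     // Decode the texfailctrl immediate: bit 0 enables TFE, bit 1 enables LWE, and
     // any other set bit marks the control value as invalid.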
1787 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1788                          bool &IsTexFail) {
1789   if (TexFailCtrl)
1790     IsTexFail = true;
1791 
1792   TFE = (TexFailCtrl & 0x1) != 0;
1793   TexFailCtrl &= ~(uint64_t)0x1;
1794   LWE = (TexFailCtrl & 0x2) != 0;
1795   TexFailCtrl &= ~(uint64_t)0x2;
1796 
1797   return TexFailCtrl == 0;
1798 }
1799 
1800 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1801   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1802   MachineBasicBlock *MBB = MI.getParent();
1803   const DebugLoc &DL = MI.getDebugLoc();
1804 
1805   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1806     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1807 
1808   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1809   unsigned IntrOpcode = Intr->BaseOpcode;
1810   const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1811   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1812   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1813 
1814   const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1815 
1816   Register VDataIn, VDataOut;
1817   LLT VDataTy;
1818   int NumVDataDwords = -1;
1819   bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1820                MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1821 
1822   bool Unorm;
1823   if (!BaseOpcode->Sampler)
1824     Unorm = true;
1825   else
1826     Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1827 
1828   bool TFE;
1829   bool LWE;
1830   bool IsTexFail = false;
1831   if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1832                     TFE, LWE, IsTexFail))
1833     return false;
1834 
1835   const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1836   const bool IsA16 = (Flags & 1) != 0;
1837   const bool IsG16 = (Flags & 2) != 0;
1838 
1839   // A16 implies 16-bit gradients if the subtarget doesn't support G16.
1840   if (IsA16 && !STI.hasG16() && !IsG16)
1841     return false;
1842 
1843   unsigned DMask = 0;
1844   unsigned DMaskLanes = 0;
1845 
1846   if (BaseOpcode->Atomic) {
1847     VDataOut = MI.getOperand(0).getReg();
1848     VDataIn = MI.getOperand(2).getReg();
1849     LLT Ty = MRI->getType(VDataIn);
1850 
1851     // Be careful to allow atomic swap on 16-bit element vectors.
1852     const bool Is64Bit = BaseOpcode->AtomicX2 ?
1853       Ty.getSizeInBits() == 128 :
1854       Ty.getSizeInBits() == 64;
1855 
1856     if (BaseOpcode->AtomicX2) {
1857       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1858 
1859       DMask = Is64Bit ? 0xf : 0x3;
1860       NumVDataDwords = Is64Bit ? 4 : 2;
1861     } else {
1862       DMask = Is64Bit ? 0x3 : 0x1;
1863       NumVDataDwords = Is64Bit ? 2 : 1;
1864     }
1865   } else {
1866     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1867     DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1868 
1869     if (BaseOpcode->Store) {
1870       VDataIn = MI.getOperand(1).getReg();
1871       VDataTy = MRI->getType(VDataIn);
1872       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1873     } else if (BaseOpcode->NoReturn) {
1874       NumVDataDwords = 0;
1875     } else {
1876       VDataOut = MI.getOperand(0).getReg();
1877       VDataTy = MRI->getType(VDataOut);
1878       NumVDataDwords = DMaskLanes;
1879 
1880       if (IsD16 && !STI.hasUnpackedD16VMem())
1881         NumVDataDwords = (DMaskLanes + 1) / 2;
1882     }
1883   }
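       // E.g. a d16 load with dmask = 0b0111 reads three components; on targets
       // with packed d16 VMEM they fit in (3 + 1) / 2 = 2 vdata dwords.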
1884 
1885   // Set G16 opcode
1886   if (Subtarget->hasG16() && IsG16) {
1887     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1888         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1889     assert(G16MappingInfo);
1890     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1891   }
1892 
1893   // TODO: Check this in verifier.
1894   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1895 
1896   unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1897   if (BaseOpcode->Atomic)
1898     CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1899   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
1900                AMDGPU::CPol::VOLATILE))
1901     return false;
1902 
1903   int NumVAddrRegs = 0;
1904   int NumVAddrDwords = 0;
1905   for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1906     // Skip the $noregs and 0s inserted during legalization.
1907     MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1908     if (!AddrOp.isReg())
1909       continue; // XXX - Break?
1910 
1911     Register Addr = AddrOp.getReg();
1912     if (!Addr)
1913       break;
1914 
1915     ++NumVAddrRegs;
1916     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1917   }
1918 
1919   // The legalizer preprocessed the intrinsic arguments. If we aren't using
1920   // NSA, these should have been packed into a single value in the first
1921   // address register.
1922   const bool UseNSA =
1923       NumVAddrRegs != 1 &&
1924       (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
1925                                    : NumVAddrDwords == NumVAddrRegs);
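       // E.g. three separate 32-bit address registers give NumVAddrRegs ==
       // NumVAddrDwords == 3 and select an NSA encoding, whereas a single packed
       // 96-bit address register gives NumVAddrRegs == 1 and uses the non-NSA
       // form.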
1926   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1927     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1928     return false;
1929   }
1930 
1931   if (IsTexFail)
1932     ++NumVDataDwords;
1933 
1934   int Opcode = -1;
1935   if (IsGFX12Plus) {
1936     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
1937                                    NumVDataDwords, NumVAddrDwords);
1938   } else if (IsGFX11Plus) {
1939     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1940                                    UseNSA ? AMDGPU::MIMGEncGfx11NSA
1941                                           : AMDGPU::MIMGEncGfx11Default,
1942                                    NumVDataDwords, NumVAddrDwords);
1943   } else if (IsGFX10Plus) {
1944     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1945                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
1946                                           : AMDGPU::MIMGEncGfx10Default,
1947                                    NumVDataDwords, NumVAddrDwords);
1948   } else {
1949     if (Subtarget->hasGFX90AInsts()) {
1950       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
1951                                      NumVDataDwords, NumVAddrDwords);
1952       if (Opcode == -1) {
1953         LLVM_DEBUG(
1954             dbgs()
1955             << "requested image instruction is not supported on this GPU\n");
1956         return false;
1957       }
1958     }
1959     if (Opcode == -1 &&
1960         STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1961       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1962                                      NumVDataDwords, NumVAddrDwords);
1963     if (Opcode == -1)
1964       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1965                                      NumVDataDwords, NumVAddrDwords);
1966   }
1967   if (Opcode == -1)
1968     return false;
1969 
1970   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1971     .cloneMemRefs(MI);
1972 
1973   if (VDataOut) {
1974     if (BaseOpcode->AtomicX2) {
1975       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1976 
1977       Register TmpReg = MRI->createVirtualRegister(
1978         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1979       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1980 
1981       MIB.addDef(TmpReg);
1982       if (!MRI->use_empty(VDataOut)) {
1983         BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1984             .addReg(TmpReg, RegState::Kill, SubReg);
1985       }
1986 
1987     } else {
1988       MIB.addDef(VDataOut); // vdata output
1989     }
1990   }
1991 
1992   if (VDataIn)
1993     MIB.addReg(VDataIn); // vdata input
1994 
1995   for (int I = 0; I != NumVAddrRegs; ++I) {
1996     MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1997     if (SrcOp.isReg()) {
1998       assert(SrcOp.getReg() != 0);
1999       MIB.addReg(SrcOp.getReg());
2000     }
2001   }
2002 
2003   MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2004   if (BaseOpcode->Sampler)
2005     MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2006 
2007   MIB.addImm(DMask); // dmask
2008 
2009   if (IsGFX10Plus)
2010     MIB.addImm(DimInfo->Encoding);
2011   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2012     MIB.addImm(Unorm);
2013 
2014   MIB.addImm(CPol);
2015   MIB.addImm(IsA16 &&  // a16 or r128
2016              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2017   if (IsGFX10Plus)
2018     MIB.addImm(IsA16 ? -1 : 0);
2019 
2020   if (!Subtarget->hasGFX90AInsts()) {
2021     MIB.addImm(TFE); // tfe
2022   } else if (TFE) {
2023     LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2024     return false;
2025   }
2026 
2027   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2028     MIB.addImm(LWE); // lwe
2029   if (!IsGFX10Plus)
2030     MIB.addImm(DimInfo->DA ? -1 : 0);
2031   if (BaseOpcode->HasD16)
2032     MIB.addImm(IsD16 ? -1 : 0);
2033 
2034   MI.eraseFromParent();
2035   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2036   TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2037   return true;
2038 }
2039 
2040 // We need to handle this here because tablegen doesn't support matching
2041 // instructions with multiple outputs.
2042 bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2043     MachineInstr &MI) const {
2044   Register Dst0 = MI.getOperand(0).getReg();
2045   Register Dst1 = MI.getOperand(1).getReg();
2046 
2047   const DebugLoc &DL = MI.getDebugLoc();
2048   MachineBasicBlock *MBB = MI.getParent();
2049 
2050   Register Addr = MI.getOperand(3).getReg();
2051   Register Data0 = MI.getOperand(4).getReg();
2052   Register Data1 = MI.getOperand(5).getReg();
2053   unsigned Offset = MI.getOperand(6).getImm();
2054 
2055   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2056                  .addDef(Dst1)
2057                  .addUse(Addr)
2058                  .addUse(Data0)
2059                  .addUse(Data1)
2060                  .addImm(Offset)
2061                  .cloneMemRefs(MI);
2062 
2063   MI.eraseFromParent();
2064   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2065 }
2066 
2067 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2068     MachineInstr &I) const {
2069   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2070   switch (IntrinsicID) {
2071   case Intrinsic::amdgcn_end_cf:
2072     return selectEndCfIntrinsic(I);
2073   case Intrinsic::amdgcn_ds_ordered_add:
2074   case Intrinsic::amdgcn_ds_ordered_swap:
2075     return selectDSOrderedIntrinsic(I, IntrinsicID);
2076   case Intrinsic::amdgcn_ds_gws_init:
2077   case Intrinsic::amdgcn_ds_gws_barrier:
2078   case Intrinsic::amdgcn_ds_gws_sema_v:
2079   case Intrinsic::amdgcn_ds_gws_sema_br:
2080   case Intrinsic::amdgcn_ds_gws_sema_p:
2081   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2082     return selectDSGWSIntrinsic(I, IntrinsicID);
2083   case Intrinsic::amdgcn_ds_append:
2084     return selectDSAppendConsume(I, true);
2085   case Intrinsic::amdgcn_ds_consume:
2086     return selectDSAppendConsume(I, false);
2087   case Intrinsic::amdgcn_s_barrier:
2088     return selectSBarrier(I);
2089   case Intrinsic::amdgcn_raw_buffer_load_lds:
2090   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2091   case Intrinsic::amdgcn_struct_buffer_load_lds:
2092   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2093     return selectBufferLoadLds(I);
2094   case Intrinsic::amdgcn_global_load_lds:
2095     return selectGlobalLoadLds(I);
2096   case Intrinsic::amdgcn_exp_compr:
2097     if (!STI.hasCompressedExport()) {
2098       Function &F = I.getMF()->getFunction();
2099       DiagnosticInfoUnsupported NoFpRet(
2100           F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2101       F.getContext().diagnose(NoFpRet);
2102       return false;
2103     }
2104     break;
2105   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2106     return selectDSBvhStackIntrinsic(I);
2107   case Intrinsic::amdgcn_s_barrier_init:
2108   case Intrinsic::amdgcn_s_barrier_join:
2109   case Intrinsic::amdgcn_s_wakeup_barrier:
2110   case Intrinsic::amdgcn_s_get_barrier_state:
2111     return selectNamedBarrierInst(I, IntrinsicID);
2112   case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2113   case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
2114     return selectSBarrierSignalIsfirst(I, IntrinsicID);
2115   case Intrinsic::amdgcn_s_barrier_leave:
2116     return selectSBarrierLeave(I);
2117   }
2118   return selectImpl(I, *CoverageInfo);
2119 }
2120 
2121 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2122   if (selectImpl(I, *CoverageInfo))
2123     return true;
2124 
2125   MachineBasicBlock *BB = I.getParent();
2126   const DebugLoc &DL = I.getDebugLoc();
2127 
2128   Register DstReg = I.getOperand(0).getReg();
2129   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2130   assert(Size <= 32 || Size == 64);
2131   const MachineOperand &CCOp = I.getOperand(1);
2132   Register CCReg = CCOp.getReg();
2133   if (!isVCC(CCReg, *MRI)) {
2134     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2135                                          AMDGPU::S_CSELECT_B32;
2136     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2137             .addReg(CCReg);
2138 
2139     // The generic constrainSelectedInstRegOperands doesn't work for the scc
2140     // register bank, because it does not cover the register class that we use
2141     // to represent it, so we need to set the register class manually here.
2142     if (!MRI->getRegClassOrNull(CCReg))
2143         MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2144     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2145             .add(I.getOperand(2))
2146             .add(I.getOperand(3));
2147 
2148     bool Ret = false;
2149     Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2150     Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2151     I.eraseFromParent();
2152     return Ret;
2153   }
2154 
2155   // Wide VGPR select should have been split in RegBankSelect.
2156   if (Size > 32)
2157     return false;
2158 
2159   MachineInstr *Select =
2160       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2161               .addImm(0)
2162               .add(I.getOperand(3))
2163               .addImm(0)
2164               .add(I.getOperand(2))
2165               .add(I.getOperand(1));
2166 
2167   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2168   I.eraseFromParent();
2169   return Ret;
2170 }
2171 
2172 static int sizeToSubRegIndex(unsigned Size) {
2173   switch (Size) {
2174   case 32:
2175     return AMDGPU::sub0;
2176   case 64:
2177     return AMDGPU::sub0_sub1;
2178   case 96:
2179     return AMDGPU::sub0_sub1_sub2;
2180   case 128:
2181     return AMDGPU::sub0_sub1_sub2_sub3;
2182   case 256:
2183     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2184   default:
2185     if (Size < 32)
2186       return AMDGPU::sub0;
2187     if (Size > 256)
2188       return -1;
2189     return sizeToSubRegIndex(llvm::bit_ceil(Size));
2190   }
2191 }
2192 
2193 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2194   Register DstReg = I.getOperand(0).getReg();
2195   Register SrcReg = I.getOperand(1).getReg();
2196   const LLT DstTy = MRI->getType(DstReg);
2197   const LLT SrcTy = MRI->getType(SrcReg);
2198   const LLT S1 = LLT::scalar(1);
2199 
2200   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2201   const RegisterBank *DstRB;
2202   if (DstTy == S1) {
2203     // This is a special case. We don't treat s1 for legalization artifacts as
2204     // vcc booleans.
2205     DstRB = SrcRB;
2206   } else {
2207     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2208     if (SrcRB != DstRB)
2209       return false;
2210   }
2211 
2212   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2213 
2214   unsigned DstSize = DstTy.getSizeInBits();
2215   unsigned SrcSize = SrcTy.getSizeInBits();
2216 
2217   const TargetRegisterClass *SrcRC =
2218       TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2219   const TargetRegisterClass *DstRC =
2220       TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2221   if (!SrcRC || !DstRC)
2222     return false;
2223 
2224   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2225       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2226     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2227     return false;
2228   }
2229 
2230   if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2231     MachineBasicBlock *MBB = I.getParent();
2232     const DebugLoc &DL = I.getDebugLoc();
2233 
2234     Register LoReg = MRI->createVirtualRegister(DstRC);
2235     Register HiReg = MRI->createVirtualRegister(DstRC);
2236     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2237       .addReg(SrcReg, 0, AMDGPU::sub0);
2238     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2239       .addReg(SrcReg, 0, AMDGPU::sub1);
2240 
2241     if (IsVALU && STI.hasSDWA()) {
2242       // Write the low 16 bits of the high element into the high 16 bits of the
2243       // low element.
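           // i.e. Dst = (Hi << 16) | (Lo & 0xffff), which is what the non-SDWA
           // path below computes explicitly.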
2244       MachineInstr *MovSDWA =
2245         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2246         .addImm(0)                             // $src0_modifiers
2247         .addReg(HiReg)                         // $src0
2248         .addImm(0)                             // $clamp
2249         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2250         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2251         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2252         .addReg(LoReg, RegState::Implicit);
2253       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2254     } else {
2255       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2256       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2257       Register ImmReg = MRI->createVirtualRegister(DstRC);
2258       if (IsVALU) {
2259         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2260           .addImm(16)
2261           .addReg(HiReg);
2262       } else {
2263         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2264           .addReg(HiReg)
2265           .addImm(16)
2266           .setOperandDead(3); // Dead scc
2267       }
2268 
2269       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2270       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2271       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2272 
2273       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2274         .addImm(0xffff);
2275       auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2276         .addReg(LoReg)
2277         .addReg(ImmReg);
2278       auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2279         .addReg(TmpReg0)
2280         .addReg(TmpReg1);
2281 
2282       if (!IsVALU) {
2283         And.setOperandDead(3); // Dead scc
2284         Or.setOperandDead(3); // Dead scc
2285       }
2286     }
2287 
2288     I.eraseFromParent();
2289     return true;
2290   }
2291 
2292   if (!DstTy.isScalar())
2293     return false;
2294 
2295   if (SrcSize > 32) {
2296     int SubRegIdx = sizeToSubRegIndex(DstSize);
2297     if (SubRegIdx == -1)
2298       return false;
2299 
2300     // Deal with weird cases where the class only partially supports the subreg
2301     // index.
2302     const TargetRegisterClass *SrcWithSubRC
2303       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2304     if (!SrcWithSubRC)
2305       return false;
2306 
2307     if (SrcWithSubRC != SrcRC) {
2308       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2309         return false;
2310     }
2311 
2312     I.getOperand(1).setSubReg(SubRegIdx);
2313   }
2314 
2315   I.setDesc(TII.get(TargetOpcode::COPY));
2316   return true;
2317 }
2318 
2319 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
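     /// E.g. Size == 6 gives Mask == 0x3f == 63, which fits in an inline constant,
     /// while Size == 7 gives 0x7f == 127, which would require a literal.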
2320 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2321   Mask = maskTrailingOnes<unsigned>(Size);
2322   int SignedMask = static_cast<int>(Mask);
2323   return SignedMask >= -16 && SignedMask <= 64;
2324 }
2325 
2326 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2327 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2328   Register Reg, const MachineRegisterInfo &MRI,
2329   const TargetRegisterInfo &TRI) const {
2330   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2331   if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2332     return RB;
2333 
2334   // Ignore the type, since we don't use vcc in artifacts.
2335   if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2336     return &RBI.getRegBankFromRegClass(*RC, LLT());
2337   return nullptr;
2338 }
2339 
2340 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2341   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2342   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2343   const DebugLoc &DL = I.getDebugLoc();
2344   MachineBasicBlock &MBB = *I.getParent();
2345   const Register DstReg = I.getOperand(0).getReg();
2346   const Register SrcReg = I.getOperand(1).getReg();
2347 
2348   const LLT DstTy = MRI->getType(DstReg);
2349   const LLT SrcTy = MRI->getType(SrcReg);
2350   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2351     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2352   const unsigned DstSize = DstTy.getSizeInBits();
2353   if (!DstTy.isScalar())
2354     return false;
2355 
2356   // Artifact casts should never use vcc.
2357   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2358 
2359   // FIXME: This should probably be illegal and split earlier.
2360   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2361     if (DstSize <= 32)
2362       return selectCOPY(I);
2363 
2364     const TargetRegisterClass *SrcRC =
2365         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2366     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2367     const TargetRegisterClass *DstRC =
2368         TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2369 
2370     Register UndefReg = MRI->createVirtualRegister(SrcRC);
2371     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2372     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2373       .addReg(SrcReg)
2374       .addImm(AMDGPU::sub0)
2375       .addReg(UndefReg)
2376       .addImm(AMDGPU::sub1);
2377     I.eraseFromParent();
2378 
2379     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2380            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2381   }
2382 
2383   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2384     // 64-bit sources should have been split up in RegBankSelect.
2385 
2386     // Try to use an and with a mask if it will save code size.
2387     unsigned Mask;
2388     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2389       MachineInstr *ExtI =
2390       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2391         .addImm(Mask)
2392         .addReg(SrcReg);
2393       I.eraseFromParent();
2394       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2395     }
2396 
2397     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2398     MachineInstr *ExtI =
2399       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2400       .addReg(SrcReg)
2401       .addImm(0) // Offset
2402       .addImm(SrcSize); // Width
2403     I.eraseFromParent();
2404     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2405   }
2406 
2407   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2408     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2409       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2410     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2411       return false;
2412 
2413     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2414       const unsigned SextOpc = SrcSize == 8 ?
2415         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2416       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2417         .addReg(SrcReg);
2418       I.eraseFromParent();
2419       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2420     }
2421 
2422     // Using a single 32-bit SALU to calculate the high half is smaller than
2423     // S_BFE with a literal constant operand.
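         // For a sign extend the high half is src >> 31 (arithmetic shift); for a
         // zero extend it is simply 0. The REG_SEQUENCE below pairs it with the
         // unmodified low half.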
2424     if (DstSize > 32 && SrcSize == 32) {
2425       Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2426       unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2427       if (Signed) {
2428         BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2429           .addReg(SrcReg, 0, SubReg)
2430           .addImm(31)
2431           .setOperandDead(3); // Dead scc
2432       } else {
2433         BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2434           .addImm(0);
2435       }
2436       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2437         .addReg(SrcReg, 0, SubReg)
2438         .addImm(AMDGPU::sub0)
2439         .addReg(HiReg)
2440         .addImm(AMDGPU::sub1);
2441       I.eraseFromParent();
2442       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2443                                           *MRI);
2444     }
2445 
2446     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2447     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2448 
2449     // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
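         // E.g. a sign extend from 16 bits uses immediate 16 << 16 == 0x100000
         // (width 16, offset 0).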
2450     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2451       // We need a 64-bit register source, but the high bits don't matter.
2452       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2453       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2454       unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2455 
2456       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2457       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2458         .addReg(SrcReg, 0, SubReg)
2459         .addImm(AMDGPU::sub0)
2460         .addReg(UndefReg)
2461         .addImm(AMDGPU::sub1);
2462 
2463       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2464         .addReg(ExtReg)
2465         .addImm(SrcSize << 16);
2466 
2467       I.eraseFromParent();
2468       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2469     }
2470 
2471     unsigned Mask;
2472     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2473       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2474         .addReg(SrcReg)
2475         .addImm(Mask)
2476         .setOperandDead(3); // Dead scc
2477     } else {
2478       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2479         .addReg(SrcReg)
2480         .addImm(SrcSize << 16);
2481     }
2482 
2483     I.eraseFromParent();
2484     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2485   }
2486 
2487   return false;
2488 }
2489 
2490 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2491                            Register &Out) {
2492   Register LShlSrc;
2493   if (mi_match(In, MRI,
2494                m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2495     Out = LShlSrc;
2496     return true;
2497   }
2498   return false;
2499 }
2500 
2501 bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2502   if (!Subtarget->hasSALUFloatInsts())
2503     return false;
2504 
2505   Register Dst = I.getOperand(0).getReg();
2506   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2507   if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2508     return false;
2509 
2510   Register Src = I.getOperand(1).getReg();
2511 
2512   if (MRI->getType(Dst) == LLT::scalar(32) &&
2513       MRI->getType(Src) == LLT::scalar(16)) {
2514     if (isExtractHiElt(*MRI, Src, Src)) {
2515       MachineBasicBlock *BB = I.getParent();
2516       BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2517           .addUse(Src);
2518       I.eraseFromParent();
2519       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2520     }
2521   }
2522 
2523   return false;
2524 }
2525 
2526 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2527   MachineBasicBlock *BB = I.getParent();
2528   MachineOperand &ImmOp = I.getOperand(1);
2529   Register DstReg = I.getOperand(0).getReg();
2530   unsigned Size = MRI->getType(DstReg).getSizeInBits();
2531   bool IsFP = false;
2532 
2533   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2534   if (ImmOp.isFPImm()) {
2535     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2536     ImmOp.ChangeToImmediate(Imm.getZExtValue());
2537     IsFP = true;
2538   } else if (ImmOp.isCImm()) {
2539     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2540   } else {
2541     llvm_unreachable("Not supported by g_constants");
2542   }
2543 
2544   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2545   const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2546 
2547   unsigned Opcode;
2548   if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2549     Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2550   } else if (Size == 64 &&
2551              AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
2552     Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
2553     I.setDesc(TII.get(Opcode));
2554     I.addImplicitDefUseOperands(*MF);
2555     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2556   } else {
2557     Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2558 
2559     // We should never produce s1 values on banks other than VCC. If the user of
2560     // this already constrained the register, we may incorrectly think it's VCC
2561     // if it wasn't originally.
2562     if (Size == 1)
2563       return false;
2564   }
2565 
2566   if (Size != 64) {
2567     I.setDesc(TII.get(Opcode));
2568     I.addImplicitDefUseOperands(*MF);
2569     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2570   }
2571 
2572   const DebugLoc &DL = I.getDebugLoc();
2573 
2574   APInt Imm(Size, I.getOperand(1).getImm());
2575 
2576   MachineInstr *ResInst;
2577   if (IsSgpr && TII.isInlineConstant(Imm)) {
2578     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2579       .addImm(I.getOperand(1).getImm());
2580   } else {
2581     const TargetRegisterClass *RC = IsSgpr ?
2582       &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2583     Register LoReg = MRI->createVirtualRegister(RC);
2584     Register HiReg = MRI->createVirtualRegister(RC);
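         // Split the 64-bit immediate into two 32-bit moves and reassemble the
         // halves with a REG_SEQUENCE, e.g. 0x100000001 becomes lo = 1, hi = 1.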
2585 
2586     BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2587       .addImm(Imm.trunc(32).getZExtValue());
2588 
2589     BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2590       .addImm(Imm.ashr(32).getZExtValue());
2591 
2592     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2593       .addReg(LoReg)
2594       .addImm(AMDGPU::sub0)
2595       .addReg(HiReg)
2596       .addImm(AMDGPU::sub1);
2597   }
2598 
2599   // We can't call constrainSelectedInstRegOperands here, because it doesn't
2600   // work for target-independent opcodes.
2601   I.eraseFromParent();
2602   const TargetRegisterClass *DstRC =
2603     TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2604   if (!DstRC)
2605     return true;
2606   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2607 }
2608 
2609 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2610   // Only manually handle the f64 SGPR case.
2611   //
2612   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2613   // the bit ops theoretically have a second result due to the implicit def of
2614   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2615   // that is easy by disabling the check. The result works, but uses a
2616   // nonsensical sreg32orlds_and_sreg_1 regclass.
2617   //
2618   // The DAG emitter is more problematic, and incorrectly adds both results of
2619   // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2620 
2621   Register Dst = MI.getOperand(0).getReg();
2622   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2623   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2624       MRI->getType(Dst) != LLT::scalar(64))
2625     return false;
2626 
2627   Register Src = MI.getOperand(1).getReg();
2628   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2629   if (Fabs)
2630     Src = Fabs->getOperand(1).getReg();
2631 
2632   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2633       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2634     return false;
2635 
2636   MachineBasicBlock *BB = MI.getParent();
2637   const DebugLoc &DL = MI.getDebugLoc();
2638   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2639   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2640   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2641   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2642 
2643   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2644     .addReg(Src, 0, AMDGPU::sub0);
2645   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2646     .addReg(Src, 0, AMDGPU::sub1);
2647   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2648     .addImm(0x80000000);
2649 
2650   // Set or toggle sign bit.
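       // fneg(fabs(x)) forces the sign bit on (OR); a plain fneg flips it (XOR).
       // Either way only the high 32 bits of the f64 value are touched.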
2651   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2652   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2653     .addReg(HiReg)
2654     .addReg(ConstReg)
2655     .setOperandDead(3); // Dead scc
2656   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2657     .addReg(LoReg)
2658     .addImm(AMDGPU::sub0)
2659     .addReg(OpReg)
2660     .addImm(AMDGPU::sub1);
2661   MI.eraseFromParent();
2662   return true;
2663 }
2664 
2665 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2666 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2667   Register Dst = MI.getOperand(0).getReg();
2668   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2669   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2670       MRI->getType(Dst) != LLT::scalar(64))
2671     return false;
2672 
2673   Register Src = MI.getOperand(1).getReg();
2674   MachineBasicBlock *BB = MI.getParent();
2675   const DebugLoc &DL = MI.getDebugLoc();
2676   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2677   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2678   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2679   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2680 
2681   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2682       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2683     return false;
2684 
2685   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2686     .addReg(Src, 0, AMDGPU::sub0);
2687   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2688     .addReg(Src, 0, AMDGPU::sub1);
2689   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2690     .addImm(0x7fffffff);
2691 
2692   // Clear sign bit.
2693   // TODO: Should this use S_BITSET0_*?
2694   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2695     .addReg(HiReg)
2696     .addReg(ConstReg)
2697     .setOperandDead(3); // Dead scc
2698   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2699     .addReg(LoReg)
2700     .addImm(AMDGPU::sub0)
2701     .addReg(OpReg)
2702     .addImm(AMDGPU::sub1);
2703 
2704   MI.eraseFromParent();
2705   return true;
2706 }
2707 
2708 static bool isConstant(const MachineInstr &MI) {
2709   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2710 }
2711 
2712 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2713     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2714 
2715   unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2716   const MachineInstr *PtrMI =
2717       MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2718 
2719   assert(PtrMI);
2720 
2721   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2722     return;
2723 
2724   GEPInfo GEPInfo;
2725 
2726   for (unsigned i = 1; i != 3; ++i) {
2727     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2728     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2729     assert(OpDef);
2730     if (i == 2 && isConstant(*OpDef)) {
2731       // TODO: Could handle constant base + variable offset, but a combine
2732       // probably should have commuted it.
2733       assert(GEPInfo.Imm == 0);
2734       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2735       continue;
2736     }
2737     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2738     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2739       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2740     else
2741       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2742   }
2743 
2744   AddrInfo.push_back(GEPInfo);
2745   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2746 }
2747 
2748 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2749   return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2750 }
2751 
2752 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2753   if (!MI.hasOneMemOperand())
2754     return false;
2755 
2756   const MachineMemOperand *MMO = *MI.memoperands_begin();
2757   const Value *Ptr = MMO->getValue();
2758 
2759   // UndefValue means this is a load of a kernel input.  These are uniform.
2760   // Sometimes LDS instructions have constant pointers.
2761   // If Ptr is null, then that means this mem operand contains a
2762   // PseudoSourceValue like GOT.
2763   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2764       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2765     return true;
2766 
2767   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2768     return true;
2769 
2770   if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2771     return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2772            AMDGPU::SGPRRegBankID;
2773 
2774   const Instruction *I = dyn_cast<Instruction>(Ptr);
2775   return I && I->getMetadata("amdgpu.uniform");
2776 }
2777 
2778 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2779   for (const GEPInfo &GEPInfo : AddrInfo) {
2780     if (!GEPInfo.VgprParts.empty())
2781       return true;
2782   }
2783   return false;
2784 }
2785 
2786 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2787   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2788   unsigned AS = PtrTy.getAddressSpace();
2789   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2790       STI.ldsRequiresM0Init()) {
2791     MachineBasicBlock *BB = I.getParent();
2792 
2793     // If DS instructions require M0 initialization, insert it before selecting.
2794     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2795       .addImm(-1);
2796   }
2797 }
2798 
2799 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2800   MachineInstr &I) const {
2801   initM0(I);
2802   return selectImpl(I, *CoverageInfo);
2803 }
2804 
2805 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2806   if (Reg.isPhysical())
2807     return false;
2808 
2809   MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2810   const unsigned Opcode = MI.getOpcode();
2811 
2812   if (Opcode == AMDGPU::COPY)
2813     return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2814 
2815   if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2816       Opcode == AMDGPU::G_XOR)
2817     return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2818            isVCmpResult(MI.getOperand(2).getReg(), MRI);
2819 
2820   if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2821     return GI->is(Intrinsic::amdgcn_class);
2822 
2823   return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2824 }
2825 
2826 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2827   MachineBasicBlock *BB = I.getParent();
2828   MachineOperand &CondOp = I.getOperand(0);
2829   Register CondReg = CondOp.getReg();
2830   const DebugLoc &DL = I.getDebugLoc();
2831 
2832   unsigned BrOpcode;
2833   Register CondPhysReg;
2834   const TargetRegisterClass *ConstrainRC;
2835 
2836   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2837   // whether the branch is uniform when selecting the instruction. In
2838   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2839   // RegBankSelect knows what it's doing if the branch condition is scc, even
2840   // though it currently does not.
2841   if (!isVCC(CondReg, *MRI)) {
2842     if (MRI->getType(CondReg) != LLT::scalar(32))
2843       return false;
2844 
2845     CondPhysReg = AMDGPU::SCC;
2846     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2847     ConstrainRC = &AMDGPU::SReg_32RegClass;
2848   } else {
2849     // FIXME: Should scc->vcc copies be and'ed with exec?
2850 
2851     // Unless the value of CondReg is a result of a V_CMP* instruction, we
2852     // need to insert an and with exec.
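         // The and with exec keeps only the bits for currently active lanes;
         // without it, stale bits in inactive lanes of CondReg could make VCC
         // nonzero and cause S_CBRANCH_VCCNZ to be taken spuriously, e.g.:
         //   %tmp = S_AND_B64 %cond, $exec, implicit-def dead $scc   (wave64)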
2853     if (!isVCmpResult(CondReg, *MRI)) {
2854       const bool Is64 = STI.isWave64();
2855       const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2856       const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2857 
2858       Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2859       BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2860           .addReg(CondReg)
2861           .addReg(Exec)
2862           .setOperandDead(3); // Dead scc
2863       CondReg = TmpReg;
2864     }
2865 
2866     CondPhysReg = TRI.getVCC();
2867     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2868     ConstrainRC = TRI.getBoolRC();
2869   }
2870 
2871   if (!MRI->getRegClassOrNull(CondReg))
2872     MRI->setRegClass(CondReg, ConstrainRC);
2873 
2874   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2875     .addReg(CondReg);
2876   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2877     .addMBB(I.getOperand(1).getMBB());
2878 
2879   I.eraseFromParent();
2880   return true;
2881 }
2882 
2883 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2884   MachineInstr &I) const {
2885   Register DstReg = I.getOperand(0).getReg();
2886   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2887   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2888   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2889   if (IsVGPR)
2890     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2891 
2892   return RBI.constrainGenericRegister(
2893     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2894 }
2895 
2896 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2897   Register DstReg = I.getOperand(0).getReg();
2898   Register SrcReg = I.getOperand(1).getReg();
2899   Register MaskReg = I.getOperand(2).getReg();
2900   LLT Ty = MRI->getType(DstReg);
2901   LLT MaskTy = MRI->getType(MaskReg);
2902   MachineBasicBlock *BB = I.getParent();
2903   const DebugLoc &DL = I.getDebugLoc();
2904 
2905   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2906   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2907   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2908   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2909   if (DstRB != SrcRB) // Should only happen for hand written MIR.
2910     return false;
2911 
2912   // Try to avoid emitting a bit operation when we only need to touch half of
2913   // the 64-bit pointer.
2914   APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2915   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2916   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2917 
2918   const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2919   const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
2920 
2921   if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2922       !CanCopyLow32 && !CanCopyHi32) {
2923     auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2924       .addReg(SrcReg)
2925       .addReg(MaskReg)
2926       .setOperandDead(3); // Dead scc
2927     I.eraseFromParent();
2928     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2929   }
2930 
2931   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2932   const TargetRegisterClass &RegRC
2933     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2934 
2935   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2936   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2937   const TargetRegisterClass *MaskRC =
2938       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2939 
2940   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2941       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2942       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2943     return false;
2944 
2945   if (Ty.getSizeInBits() == 32) {
2946     assert(MaskTy.getSizeInBits() == 32 &&
2947            "ptrmask should have been narrowed during legalize");
2948 
2949     auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2950       .addReg(SrcReg)
2951       .addReg(MaskReg);
2952 
2953     if (!IsVGPR)
2954       NewOp.setOperandDead(3); // Dead scc
2955     I.eraseFromParent();
2956     return true;
2957   }
2958 
2959   Register HiReg = MRI->createVirtualRegister(&RegRC);
2960   Register LoReg = MRI->createVirtualRegister(&RegRC);
2961 
2962   // Extract the subregisters from the source pointer.
2963   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2964     .addReg(SrcReg, 0, AMDGPU::sub0);
2965   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2966     .addReg(SrcReg, 0, AMDGPU::sub1);
2967 
2968   Register MaskedLo, MaskedHi;
2969 
2970   if (CanCopyLow32) {
2971     // If all the bits in the low half are 1, we only need a copy for it.
2972     MaskedLo = LoReg;
2973   } else {
2974     // Extract the mask subregister and apply the and.
2975     Register MaskLo = MRI->createVirtualRegister(&RegRC);
2976     MaskedLo = MRI->createVirtualRegister(&RegRC);
2977 
2978     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2979       .addReg(MaskReg, 0, AMDGPU::sub0);
2980     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2981       .addReg(LoReg)
2982       .addReg(MaskLo);
2983   }
2984 
2985   if (CanCopyHi32) {
2986     // If all the bits in the high half are 1, we only need a copy for it.
2987     MaskedHi = HiReg;
2988   } else {
2989     Register MaskHi = MRI->createVirtualRegister(&RegRC);
2990     MaskedHi = MRI->createVirtualRegister(&RegRC);
2991 
2992     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2993       .addReg(MaskReg, 0, AMDGPU::sub1);
2994     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2995       .addReg(HiReg)
2996       .addReg(MaskHi);
2997   }
2998 
2999   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3000     .addReg(MaskedLo)
3001     .addImm(AMDGPU::sub0)
3002     .addReg(MaskedHi)
3003     .addImm(AMDGPU::sub1);
3004   I.eraseFromParent();
3005   return true;
3006 }
3007 
3008 /// Return the register to use for the index value, and the subregister to use
3009 /// for the indirectly accessed register.
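     /// For example, with 32-bit elements and IdxReg defined as (%base + 3),
     /// this returns {%base, sub3}, folding the constant part of the index into
     /// the subregister.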
3010 static std::pair<Register, unsigned>
3011 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3012                         const TargetRegisterClass *SuperRC, Register IdxReg,
3013                         unsigned EltSize, GISelKnownBits &KnownBits) {
3014   Register IdxBaseReg;
3015   int Offset;
3016 
3017   std::tie(IdxBaseReg, Offset) =
3018       AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3019   if (IdxBaseReg == AMDGPU::NoRegister) {
3020     // This will happen if the index is a known constant. This should ordinarily
3021     // be legalized out, but handle it as a register just in case.
3022     assert(Offset == 0);
3023     IdxBaseReg = IdxReg;
3024   }
3025 
3026   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3027 
3028   // Skip out of bounds offsets, or else we would end up using an undefined
3029   // register.
3030   if (static_cast<unsigned>(Offset) >= SubRegs.size())
3031     return std::pair(IdxReg, SubRegs[0]);
3032   return std::pair(IdxBaseReg, SubRegs[Offset]);
3033 }
3034 
3035 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3036   MachineInstr &MI) const {
3037   Register DstReg = MI.getOperand(0).getReg();
3038   Register SrcReg = MI.getOperand(1).getReg();
3039   Register IdxReg = MI.getOperand(2).getReg();
3040 
3041   LLT DstTy = MRI->getType(DstReg);
3042   LLT SrcTy = MRI->getType(SrcReg);
3043 
3044   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3045   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3046   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3047 
3048   // The index must be scalar. If it wasn't, RegBankSelect should have moved
3049   // this into a waterfall loop.
3050   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3051     return false;
3052 
3053   const TargetRegisterClass *SrcRC =
3054       TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3055   const TargetRegisterClass *DstRC =
3056       TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3057   if (!SrcRC || !DstRC)
3058     return false;
3059   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3060       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3061       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3062     return false;
3063 
3064   MachineBasicBlock *BB = MI.getParent();
3065   const DebugLoc &DL = MI.getDebugLoc();
3066   const bool Is64 = DstTy.getSizeInBits() == 64;
3067 
3068   unsigned SubReg;
3069   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3070       *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3071 
3072   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3073     if (DstTy.getSizeInBits() != 32 && !Is64)
3074       return false;
3075 
3076     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3077       .addReg(IdxReg);
3078 
3079     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3080     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3081       .addReg(SrcReg, 0, SubReg)
3082       .addReg(SrcReg, RegState::Implicit);
3083     MI.eraseFromParent();
3084     return true;
3085   }
3086 
3087   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3088     return false;
3089 
3090   if (!STI.useVGPRIndexMode()) {
3091     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3092       .addReg(IdxReg);
3093     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3094       .addReg(SrcReg, 0, SubReg)
3095       .addReg(SrcReg, RegState::Implicit);
3096     MI.eraseFromParent();
3097     return true;
3098   }
3099 
3100   const MCInstrDesc &GPRIDXDesc =
3101       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3102   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3103       .addReg(SrcReg)
3104       .addReg(IdxReg)
3105       .addImm(SubReg);
3106 
3107   MI.eraseFromParent();
3108   return true;
3109 }
3110 
3111 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3112 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3113   MachineInstr &MI) const {
3114   Register DstReg = MI.getOperand(0).getReg();
3115   Register VecReg = MI.getOperand(1).getReg();
3116   Register ValReg = MI.getOperand(2).getReg();
3117   Register IdxReg = MI.getOperand(3).getReg();
3118 
3119   LLT VecTy = MRI->getType(DstReg);
3120   LLT ValTy = MRI->getType(ValReg);
3121   unsigned VecSize = VecTy.getSizeInBits();
3122   unsigned ValSize = ValTy.getSizeInBits();
3123 
3124   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3125   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3126   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3127 
3128   assert(VecTy.getElementType() == ValTy);
3129 
3130   // The index must be scalar. If it wasn't, RegBankSelect should have moved
3131   // this into a waterfall loop.
3132   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3133     return false;
3134 
3135   const TargetRegisterClass *VecRC =
3136       TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3137   const TargetRegisterClass *ValRC =
3138       TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3139 
3140   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3141       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3142       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3143       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3144     return false;
3145 
3146   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3147     return false;
3148 
3149   unsigned SubReg;
3150   std::tie(IdxReg, SubReg) =
3151       computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3152 
3153   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3154                          STI.useVGPRIndexMode();
3155 
3156   MachineBasicBlock *BB = MI.getParent();
3157   const DebugLoc &DL = MI.getDebugLoc();
3158 
3159   if (!IndexMode) {
3160     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3161       .addReg(IdxReg);
3162 
3163     const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3164         VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3165     BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3166         .addReg(VecReg)
3167         .addReg(ValReg)
3168         .addImm(SubReg);
3169     MI.eraseFromParent();
3170     return true;
3171   }
3172 
3173   const MCInstrDesc &GPRIDXDesc =
3174       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3175   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3176       .addReg(VecReg)
3177       .addReg(ValReg)
3178       .addReg(IdxReg)
3179       .addImm(SubReg);
3180 
3181   MI.eraseFromParent();
3182   return true;
3183 }
3184 
3185 bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3186   assert(!AMDGPU::isGFX12Plus(STI));
3187   unsigned Opc;
3188   unsigned Size = MI.getOperand(3).getImm();
3189 
3190   // The struct intrinsic variants add one additional operand over raw.
3191   const bool HasVIndex = MI.getNumOperands() == 9;
3192   Register VIndex;
3193   int OpOffset = 0;
3194   if (HasVIndex) {
3195     VIndex = MI.getOperand(4).getReg();
3196     OpOffset = 1;
3197   }
3198 
3199   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3200   std::optional<ValueAndVReg> MaybeVOffset =
3201       getIConstantVRegValWithLookThrough(VOffset, *MRI);
3202   const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3203 
3204   switch (Size) {
3205   default:
3206     return false;
3207   case 1:
3208     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3209                                  : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3210                     : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3211                                  : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3212     break;
3213   case 2:
3214     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3215                                  : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3216                     : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3217                                  : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3218     break;
3219   case 4:
3220     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3221                                  : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3222                     : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3223                                  : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3224     break;
3225   }
3226 
3227   MachineBasicBlock *MBB = MI.getParent();
3228   const DebugLoc &DL = MI.getDebugLoc();
3229   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3230     .add(MI.getOperand(2));
3231 
3232   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3233 
3234   if (HasVIndex && HasVOffset) {
3235     Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3236     BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3237       .addReg(VIndex)
3238       .addImm(AMDGPU::sub0)
3239       .addReg(VOffset)
3240       .addImm(AMDGPU::sub1);
3241 
3242     MIB.addReg(IdxReg);
3243   } else if (HasVIndex) {
3244     MIB.addReg(VIndex);
3245   } else if (HasVOffset) {
3246     MIB.addReg(VOffset);
3247   }
3248 
3249   MIB.add(MI.getOperand(1));            // rsrc
3250   MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3251   MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3252   unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3253   MIB.addImm(Aux & AMDGPU::CPol::ALL);                  // cpol
3254   MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3255 
3256   MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3257   MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3258   LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3259   MachinePointerInfo StorePtrI = LoadPtrI;
3260   StorePtrI.V = nullptr;
3261   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3262 
3263   auto F = LoadMMO->getFlags() &
3264            ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3265   LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3266                                      Size, LoadMMO->getBaseAlign());
3267 
3268   MachineMemOperand *StoreMMO =
3269       MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3270                                sizeof(int32_t), LoadMMO->getBaseAlign());
3271 
3272   MIB.setMemRefs({LoadMMO, StoreMMO});
3273 
3274   MI.eraseFromParent();
3275   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3276 }
3277 
3278 /// Match a zero extend from a 32-bit value to 64 bits.
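     /// Returns the 32-bit source register, e.g. %x when Reg is defined by
     ///   %zext:_(s64) = G_ZEXT %x:_(s32)
     /// or by the legalized form G_MERGE_VALUES (s32 %x), (s32 0), and an
     /// invalid Register otherwise.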
3279 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3280   Register ZExtSrc;
3281   if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3282     return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3283 
3284   // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3285   const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3286   if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3287     return Register();
3288 
3289   assert(Def->getNumOperands() == 3 &&
3290          MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3291   if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3292     return Def->getOperand(1).getReg();
3293   }
3294 
3295   return Register();
3296 }
3297 
3298 bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3299   unsigned Opc;
3300   unsigned Size = MI.getOperand(3).getImm();
3301 
3302   switch (Size) {
3303   default:
3304     return false;
3305   case 1:
3306     Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3307     break;
3308   case 2:
3309     Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3310     break;
3311   case 4:
3312     Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3313     break;
3314   }
3315 
3316   MachineBasicBlock *MBB = MI.getParent();
3317   const DebugLoc &DL = MI.getDebugLoc();
3318   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3319     .add(MI.getOperand(2));
3320 
3321   Register Addr = MI.getOperand(1).getReg();
3322   Register VOffset;
3323   // Try to split SAddr and VOffset. Global and LDS pointers share the same
3324   // immediate offset, so we cannot use a regular SelectGlobalSAddr().
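       // For example, if %addr = G_PTR_ADD %sgpr_base, (G_ZEXT %voff:_(s32)),
       // we use Addr = %sgpr_base and VOffset = %voff so the SADDR form of the
       // load can be selected below.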
3325   if (!isSGPR(Addr)) {
3326     auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3327     if (isSGPR(AddrDef->Reg)) {
3328       Addr = AddrDef->Reg;
3329     } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3330       Register SAddr =
3331           getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3332       if (isSGPR(SAddr)) {
3333         Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3334         if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3335           Addr = SAddr;
3336           VOffset = Off;
3337         }
3338       }
3339     }
3340   }
3341 
3342   if (isSGPR(Addr)) {
3343     Opc = AMDGPU::getGlobalSaddrOp(Opc);
3344     if (!VOffset) {
3345       VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3346       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3347         .addImm(0);
3348     }
3349   }
3350 
3351   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3352     .addReg(Addr);
3353 
3354   if (isSGPR(Addr))
3355     MIB.addReg(VOffset);
3356 
3357   MIB.add(MI.getOperand(4))  // offset
3358      .add(MI.getOperand(5)); // cpol
3359 
3360   MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3361   MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3362   LoadPtrI.Offset = MI.getOperand(4).getImm();
3363   MachinePointerInfo StorePtrI = LoadPtrI;
3364   LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3365   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3366   auto F = LoadMMO->getFlags() &
3367            ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3368   LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3369                                      Size, LoadMMO->getBaseAlign());
3370   MachineMemOperand *StoreMMO =
3371       MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3372                                sizeof(int32_t), Align(4));
3373 
3374   MIB.setMemRefs({LoadMMO, StoreMMO});
3375 
3376   MI.eraseFromParent();
3377   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3378 }
3379 
3380 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3381   MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3382   MI.removeOperand(1);
3383   MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3384   return true;
3385 }
3386 
3387 bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3388   unsigned Opc;
3389   switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3390   case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3391     Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3392     break;
3393   case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3394     Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3395     break;
3396   case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3397     Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3398     break;
3399   case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3400     Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3401     break;
3402   case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3403     Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3404     break;
3405   case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3406     Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3407     break;
3408   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3409     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3410     break;
3411   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3412     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3413     break;
3414   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3415     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3416     break;
3417   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3418     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3419     break;
3420   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3421     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3422     break;
3423   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3424     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3425     break;
3426   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3427     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3428     break;
3429   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3430     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3431     break;
3432   default:
3433     llvm_unreachable("unhandled smfmac intrinsic");
3434   }
3435 
3436   auto VDst_In = MI.getOperand(4);
3437 
3438   MI.setDesc(TII.get(Opc));
3439   MI.removeOperand(4); // VDst_In
3440   MI.removeOperand(1); // Intrinsic ID
3441   MI.addOperand(VDst_In); // Readd VDst_In to the end
3442   MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3443   return true;
3444 }
3445 
3446 bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3447   Register DstReg = MI.getOperand(0).getReg();
3448   Register SrcReg = MI.getOperand(1).getReg();
3449   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3450   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3451   MachineBasicBlock *MBB = MI.getParent();
3452   const DebugLoc &DL = MI.getDebugLoc();
3453 
3454   if (IsVALU) {
3455     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3456       .addImm(Subtarget->getWavefrontSizeLog2())
3457       .addReg(SrcReg);
3458   } else {
3459     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3460       .addReg(SrcReg)
3461       .addImm(Subtarget->getWavefrontSizeLog2())
3462       .setOperandDead(3); // Dead scc
3463   }
3464 
3465   const TargetRegisterClass &RC =
3466       IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3467   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3468     return false;
3469 
3470   MI.eraseFromParent();
3471   return true;
3472 }
3473 
3474 bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3475   Register SrcReg = MI.getOperand(0).getReg();
3476   if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3477     return false;
3478 
3479   MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3480   Register SP =
3481       Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3482   Register WaveAddr = getWaveAddress(DefMI);
3483   MachineBasicBlock *MBB = MI.getParent();
3484   const DebugLoc &DL = MI.getDebugLoc();
3485 
3486   if (!WaveAddr) {
3487     WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3488     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3489       .addReg(SrcReg)
3490       .addImm(Subtarget->getWavefrontSizeLog2())
3491       .setOperandDead(3); // Dead scc
3492   }
3493 
3494   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3495     .addReg(WaveAddr);
3496 
3497   MI.eraseFromParent();
3498   return true;
3499 }
3500 
3501 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3502 
3503   if (!I.isPreISelOpcode()) {
3504     if (I.isCopy())
3505       return selectCOPY(I);
3506     return true;
3507   }
3508 
3509   switch (I.getOpcode()) {
3510   case TargetOpcode::G_AND:
3511   case TargetOpcode::G_OR:
3512   case TargetOpcode::G_XOR:
3513     if (selectImpl(I, *CoverageInfo))
3514       return true;
3515     return selectG_AND_OR_XOR(I);
3516   case TargetOpcode::G_ADD:
3517   case TargetOpcode::G_SUB:
3518   case TargetOpcode::G_PTR_ADD:
3519     if (selectImpl(I, *CoverageInfo))
3520       return true;
3521     return selectG_ADD_SUB(I);
3522   case TargetOpcode::G_UADDO:
3523   case TargetOpcode::G_USUBO:
3524   case TargetOpcode::G_UADDE:
3525   case TargetOpcode::G_USUBE:
3526     return selectG_UADDO_USUBO_UADDE_USUBE(I);
3527   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3528   case AMDGPU::G_AMDGPU_MAD_I64_I32:
3529     return selectG_AMDGPU_MAD_64_32(I);
3530   case TargetOpcode::G_INTTOPTR:
3531   case TargetOpcode::G_BITCAST:
3532   case TargetOpcode::G_PTRTOINT:
3533   case TargetOpcode::G_FREEZE:
3534     return selectCOPY(I);
3535   case TargetOpcode::G_CONSTANT:
3536   case TargetOpcode::G_FCONSTANT:
3537     return selectG_CONSTANT(I);
3538   case TargetOpcode::G_FNEG:
3539     if (selectImpl(I, *CoverageInfo))
3540       return true;
3541     return selectG_FNEG(I);
3542   case TargetOpcode::G_FABS:
3543     if (selectImpl(I, *CoverageInfo))
3544       return true;
3545     return selectG_FABS(I);
3546   case TargetOpcode::G_EXTRACT:
3547     return selectG_EXTRACT(I);
3548   case TargetOpcode::G_MERGE_VALUES:
3549   case TargetOpcode::G_CONCAT_VECTORS:
3550     return selectG_MERGE_VALUES(I);
3551   case TargetOpcode::G_UNMERGE_VALUES:
3552     return selectG_UNMERGE_VALUES(I);
3553   case TargetOpcode::G_BUILD_VECTOR:
3554   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3555     return selectG_BUILD_VECTOR(I);
3556   case TargetOpcode::G_IMPLICIT_DEF:
3557     return selectG_IMPLICIT_DEF(I);
3558   case TargetOpcode::G_INSERT:
3559     return selectG_INSERT(I);
3560   case TargetOpcode::G_INTRINSIC:
3561   case TargetOpcode::G_INTRINSIC_CONVERGENT:
3562     return selectG_INTRINSIC(I);
3563   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3564   case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3565     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3566   case TargetOpcode::G_ICMP:
3567   case TargetOpcode::G_FCMP:
3568     if (selectG_ICMP_or_FCMP(I))
3569       return true;
3570     return selectImpl(I, *CoverageInfo);
3571   case TargetOpcode::G_LOAD:
3572   case TargetOpcode::G_STORE:
3573   case TargetOpcode::G_ATOMIC_CMPXCHG:
3574   case TargetOpcode::G_ATOMICRMW_XCHG:
3575   case TargetOpcode::G_ATOMICRMW_ADD:
3576   case TargetOpcode::G_ATOMICRMW_SUB:
3577   case TargetOpcode::G_ATOMICRMW_AND:
3578   case TargetOpcode::G_ATOMICRMW_OR:
3579   case TargetOpcode::G_ATOMICRMW_XOR:
3580   case TargetOpcode::G_ATOMICRMW_MIN:
3581   case TargetOpcode::G_ATOMICRMW_MAX:
3582   case TargetOpcode::G_ATOMICRMW_UMIN:
3583   case TargetOpcode::G_ATOMICRMW_UMAX:
3584   case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3585   case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3586   case TargetOpcode::G_ATOMICRMW_FADD:
3587   case TargetOpcode::G_ATOMICRMW_FMIN:
3588   case TargetOpcode::G_ATOMICRMW_FMAX:
3589     return selectG_LOAD_STORE_ATOMICRMW(I);
3590   case TargetOpcode::G_SELECT:
3591     return selectG_SELECT(I);
3592   case TargetOpcode::G_TRUNC:
3593     return selectG_TRUNC(I);
3594   case TargetOpcode::G_SEXT:
3595   case TargetOpcode::G_ZEXT:
3596   case TargetOpcode::G_ANYEXT:
3597   case TargetOpcode::G_SEXT_INREG:
3598     // This is a workaround. For extension from type i1, `selectImpl()` uses
3599     // patterns from the TD file and generates an illegal VGPR to SGPR COPY,
3600     // as type i1 can only be held in an SGPR class.
3601     if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3602         selectImpl(I, *CoverageInfo))
3603       return true;
3604     return selectG_SZA_EXT(I);
3605   case TargetOpcode::G_FPEXT:
3606     if (selectG_FPEXT(I))
3607       return true;
3608     return selectImpl(I, *CoverageInfo);
3609   case TargetOpcode::G_BRCOND:
3610     return selectG_BRCOND(I);
3611   case TargetOpcode::G_GLOBAL_VALUE:
3612     return selectG_GLOBAL_VALUE(I);
3613   case TargetOpcode::G_PTRMASK:
3614     return selectG_PTRMASK(I);
3615   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3616     return selectG_EXTRACT_VECTOR_ELT(I);
3617   case TargetOpcode::G_INSERT_VECTOR_ELT:
3618     return selectG_INSERT_VECTOR_ELT(I);
3619   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3620   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3621   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3622   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3623   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3624     const AMDGPU::ImageDimIntrinsicInfo *Intr =
3625         AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
3626     assert(Intr && "not an image intrinsic with image pseudo");
3627     return selectImageIntrinsic(I, Intr);
3628   }
3629   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3630     return selectBVHIntrinsic(I);
3631   case AMDGPU::G_SBFX:
3632   case AMDGPU::G_UBFX:
3633     return selectG_SBFX_UBFX(I);
3634   case AMDGPU::G_SI_CALL:
3635     I.setDesc(TII.get(AMDGPU::SI_CALL));
3636     return true;
3637   case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3638     return selectWaveAddress(I);
3639   case AMDGPU::G_STACKRESTORE:
3640     return selectStackRestore(I);
3641   case AMDGPU::G_PHI:
3642     return selectPHI(I);
3643   default:
3644     return selectImpl(I, *CoverageInfo);
3645   }
3646   return false;
3647 }
3648 
3649 InstructionSelector::ComplexRendererFns
3650 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3651   return {{
3652       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3653   }};
3654 
3655 }
3656 
3657 std::pair<Register, unsigned>
3658 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3659                                               bool IsCanonicalizing,
3660                                               bool AllowAbs, bool OpSel) const {
3661   Register Src = Root.getReg();
3662   unsigned Mods = 0;
3663   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3664 
3665   if (MI->getOpcode() == AMDGPU::G_FNEG) {
3666     Src = MI->getOperand(1).getReg();
3667     Mods |= SISrcMods::NEG;
3668     MI = getDefIgnoringCopies(Src, *MRI);
3669   } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3670     // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3671     // denormal mode, but we're implicitly canonicalizing in a source operand.
3672     const ConstantFP *LHS =
3673         getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3674     if (LHS && LHS->isZero()) {
3675       Mods |= SISrcMods::NEG;
3676       Src = MI->getOperand(2).getReg();
3677     }
3678   }
3679 
3680   if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3681     Src = MI->getOperand(1).getReg();
3682     Mods |= SISrcMods::ABS;
3683   }
3684 
3685   if (OpSel)
3686     Mods |= SISrcMods::OP_SEL_0;
3687 
3688   return std::pair(Src, Mods);
3689 }
3690 
3691 Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3692     Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3693     bool ForceVGPR) const {
3694   if ((Mods != 0 || ForceVGPR) &&
3695       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3696 
3697     // If we looked through copies to find source modifiers on an SGPR operand,
3698     // we now have an SGPR register source. To avoid potentially violating the
3699     // constant bus restriction, we need to insert a copy to a VGPR.
3700     Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3701     BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3702             TII.get(AMDGPU::COPY), VGPRSrc)
3703         .addReg(Src);
3704     Src = VGPRSrc;
3705   }
3706 
3707   return Src;
3708 }
3709 
3710 ///
3711 /// This will select either an SGPR or VGPR operand and will save us from
3712 /// having to write an extra tablegen pattern.
3713 InstructionSelector::ComplexRendererFns
3714 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3715   return {{
3716       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3717   }};
3718 }
3719 
3720 InstructionSelector::ComplexRendererFns
3721 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3722   Register Src;
3723   unsigned Mods;
3724   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3725 
3726   return {{
3727       [=](MachineInstrBuilder &MIB) {
3728         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3729       },
3730       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3731       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3732       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3733   }};
3734 }
3735 
3736 InstructionSelector::ComplexRendererFns
3737 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3738   Register Src;
3739   unsigned Mods;
3740   std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3741                                            /*IsCanonicalizing=*/true,
3742                                            /*AllowAbs=*/false);
3743 
3744   return {{
3745       [=](MachineInstrBuilder &MIB) {
3746         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3747       },
3748       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3749       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3750       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3751   }};
3752 }
3753 
3754 InstructionSelector::ComplexRendererFns
3755 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3756   return {{
3757       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3758       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3759       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
3760   }};
3761 }
3762 
3763 InstructionSelector::ComplexRendererFns
3764 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3765   Register Src;
3766   unsigned Mods;
3767   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3768 
3769   return {{
3770       [=](MachineInstrBuilder &MIB) {
3771         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3772       },
3773       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3774   }};
3775 }
3776 
3777 InstructionSelector::ComplexRendererFns
3778 AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3779     MachineOperand &Root) const {
3780   Register Src;
3781   unsigned Mods;
3782   std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3783 
3784   return {{
3785       [=](MachineInstrBuilder &MIB) {
3786         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3787       },
3788       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3789   }};
3790 }
3791 
3792 InstructionSelector::ComplexRendererFns
3793 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3794   Register Src;
3795   unsigned Mods;
3796   std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3797                                            /*AllowAbs=*/false);
3798 
3799   return {{
3800       [=](MachineInstrBuilder &MIB) {
3801         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3802       },
3803       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3804   }};
3805 }
3806 
3807 InstructionSelector::ComplexRendererFns
3808 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3809   Register Reg = Root.getReg();
3810   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3811   if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3812     return {};
3813   return {{
3814       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3815   }};
3816 }
3817 
3818 std::pair<Register, unsigned>
3819 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3820   Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3821   unsigned Mods = 0;
3822   MachineInstr *MI = MRI.getVRegDef(Src);
3823 
3824   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3825       // It's possible to see an f32 fneg here, but unlikely.
3826       // TODO: Treat f32 fneg as only high bit.
3827       MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3828     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3829     Src = MI->getOperand(1).getReg();
3830     MI = MRI.getVRegDef(Src);
3831   }
3832 
3833   // TODO: Handle G_FSUB 0 as fneg
3834 
3835   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3836   (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3837 
3838   // Packed instructions do not have abs modifiers.
3839   Mods |= SISrcMods::OP_SEL_1;
3840 
3841   return std::pair(Src, Mods);
3842 }
3843 
3844 InstructionSelector::ComplexRendererFns
3845 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3846   MachineRegisterInfo &MRI
3847     = Root.getParent()->getParent()->getParent()->getRegInfo();
3848 
3849   Register Src;
3850   unsigned Mods;
3851   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3852 
3853   return {{
3854       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3855       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3856   }};
3857 }
3858 
3859 InstructionSelector::ComplexRendererFns
3860 AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3861   MachineRegisterInfo &MRI
3862     = Root.getParent()->getParent()->getParent()->getRegInfo();
3863 
3864   Register Src;
3865   unsigned Mods;
3866   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3867 
3868   return {{
3869       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3870       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3871   }};
3872 }
3873 
3874 InstructionSelector::ComplexRendererFns
3875 AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
3876   // A literal i1 value set in the intrinsic represents SrcMods for the next
3877   // operand, stored in the Imm operand as an i1 sign-extended to int64_t.
3878   // 1 (i.e. -1) promotes packed values to signed, 0 treats them as unsigned.
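       // For example, an immediate of -1 yields OP_SEL_1 | NEG, while 0 yields
       // just OP_SEL_1.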
3879   assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3880          "expected i1 value");
3881   unsigned Mods = SISrcMods::OP_SEL_1;
3882   if (Root.getImm() == -1)
3883     Mods ^= SISrcMods::NEG;
3884   return {{
3885       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3886   }};
3887 }
3888 
3889 InstructionSelector::ComplexRendererFns
3890 AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3891     MachineOperand &Root) const {
3892   assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3893          "expected i1 value");
3894   unsigned Mods = SISrcMods::OP_SEL_1;
3895   if (Root.getImm() != 0)
3896     Mods |= SISrcMods::OP_SEL_0;
3897 
3898   return {{
3899       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3900   }};
3901 }
3902 
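     // Build a REG_SEQUENCE covering the 32-bit registers in Elts (2, 4 or 8 of
     // them). For example, four elements produce:
     //   %seq:vreg_128 = REG_SEQUENCE %e0, %subreg.sub0, %e1, %subreg.sub1,
     //                                %e2, %subreg.sub2, %e3, %subreg.sub3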
3903 static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
3904                                  MachineInstr *InsertPt,
3905                                  MachineRegisterInfo &MRI) {
3906   const TargetRegisterClass *DstRegClass;
3907   switch (Elts.size()) {
3908   case 8:
3909     DstRegClass = &AMDGPU::VReg_256RegClass;
3910     break;
3911   case 4:
3912     DstRegClass = &AMDGPU::VReg_128RegClass;
3913     break;
3914   case 2:
3915     DstRegClass = &AMDGPU::VReg_64RegClass;
3916     break;
3917   default:
3918     llvm_unreachable("unhandled Reg sequence size");
3919   }
3920 
3921   MachineIRBuilder B(*InsertPt);
3922   auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
3923                  .addDef(MRI.createVirtualRegister(DstRegClass));
3924   for (unsigned i = 0; i < Elts.size(); ++i) {
3925     MIB.addReg(Elts[i]);
3926     MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
3927   }
3928   return MIB->getOperand(0).getReg();
3929 }
3930 
3931 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3932                                  SmallVectorImpl<Register> &Elts, Register &Src,
3933                                  MachineInstr *InsertPt,
3934                                  MachineRegisterInfo &MRI) {
3935   if (ModOpcode == TargetOpcode::G_FNEG) {
3936     Mods |= SISrcMods::NEG;
3937     // Check if all elements also have abs modifier
3938     SmallVector<Register, 8> NegAbsElts;
3939     for (auto El : Elts) {
3940       Register FabsSrc;
3941       if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
3942         break;
3943       NegAbsElts.push_back(FabsSrc);
3944     }
3945     if (Elts.size() != NegAbsElts.size()) {
3946       // Neg
3947       Src = buildRegSequence(Elts, InsertPt, MRI);
3948     } else {
3949       // Neg and Abs
3950       Mods |= SISrcMods::NEG_HI;
3951       Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
3952     }
3953   } else {
3954     assert(ModOpcode == TargetOpcode::G_FABS);
3955     // Abs
3956     Mods |= SISrcMods::NEG_HI;
3957     Src = buildRegSequence(Elts, InsertPt, MRI);
3958   }
3959 }
3960 
3961 InstructionSelector::ComplexRendererFns
3962 AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
3963   Register Src = Root.getReg();
3964   unsigned Mods = SISrcMods::OP_SEL_1;
3965   SmallVector<Register, 8> EltsF32;
3966 
3967   if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
3968     assert(BV->getNumSources() > 0);
3969     // Based on the first element, decide which mod we match: neg or abs.
3970     MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
3971     unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
3972                              ? AMDGPU::G_FNEG
3973                              : AMDGPU::G_FABS;
3974     for (unsigned i = 0; i < BV->getNumSources(); ++i) {
3975       ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
3976       if (ElF32->getOpcode() != ModOpcode)
3977         break;
3978       EltsF32.push_back(ElF32->getOperand(1).getReg());
3979     }
3980 
3981     // All elements had ModOpcode modifier
3982     if (BV->getNumSources() == EltsF32.size()) {
3983       selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
3984                            *MRI);
3985     }
3986   }
3987 
3988   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3989            [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
3990 }
3991 
3992 InstructionSelector::ComplexRendererFns
3993 AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
3994   Register Src = Root.getReg();
3995   unsigned Mods = SISrcMods::OP_SEL_1;
3996   SmallVector<Register, 8> EltsV2F16;
3997 
3998   if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
3999     for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4000       Register FNegSrc;
4001       if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4002         break;
4003       EltsV2F16.push_back(FNegSrc);
4004     }
4005 
4006     // All elements had the fneg modifier.
4007     if (CV->getNumSources() == EltsV2F16.size()) {
4008       Mods |= SISrcMods::NEG;
4009       Mods |= SISrcMods::NEG_HI;
4010       Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4011     }
4012   }
4013 
4014   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4015            [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4016 }
4017 
4018 InstructionSelector::ComplexRendererFns
4019 AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4020   Register Src = Root.getReg();
4021   unsigned Mods = SISrcMods::OP_SEL_1;
4022   SmallVector<Register, 8> EltsV2F16;
4023 
4024   if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4025     assert(CV->getNumSources() > 0);
4026     MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4027     // Based on the first element, decide which mod we match: neg or abs.
4028     unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4029                              ? AMDGPU::G_FNEG
4030                              : AMDGPU::G_FABS;
4031 
4032     for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4033       ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4034       if (ElV2F16->getOpcode() != ModOpcode)
4035         break;
4036       EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4037     }
4038 
4039     // All elements had ModOpcode modifier
4040     if (CV->getNumSources() == EltsV2F16.size()) {
4041       MachineIRBuilder B(*Root.getParent());
4042       selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4043                            *MRI);
4044     }
4045   }
4046 
4047   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4048            [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4049 }
4050 
4051 InstructionSelector::ComplexRendererFns
4052 AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4053   std::optional<FPValueAndVReg> FPValReg;
4054   if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4055     if (TII.isInlineConstant(FPValReg->Value)) {
4056       return {{[=](MachineInstrBuilder &MIB) {
4057         MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4058       }}};
4059     }
4060     // Non-inlineable splat floats should not fall through to the integer
4061     // immediate checks.
4062     return {};
4063   }
4064 
4065   APInt ICst;
4066   if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4067     if (TII.isInlineConstant(ICst)) {
4068       return {
4069           {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4070     }
4071   }
4072 
4073   return {};
4074 }
4075 
4076 InstructionSelector::ComplexRendererFns
4077 AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4078   Register Src =
4079       getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4080   unsigned Key = 0;
4081 
4082   Register ShiftSrc;
4083   std::optional<ValueAndVReg> ShiftAmt;
4084   if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4085       MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4086       ShiftAmt->Value.getZExtValue() % 8 == 0) {
4087     Key = ShiftAmt->Value.getZExtValue() / 8;
4088     Src = ShiftSrc;
4089   }
4090 
4091   return {{
4092       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4093       [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4094   }};
4095 }
4096 
4097 InstructionSelector::ComplexRendererFns
4098 AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4099 
4100   Register Src =
4101       getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4102   unsigned Key = 0;
4103 
4104   Register ShiftSrc;
4105   std::optional<ValueAndVReg> ShiftAmt;
4106   if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4107       MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4108       ShiftAmt->Value.getZExtValue() == 16) {
4109     Src = ShiftSrc;
4110     Key = 1;
4111   }
4112 
4113   return {{
4114       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4115       [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4116   }};
4117 }
4118 
4119 InstructionSelector::ComplexRendererFns
4120 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4121   Register Src;
4122   unsigned Mods;
4123   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4124 
4125   // FIXME: Handle op_sel
4126   return {{
4127       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4128       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4129   }};
4130 }
4131 
4132 InstructionSelector::ComplexRendererFns
4133 AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4134   Register Src;
4135   unsigned Mods;
4136   std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4137                                            /*IsCanonicalizing=*/true,
4138                                            /*AllowAbs=*/false,
4139                                            /*OpSel=*/false);
4140 
4141   return {{
4142       [=](MachineInstrBuilder &MIB) {
4143         MIB.addReg(
4144             copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4145       },
4146       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4147   }};
4148 }
4149 
4150 InstructionSelector::ComplexRendererFns
4151 AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4152   Register Src;
4153   unsigned Mods;
4154   std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4155                                            /*IsCanonicalizing=*/true,
4156                                            /*AllowAbs=*/false,
4157                                            /*OpSel=*/true);
4158 
4159   return {{
4160       [=](MachineInstrBuilder &MIB) {
4161         MIB.addReg(
4162             copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4163       },
4164       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4165   }};
4166 }
4167 
4168 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4169                                                  Register &Base,
4170                                                  Register *SOffset,
4171                                                  int64_t *Offset) const {
4172   MachineInstr *MI = Root.getParent();
4173   MachineBasicBlock *MBB = MI->getParent();
4174 
4175   // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits,
4176   // then we can select all ptr + 32-bit offsets.
4177   SmallVector<GEPInfo, 4> AddrInfo;
4178   getAddrModeInfo(*MI, *MRI, AddrInfo);
4179 
4180   if (AddrInfo.empty())
4181     return false;
4182 
4183   const GEPInfo &GEPI = AddrInfo[0];
4184   std::optional<int64_t> EncodedImm;
4185 
4186   if (SOffset && Offset) {
4187     EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4188                                               /*HasSOffset=*/true);
4189     if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4190         AddrInfo.size() > 1) {
4191       const GEPInfo &GEPI2 = AddrInfo[1];
4192       if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4193         if (Register OffsetReg =
4194                 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4195           Base = GEPI2.SgprParts[0];
4196           *SOffset = OffsetReg;
4197           *Offset = *EncodedImm;
4198           if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4199             return true;
4200 
4201           // For unbuffered smem loads, it is illegal for the Immediate Offset
4202           // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4203           // is negative. Handle the case where the Immediate Offset + SOffset
4204           // is negative.
4205           auto SKnown = KB->getKnownBits(*SOffset);
4206           if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4207             return false;
4208 
4209           return true;
4210         }
4211       }
4212     }
4213     return false;
4214   }
4215 
4216   EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4217                                             /*HasSOffset=*/false);
4218   if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4219     Base = GEPI.SgprParts[0];
4220     *Offset = *EncodedImm;
4221     return true;
4222   }
4223 
4224   // SGPR offset is unsigned.
4225   if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4226       GEPI.Imm != 0) {
4227     // If we make it this far, we have a load with a 32-bit immediate offset.
4228     // It is OK to select this using an SGPR offset, because we have already
4229     // failed trying to select this load into one of the _IMM variants since
4230     // the _IMM patterns are considered before the _SGPR patterns.
4231     Base = GEPI.SgprParts[0];
4232     *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4233     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4234         .addImm(GEPI.Imm);
4235     return true;
4236   }
4237 
4238   if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4239     if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4240       Base = GEPI.SgprParts[0];
4241       *SOffset = OffsetReg;
4242       return true;
4243     }
4244   }
4245 
4246   return false;
4247 }
4248 
4249 InstructionSelector::ComplexRendererFns
4250 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4251   Register Base;
4252   int64_t Offset;
4253   if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4254     return std::nullopt;
4255 
4256   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4257            [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4258 }
4259 
4260 InstructionSelector::ComplexRendererFns
4261 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4262   SmallVector<GEPInfo, 4> AddrInfo;
4263   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4264 
4265   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4266     return std::nullopt;
4267 
4268   const GEPInfo &GEPInfo = AddrInfo[0];
4269   Register PtrReg = GEPInfo.SgprParts[0];
4270   std::optional<int64_t> EncodedImm =
4271       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4272   if (!EncodedImm)
4273     return std::nullopt;
4274 
4275   return {{
4276     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4277     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4278   }};
4279 }
4280 
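// Select the SMRD _SGPR form: an SGPR base plus a 32-bit SGPR offset register.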
4281 InstructionSelector::ComplexRendererFns
4282 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4283   Register Base, SOffset;
4284   if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4285     return std::nullopt;
4286 
4287   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4288            [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4289 }
4290 
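// Select the SMRD _SGPR_IMM form: an SGPR base, an SGPR offset register, and
// an immediate offset.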
4291 InstructionSelector::ComplexRendererFns
4292 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4293   Register Base, SOffset;
4294   int64_t Offset;
4295   if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4296     return std::nullopt;
4297 
4298   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4299            [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4300            [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4301 }
4302 
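// Split the root address into a base pointer and a constant offset that is
// legal for the given FLAT instruction variant. Falls back to (Root, 0) if no
// offset can be folded.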
4303 std::pair<Register, int>
4304 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4305                                                 uint64_t FlatVariant) const {
4306   MachineInstr *MI = Root.getParent();
4307 
4308   auto Default = std::pair(Root.getReg(), 0);
4309 
4310   if (!STI.hasFlatInstOffsets())
4311     return Default;
4312 
4313   Register PtrBase;
4314   int64_t ConstOffset;
4315   std::tie(PtrBase, ConstOffset) =
4316       getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4317 
4318   if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4319                            !isFlatScratchBaseLegal(Root.getReg())))
4320     return Default;
4321 
4322   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4323   if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4324     return Default;
4325 
4326   return std::pair(PtrBase, ConstOffset);
4327 }
4328 
4329 InstructionSelector::ComplexRendererFns
4330 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4331   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4332 
4333   return {{
4334       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4335       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4336     }};
4337 }
4338 
4339 InstructionSelector::ComplexRendererFns
4340 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4341   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4342 
4343   return {{
4344       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4345       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4346   }};
4347 }
4348 
4349 InstructionSelector::ComplexRendererFns
4350 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4351   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4352 
4353   return {{
4354       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4355       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4356     }};
4357 }
4358 
4359 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4360 InstructionSelector::ComplexRendererFns
4361 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4362   Register Addr = Root.getReg();
4363   Register PtrBase;
4364   int64_t ConstOffset;
4365   int64_t ImmOffset = 0;
4366 
4367   // Match the immediate offset first, which canonically is moved as low as
4368   // possible.
4369   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4370 
4371   if (ConstOffset != 0) {
4372     if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4373                               SIInstrFlags::FlatGlobal)) {
4374       Addr = PtrBase;
4375       ImmOffset = ConstOffset;
4376     } else {
4377       auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4378       if (isSGPR(PtrBaseDef->Reg)) {
4379         if (ConstOffset > 0) {
4380           // Offset is too large.
4381           //
4382           // saddr + large_offset -> saddr +
4383           //                         (voffset = large_offset & ~MaxOffset) +
4384           //                         (large_offset & MaxOffset);
4385           int64_t SplitImmOffset, RemainderOffset;
4386           std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4387               ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4388 
4389           if (isUInt<32>(RemainderOffset)) {
4390             MachineInstr *MI = Root.getParent();
4391             MachineBasicBlock *MBB = MI->getParent();
4392             Register HighBits =
4393                 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4394 
4395             BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4396                     HighBits)
4397                 .addImm(RemainderOffset);
4398 
4399             return {{
4400                 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4401                 [=](MachineInstrBuilder &MIB) {
4402                   MIB.addReg(HighBits);
4403                 }, // voffset
4404                 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4405             }};
4406           }
4407         }
4408 
4409         // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4410         // is 1, we would need to perform 1 or 2 extra moves for each half of
4411         // the constant, so it is better to do a scalar add and then issue a
4412         // single VALU instruction to materialize zero. Otherwise it takes fewer
4413         // instructions to perform VALU adds with immediates or inline literals.
4414         unsigned NumLiterals =
4415             !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4416             !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4417         if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4418           return std::nullopt;
4419       }
4420     }
4421   }
4422 
4423   // Match the variable offset.
4424   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4425   if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4426     // Look through the SGPR->VGPR copy.
4427     Register SAddr =
4428         getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4429 
4430     if (isSGPR(SAddr)) {
4431       Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4432 
4433       // It's possible voffset is an SGPR here, but the copy to VGPR will be
4434       // inserted later.
4435       if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4436         return {{[=](MachineInstrBuilder &MIB) { // saddr
4437                    MIB.addReg(SAddr);
4438                  },
4439                  [=](MachineInstrBuilder &MIB) { // voffset
4440                    MIB.addReg(VOffset);
4441                  },
4442                  [=](MachineInstrBuilder &MIB) { // offset
4443                    MIB.addImm(ImmOffset);
4444                  }}};
4445       }
4446     }
4447   }
4448 
4449   // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4450   // drop this.
4451   if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4452       AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4453     return std::nullopt;
4454 
4455   // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4456   // moves required to copy a 64-bit SGPR to VGPR.
4457   MachineInstr *MI = Root.getParent();
4458   MachineBasicBlock *MBB = MI->getParent();
4459   Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4460 
4461   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4462       .addImm(0);
4463 
4464   return {{
4465       [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4466       [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },      // voffset
4467       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }     // offset
4468   }};
4469 }
4470 
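// Match an SGPR (or frame index) base plus a legal immediate offset for the
// scratch SADDR form. A frame index added to a uniform offset is folded with a
// scalar S_ADD_I32.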
4471 InstructionSelector::ComplexRendererFns
4472 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4473   Register Addr = Root.getReg();
4474   Register PtrBase;
4475   int64_t ConstOffset;
4476   int64_t ImmOffset = 0;
4477 
4478   // Match the immediate offset first, which canonically is moved as low as
4479   // possible.
4480   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4481 
4482   if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4483       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4484                             SIInstrFlags::FlatScratch)) {
4485     Addr = PtrBase;
4486     ImmOffset = ConstOffset;
4487   }
4488 
4489   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4490   if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4491     int FI = AddrDef->MI->getOperand(1).getIndex();
4492     return {{
4493         [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4494         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4495     }};
4496   }
4497 
4498   Register SAddr = AddrDef->Reg;
4499 
4500   if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4501     Register LHS = AddrDef->MI->getOperand(1).getReg();
4502     Register RHS = AddrDef->MI->getOperand(2).getReg();
4503     auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4504     auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4505 
4506     if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4507         isSGPR(RHSDef->Reg)) {
4508       int FI = LHSDef->MI->getOperand(1).getIndex();
4509       MachineInstr &I = *Root.getParent();
4510       MachineBasicBlock *BB = I.getParent();
4511       const DebugLoc &DL = I.getDebugLoc();
4512       SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4513 
4514       BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4515           .addFrameIndex(FI)
4516           .addReg(RHSDef->Reg)
4517           .setOperandDead(3); // Dead scc
4518     }
4519   }
4520 
4521   if (!isSGPR(SAddr))
4522     return std::nullopt;
4523 
4524   return {{
4525       [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4526       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4527   }};
4528 }
4529 
4530 // Check whether the flat scratch SVS swizzle bug affects this access.
4531 bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4532     Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4533   if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4534     return false;
4535 
4536   // The bug affects the swizzling of SVS accesses if there is any carry out
4537   // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4538   // voffset to (soffset + inst_offset).
4539   auto VKnown = KB->getKnownBits(VAddr);
4540   auto SKnown = KnownBits::computeForAddSub(
4541       /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KB->getKnownBits(SAddr),
4542       KnownBits::makeConstant(APInt(32, ImmOffset)));
4543   uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4544   uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4545   return (VMax & 3) + (SMax & 3) >= 4;
4546 }
4547 
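// Match the scratch SVS form: a VGPR offset plus an SGPR (or frame index)
// base, with an optional legal immediate offset.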
4548 InstructionSelector::ComplexRendererFns
4549 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4550   Register Addr = Root.getReg();
4551   Register PtrBase;
4552   int64_t ConstOffset;
4553   int64_t ImmOffset = 0;
4554 
4555   // Match the immediate offset first, which canonically is moved as low as
4556   // possible.
4557   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4558 
4559   Register OrigAddr = Addr;
4560   if (ConstOffset != 0 &&
4561       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4562     Addr = PtrBase;
4563     ImmOffset = ConstOffset;
4564   }
4565 
4566   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4567   if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4568     return std::nullopt;
4569 
4570   Register RHS = AddrDef->MI->getOperand(2).getReg();
4571   if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4572     return std::nullopt;
4573 
4574   Register LHS = AddrDef->MI->getOperand(1).getReg();
4575   auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4576 
4577   if (OrigAddr != Addr) {
4578     if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4579       return std::nullopt;
4580   } else {
4581     if (!isFlatScratchBaseLegalSV(OrigAddr))
4582       return std::nullopt;
4583   }
4584 
4585   if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4586     return std::nullopt;
4587 
4588   if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4589     int FI = LHSDef->MI->getOperand(1).getIndex();
4590     return {{
4591         [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4592         [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4593         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4594     }};
4595   }
4596 
4597   if (!isSGPR(LHS))
4598     return std::nullopt;
4599 
4600   return {{
4601       [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4602       [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4603       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4604   }};
4605 }
4606 
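// Select (rsrc, vaddr, soffset, offset) for a MUBUF scratch access using the
// offen addressing mode. A constant address is split into high bits
// materialized in a VGPR plus a legal immediate offset; otherwise a frame
// index and/or constant offset may be folded into vaddr and offset.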
4607 InstructionSelector::ComplexRendererFns
4608 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4609   MachineInstr *MI = Root.getParent();
4610   MachineBasicBlock *MBB = MI->getParent();
4611   MachineFunction *MF = MBB->getParent();
4612   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4613 
4614   int64_t Offset = 0;
4615   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4616       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
4617     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4618 
4619     // TODO: Should this be inside the render function? The iterator seems to
4620     // move.
4621     const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4622     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4623             HighBits)
4624         .addImm(Offset & ~MaxOffset);
4625 
4626     return {{[=](MachineInstrBuilder &MIB) { // rsrc
4627                MIB.addReg(Info->getScratchRSrcReg());
4628              },
4629              [=](MachineInstrBuilder &MIB) { // vaddr
4630                MIB.addReg(HighBits);
4631              },
4632              [=](MachineInstrBuilder &MIB) { // soffset
4633                // Use constant zero for soffset and rely on eliminateFrameIndex
4634                // to choose the appropriate frame register if need be.
4635                MIB.addImm(0);
4636              },
4637              [=](MachineInstrBuilder &MIB) { // offset
4638                MIB.addImm(Offset & MaxOffset);
4639              }}};
4640   }
4641 
4642   assert(Offset == 0 || Offset == -1);
4643 
4644   // Try to fold a frame index directly into the MUBUF vaddr field, and any
4645   // offsets.
4646   std::optional<int> FI;
4647   Register VAddr = Root.getReg();
4648   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4649     Register PtrBase;
4650     int64_t ConstOffset;
4651     std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4652     if (ConstOffset != 0) {
4653       if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
4654           (!STI.privateMemoryResourceIsRangeChecked() ||
4655            KB->signBitIsZero(PtrBase))) {
4656         const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4657         if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4658           FI = PtrBaseDef->getOperand(1).getIndex();
4659         else
4660           VAddr = PtrBase;
4661         Offset = ConstOffset;
4662       }
4663     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4664       FI = RootDef->getOperand(1).getIndex();
4665     }
4666   }
4667 
4668   return {{[=](MachineInstrBuilder &MIB) { // rsrc
4669              MIB.addReg(Info->getScratchRSrcReg());
4670            },
4671            [=](MachineInstrBuilder &MIB) { // vaddr
4672              if (FI)
4673                MIB.addFrameIndex(*FI);
4674              else
4675                MIB.addReg(VAddr);
4676            },
4677            [=](MachineInstrBuilder &MIB) { // soffset
4678              // Use constant zero for soffset and rely on eliminateFrameIndex
4679              // to choose the appropriate frame register if need be.
4680              MIB.addImm(0);
4681            },
4682            [=](MachineInstrBuilder &MIB) { // offset
4683              MIB.addImm(Offset);
4684            }}};
4685 }
4686 
4687 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4688                                                 int64_t Offset) const {
4689   if (!isUInt<16>(Offset))
4690     return false;
4691 
4692   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4693     return true;
4694 
4695   // On Southern Islands, instructions with a negative base value and an offset
4696   // don't seem to work.
4697   return KB->signBitIsZero(Base);
4698 }
4699 
4700 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4701                                                  int64_t Offset1,
4702                                                  unsigned Size) const {
4703   if (Offset0 % Size != 0 || Offset1 % Size != 0)
4704     return false;
4705   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4706     return false;
4707 
4708   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4709     return true;
4710 
4711   // On Southern Islands, instructions with a negative base value and an offset
4712   // don't seem to work.
4713   return KB->signBitIsZero(Base);
4714 }
4715 
4716 // Return whether the operation has the NoUnsignedWrap property.
4717 static bool isNoUnsignedWrap(MachineInstr *Addr) {
4718   return Addr->getOpcode() == TargetOpcode::G_OR ||
4719          (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4720           Addr->getFlag(MachineInstr::NoUWrap));
4721 }
4722 
4723 // Check that the base address of a flat scratch load/store in the form of
4724 // `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned, per the
4725 // hardware requirement). We always treat the first operand as the base here.
4726 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
4727   MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4728 
4729   if (isNoUnsignedWrap(AddrMI))
4730     return true;
4731 
4732   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4733   // values.
4734   if (STI.hasSignedScratchOffsets())
4735     return true;
4736 
4737   Register LHS = AddrMI->getOperand(1).getReg();
4738   Register RHS = AddrMI->getOperand(2).getReg();
4739 
4740   if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
4741     std::optional<ValueAndVReg> RhsValReg =
4742         getIConstantVRegValWithLookThrough(RHS, *MRI);
4743     // If the immediate offset is negative and within a certain range, the base
4744     // address cannot also be negative: if the base were negative, the sum
4745     // would be either negative or much larger than the valid range of scratch
4746     // memory a thread can access.
4747     if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
4748         RhsValReg->Value.getSExtValue() > -0x40000000)
4749       return true;
4750   }
4751 
4752   return KB->signBitIsZero(LHS);
4753 }
4754 
4755 // Check that the address values in the SGPR/VGPR are legal for flat scratch in
4756 // the form: SGPR + VGPR.
4757 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
4758   MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4759 
4760   if (isNoUnsignedWrap(AddrMI))
4761     return true;
4762 
4763   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4764   // values.
4765   if (STI.hasSignedScratchOffsets())
4766     return true;
4767 
4768   Register LHS = AddrMI->getOperand(1).getReg();
4769   Register RHS = AddrMI->getOperand(2).getReg();
4770   return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4771 }
4772 
4773 // Check that the address values in the SGPR/VGPR are legal for flat scratch in
4774 // the form: SGPR + VGPR + Imm.
4775 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
4776     Register Addr) const {
4777   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4778   // values.
4779   if (STI.hasSignedScratchOffsets())
4780     return true;
4781 
4782   MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4783   Register Base = AddrMI->getOperand(1).getReg();
4784   std::optional<DefinitionAndSourceRegister> BaseDef =
4785       getDefSrcRegIgnoringCopies(Base, *MRI);
4786   std::optional<ValueAndVReg> RHSOffset =
4787       getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
4788   assert(RHSOffset);
4789 
4790   // If the immediate offset is negative and within a certain range, the base
4791   // address cannot also be negative: if the base were negative, the sum
4792   // would be either negative or much larger than the valid range of scratch
4793   // memory a thread can access.
4794   if (isNoUnsignedWrap(BaseDef->MI) &&
4795       (isNoUnsignedWrap(AddrMI) ||
4796        (RHSOffset->Value.getSExtValue() < 0 &&
4797         RHSOffset->Value.getSExtValue() > -0x40000000)))
4798     return true;
4799 
4800   Register LHS = BaseDef->MI->getOperand(1).getReg();
4801   Register RHS = BaseDef->MI->getOperand(2).getReg();
4802   return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4803 }
4804 
4805 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4806                                                     unsigned ShAmtBits) const {
4807   assert(MI.getOpcode() == TargetOpcode::G_AND);
4808 
4809   std::optional<APInt> RHS =
4810       getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4811   if (!RHS)
4812     return false;
4813 
4814   if (RHS->countr_one() >= ShAmtBits)
4815     return true;
4816 
4817   const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
4818   return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
4819 }
4820 
4821 InstructionSelector::ComplexRendererFns
4822 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4823     MachineOperand &Root) const {
4824   Register Reg = Root.getReg();
4825   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4826 
4827   std::optional<DefinitionAndSourceRegister> Def =
4828     getDefSrcRegIgnoringCopies(Reg, *MRI);
4829   assert(Def && "this shouldn't be an optional result");
4830   Reg = Def->Reg;
4831 
4832   if (Register WaveBase = getWaveAddress(Def->MI)) {
4833     return {{
4834         [=](MachineInstrBuilder &MIB) { // rsrc
4835           MIB.addReg(Info->getScratchRSrcReg());
4836         },
4837         [=](MachineInstrBuilder &MIB) { // soffset
4838           MIB.addReg(WaveBase);
4839         },
4840         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4841     }};
4842   }
4843 
4844   int64_t Offset = 0;
4845 
4846   // FIXME: Copy check is a hack
4847   Register BasePtr;
4848   if (mi_match(Reg, *MRI,
4849                m_GPtrAdd(m_Reg(BasePtr),
4850                          m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
4851     if (!TII.isLegalMUBUFImmOffset(Offset))
4852       return {};
4853     MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
4854     Register WaveBase = getWaveAddress(BasePtrDef);
4855     if (!WaveBase)
4856       return {};
4857 
4858     return {{
4859         [=](MachineInstrBuilder &MIB) { // rsrc
4860           MIB.addReg(Info->getScratchRSrcReg());
4861         },
4862         [=](MachineInstrBuilder &MIB) { // soffset
4863           MIB.addReg(WaveBase);
4864         },
4865         [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4866     }};
4867   }
4868 
4869   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
4870       !TII.isLegalMUBUFImmOffset(Offset))
4871     return {};
4872 
4873   return {{
4874       [=](MachineInstrBuilder &MIB) { // rsrc
4875         MIB.addReg(Info->getScratchRSrcReg());
4876       },
4877       [=](MachineInstrBuilder &MIB) { // soffset
4878         MIB.addImm(0);
4879       },
4880       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4881   }};
4882 }
4883 
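// Match a DS address of the form (base + constant offset) where the offset is
// a legal unsigned 16-bit DS offset; otherwise use the address as-is with an
// offset of 0.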
4884 std::pair<Register, unsigned>
4885 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4886   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4887   if (!RootDef)
4888     return std::pair(Root.getReg(), 0);
4889 
4890   int64_t ConstAddr = 0;
4891 
4892   Register PtrBase;
4893   int64_t Offset;
4894   std::tie(PtrBase, Offset) =
4895     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4896 
4897   if (Offset) {
4898     if (isDSOffsetLegal(PtrBase, Offset)) {
4899       // (add n0, c0)
4900       return std::pair(PtrBase, Offset);
4901     }
4902   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4903     // TODO
4904 
4905 
4906   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4907     // TODO
4908 
4909   }
4910 
4911   return std::pair(Root.getReg(), 0);
4912 }
4913 
4914 InstructionSelector::ComplexRendererFns
4915 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4916   Register Reg;
4917   unsigned Offset;
4918   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
4919   return {{
4920       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4921       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
4922     }};
4923 }
4924 
4925 InstructionSelector::ComplexRendererFns
4926 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4927   return selectDSReadWrite2(Root, 4);
4928 }
4929 
4930 InstructionSelector::ComplexRendererFns
4931 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4932   return selectDSReadWrite2(Root, 8);
4933 }
4934 
4935 InstructionSelector::ComplexRendererFns
4936 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4937                                               unsigned Size) const {
4938   Register Reg;
4939   unsigned Offset;
4940   std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
4941   return {{
4942       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4943       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4944       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4945     }};
4946 }
4947 
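// Match a DS read2/write2 address: a base plus two consecutive Size-byte
// elements whose byte offsets are multiples of Size and fit the 8-bit element
// offset fields. Returns the base and the first element offset.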
4948 std::pair<Register, unsigned>
4949 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4950                                                   unsigned Size) const {
4951   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4952   if (!RootDef)
4953     return std::pair(Root.getReg(), 0);
4954 
4955   int64_t ConstAddr = 0;
4956 
4957   Register PtrBase;
4958   int64_t Offset;
4959   std::tie(PtrBase, Offset) =
4960     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4961 
4962   if (Offset) {
4963     int64_t OffsetValue0 = Offset;
4964     int64_t OffsetValue1 = Offset + Size;
4965     if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
4966       // (add n0, c0)
4967       return std::pair(PtrBase, OffsetValue0 / Size);
4968     }
4969   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4970     // TODO
4971 
4972   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4973     // TODO
4974 
4975   }
4976 
4977   return std::pair(Root.getReg(), 0);
4978 }
4979 
4980 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4981 /// the base value with the constant offset. There may be intervening copies
4982 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
4983 /// not match the pattern.
4984 std::pair<Register, int64_t>
4985 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4986   Register Root, const MachineRegisterInfo &MRI) const {
4987   MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4988   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4989     return {Root, 0};
4990 
4991   MachineOperand &RHS = RootI->getOperand(2);
4992   std::optional<ValueAndVReg> MaybeOffset =
4993       getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4994   if (!MaybeOffset)
4995     return {Root, 0};
4996   return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
4997 }
4998 
4999 static void addZeroImm(MachineInstrBuilder &MIB) {
5000   MIB.addImm(0);
5001 }
5002 
5003 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5004 /// BasePtr is not valid, a null base pointer will be used.
5005 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5006                           uint32_t FormatLo, uint32_t FormatHi,
5007                           Register BasePtr) {
5008   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5009   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5010   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5011   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5012 
5013   B.buildInstr(AMDGPU::S_MOV_B32)
5014     .addDef(RSrc2)
5015     .addImm(FormatLo);
5016   B.buildInstr(AMDGPU::S_MOV_B32)
5017     .addDef(RSrc3)
5018     .addImm(FormatHi);
5019 
5020   // Build the half of the register that holds the constants before building the
5021   // full 128-bit register. If we are building multiple resource descriptors,
5022   // this will allow CSEing of the 2-component register.
5023   B.buildInstr(AMDGPU::REG_SEQUENCE)
5024     .addDef(RSrcHi)
5025     .addReg(RSrc2)
5026     .addImm(AMDGPU::sub0)
5027     .addReg(RSrc3)
5028     .addImm(AMDGPU::sub1);
5029 
5030   Register RSrcLo = BasePtr;
5031   if (!BasePtr) {
5032     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5033     B.buildInstr(AMDGPU::S_MOV_B64)
5034       .addDef(RSrcLo)
5035       .addImm(0);
5036   }
5037 
5038   B.buildInstr(AMDGPU::REG_SEQUENCE)
5039     .addDef(RSrc)
5040     .addReg(RSrcLo)
5041     .addImm(AMDGPU::sub0_sub1)
5042     .addReg(RSrcHi)
5043     .addImm(AMDGPU::sub2_sub3);
5044 
5045   return RSrc;
5046 }
5047 
5048 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5049                                 const SIInstrInfo &TII, Register BasePtr) {
5050   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5051 
5052   // FIXME: Why are half the "default" bits ignored based on the addressing
5053   // mode?
5054   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5055 }
5056 
5057 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5058                                const SIInstrInfo &TII, Register BasePtr) {
5059   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5060 
5061   // FIXME: Why are half the "default" bits ignored based on the addressing
5062   // mode?
5063   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5064 }
5065 
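// Decompose a MUBUF address into a constant offset (if it fits in 32 bits) and
// its pointer components: N0 is the remaining base, and N2/N3 are the operands
// of an inner ptr_add, if present.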
5066 AMDGPUInstructionSelector::MUBUFAddressData
5067 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5068   MUBUFAddressData Data;
5069   Data.N0 = Src;
5070 
5071   Register PtrBase;
5072   int64_t Offset;
5073 
5074   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5075   if (isUInt<32>(Offset)) {
5076     Data.N0 = PtrBase;
5077     Data.Offset = Offset;
5078   }
5079 
5080   if (MachineInstr *InputAdd
5081       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5082     Data.N2 = InputAdd->getOperand(1).getReg();
5083     Data.N3 = InputAdd->getOperand(2).getReg();
5084 
5085     // FIXME: Need to fix extra SGPR->VGPR copies inserted
5086     // FIXME: We don't know that this was defined by operand 0
5087     //
5088     // TODO: Remove this when we have copy folding optimizations after
5089     // RegBankSelect.
5090     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5091     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5092   }
5093 
5094   return Data;
5095 }
5096 
5097 /// Return whether the addr64 MUBUF mode should be used for the given address.
5098 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5099   // (ptr_add N2, N3) -> addr64, or
5100   // (ptr_add (ptr_add N2, N3), C1) -> addr64
5101   if (Addr.N2)
5102     return true;
5103 
5104   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5105   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5106 }
5107 
5108 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
5109 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5110 /// component.
5111 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5112   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5113   if (TII.isLegalMUBUFImmOffset(ImmOffset))
5114     return;
5115 
5116   // Illegal offset, store it in soffset.
5117   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5118   B.buildInstr(AMDGPU::S_MOV_B32)
5119     .addDef(SOffset)
5120     .addImm(ImmOffset);
5121   ImmOffset = 0;
5122 }
5123 
5124 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5125   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5126   Register &SOffset, int64_t &Offset) const {
5127   // FIXME: Predicates should stop this from reaching here.
5128   // The addr64 bit was removed for Volcanic Islands.
5129   if (!STI.hasAddr64() || STI.useFlatForGlobal())
5130     return false;
5131 
5132   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5133   if (!shouldUseAddr64(AddrData))
5134     return false;
5135 
5136   Register N0 = AddrData.N0;
5137   Register N2 = AddrData.N2;
5138   Register N3 = AddrData.N3;
5139   Offset = AddrData.Offset;
5140 
5141   // Base pointer for the SRD.
5142   Register SRDPtr;
5143 
5144   if (N2) {
5145     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5146       assert(N3);
5147       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5148         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5149         // addr64, and construct the default resource from a 0 address.
5150         VAddr = N0;
5151       } else {
5152         SRDPtr = N3;
5153         VAddr = N2;
5154       }
5155     } else {
5156       // N2 is not divergent.
5157       SRDPtr = N2;
5158       VAddr = N3;
5159     }
5160   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5161     // Use the default null pointer in the resource
5162     VAddr = N0;
5163   } else {
5164     // N0 -> offset, or
5165     // (N0 + C1) -> offset
5166     SRDPtr = N0;
5167   }
5168 
5169   MachineIRBuilder B(*Root.getParent());
5170   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5171   splitIllegalMUBUFOffset(B, SOffset, Offset);
5172   return true;
5173 }
5174 
5175 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5176   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5177   int64_t &Offset) const {
5178 
5179   // FIXME: Pattern should not reach here.
5180   if (STI.useFlatForGlobal())
5181     return false;
5182 
5183   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5184   if (shouldUseAddr64(AddrData))
5185     return false;
5186 
5187   // N0 -> offset, or
5188   // (N0 + C1) -> offset
5189   Register SRDPtr = AddrData.N0;
5190   Offset = AddrData.Offset;
5191 
5192   // TODO: Look through extensions for 32-bit soffset.
5193   MachineIRBuilder B(*Root.getParent());
5194 
5195   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5196   splitIllegalMUBUFOffset(B, SOffset, Offset);
5197   return true;
5198 }
5199 
5200 InstructionSelector::ComplexRendererFns
5201 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5202   Register VAddr;
5203   Register RSrcReg;
5204   Register SOffset;
5205   int64_t Offset = 0;
5206 
5207   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5208     return {};
5209 
5210   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5211   // pattern.
5212   return {{
5213       [=](MachineInstrBuilder &MIB) {  // rsrc
5214         MIB.addReg(RSrcReg);
5215       },
5216       [=](MachineInstrBuilder &MIB) { // vaddr
5217         MIB.addReg(VAddr);
5218       },
5219       [=](MachineInstrBuilder &MIB) { // soffset
5220         if (SOffset)
5221           MIB.addReg(SOffset);
5222         else if (STI.hasRestrictedSOffset())
5223           MIB.addReg(AMDGPU::SGPR_NULL);
5224         else
5225           MIB.addImm(0);
5226       },
5227       [=](MachineInstrBuilder &MIB) { // offset
5228         MIB.addImm(Offset);
5229       },
5230       addZeroImm, //  cpol
5231       addZeroImm, //  tfe
5232       addZeroImm  //  swz
5233     }};
5234 }
5235 
5236 InstructionSelector::ComplexRendererFns
5237 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5238   Register RSrcReg;
5239   Register SOffset;
5240   int64_t Offset = 0;
5241 
5242   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5243     return {};
5244 
5245   return {{
5246       [=](MachineInstrBuilder &MIB) {  // rsrc
5247         MIB.addReg(RSrcReg);
5248       },
5249       [=](MachineInstrBuilder &MIB) { // soffset
5250         if (SOffset)
5251           MIB.addReg(SOffset);
5252         else if (STI.hasRestrictedSOffset())
5253           MIB.addReg(AMDGPU::SGPR_NULL);
5254         else
5255           MIB.addImm(0);
5256       },
5257       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5258       addZeroImm, //  cpol
5259       addZeroImm, //  tfe
5260       addZeroImm, //  swz
5261     }};
5262 }
5263 
5264 InstructionSelector::ComplexRendererFns
5265 AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5266 
5267   Register SOffset = Root.getReg();
5268 
5269   if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5270     SOffset = AMDGPU::SGPR_NULL;
5271 
5272   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5273 }
5274 
5275 /// Get an immediate that must be 32 bits and is treated as zero extended.
5276 static std::optional<uint64_t>
5277 getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5278   // getIConstantVRegVal sexts any values, so see if that matters.
5279   std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5280   if (!OffsetVal || !isInt<32>(*OffsetVal))
5281     return std::nullopt;
5282   return Lo_32(*OffsetVal);
5283 }
5284 
5285 InstructionSelector::ComplexRendererFns
5286 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5287   std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5288   if (!OffsetVal)
5289     return {};
5290 
5291   std::optional<int64_t> EncodedImm =
5292       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5293   if (!EncodedImm)
5294     return {};
5295 
5296   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
5297 }
5298 
5299 InstructionSelector::ComplexRendererFns
5300 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5301   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
5302 
5303   std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5304   if (!OffsetVal)
5305     return {};
5306 
5307   std::optional<int64_t> EncodedImm =
5308       AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5309   if (!EncodedImm)
5310     return {};
5311 
5312   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
5313 }
5314 
5315 InstructionSelector::ComplexRendererFns
5316 AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5317   // Match the (soffset + offset) pair as a 32-bit register base and
5318   // an immediate offset.
5319   Register SOffset;
5320   unsigned Offset;
5321   std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5322       *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5323   if (!SOffset)
5324     return std::nullopt;
5325 
5326   std::optional<int64_t> EncodedOffset =
5327       AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5328   if (!EncodedOffset)
5329     return std::nullopt;
5330 
5331   assert(MRI->getType(SOffset) == LLT::scalar(32));
5332   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5333            [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5334 }
5335 
5336 // Variant of stripBitCast that returns the instruction instead of a
5337 // MachineOperand.
5338 static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
5339   if (MI->getOpcode() == AMDGPU::G_BITCAST)
5340     return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
5341   return MI;
5342 }
5343 
5344 // Figure out if this is really an extract of the high 16 bits of a dword;
5345 // returns nullptr if it isn't.
5346 static MachineInstr *isExtractHiElt(MachineInstr *Inst,
5347                                     MachineRegisterInfo &MRI) {
5348   Inst = stripBitCast(Inst, MRI);
5349 
5350   if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5351     return nullptr;
5352 
5353   MachineInstr *TruncOp =
5354       getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
5355   TruncOp = stripBitCast(TruncOp, MRI);
5356 
5357   // G_LSHR x, (G_CONSTANT i32 16)
5358   if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5359     auto SrlAmount = getIConstantVRegValWithLookThrough(
5360         TruncOp->getOperand(2).getReg(), MRI);
5361     if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5362       MachineInstr *SrlOp =
5363           getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5364       return stripBitCast(SrlOp, MRI);
5365     }
5366   }
5367 
5368   // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
5369   //    1, 0 swaps the low/high 16 bits.
5370   //    1, 1 sets the high 16 bits to be the same as the low 16.
5371   // In either case, it selects the high elements.
5372   if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5373     assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5374            LLT::fixed_vector(2, 16));
5375 
5376     ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
5377     assert(Mask.size() == 2);
5378 
5379     if (Mask[0] == 1 && Mask[1] <= 1) {
5380       MachineInstr *LHS =
5381           getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5382       return stripBitCast(LHS, MRI);
5383     }
5384   }
5385 
5386   return nullptr;
5387 }
5388 
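// Match source modifiers for a mad_mix/fma_mix style operand. If the source is
// a G_FPEXT from f16, the conversion is folded by setting op_sel_hi (and
// op_sel when the f16 value is the high half of a dword), and Matched is set.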
5389 std::pair<Register, unsigned>
5390 AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5391                                                      bool &Matched) const {
5392   Matched = false;
5393 
5394   Register Src;
5395   unsigned Mods;
5396   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
5397 
5398   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
5399   if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5400     MachineOperand *MO = &MI->getOperand(1);
5401     Src = MO->getReg();
5402     MI = getDefIgnoringCopies(Src, *MRI);
5403 
5404     assert(MRI->getType(Src) == LLT::scalar(16));
5405 
5406     // See through bitcasts.
5407     // FIXME: Would be nice to use stripBitCast here.
5408     if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5409       MO = &MI->getOperand(1);
5410       Src = MO->getReg();
5411       MI = getDefIgnoringCopies(Src, *MRI);
5412     }
5413 
5414     const auto CheckAbsNeg = [&]() {
5415       // Be careful about folding modifiers if we already have an abs. fneg is
5416       // applied last, so we don't want to apply an earlier fneg.
5417       if ((Mods & SISrcMods::ABS) == 0) {
5418         unsigned ModsTmp;
5419         std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
5420         MI = getDefIgnoringCopies(Src, *MRI);
5421 
5422         if ((ModsTmp & SISrcMods::NEG) != 0)
5423           Mods ^= SISrcMods::NEG;
5424 
5425         if ((ModsTmp & SISrcMods::ABS) != 0)
5426           Mods |= SISrcMods::ABS;
5427       }
5428     };
5429 
5430     CheckAbsNeg();
5431 
5432     // op_sel/op_sel_hi decide the source type and source.
5433     // If the source's op_sel_hi is set, it indicates to do a conversion from
5434     // fp16. If the source's op_sel is set, it picks the high half of the
5435     // source register.
5436 
5437     Mods |= SISrcMods::OP_SEL_1;
5438 
5439     if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
5440       Mods |= SISrcMods::OP_SEL_0;
5441       MI = ExtractHiEltMI;
5442       MO = &MI->getOperand(0);
5443       Src = MO->getReg();
5444 
5445       CheckAbsNeg();
5446     }
5447 
5448     Matched = true;
5449   }
5450 
5451   return {Src, Mods};
5452 }
5453 
5454 InstructionSelector::ComplexRendererFns
5455 AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5456     MachineOperand &Root) const {
5457   Register Src;
5458   unsigned Mods;
5459   bool Matched;
5460   std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5461   if (!Matched)
5462     return {};
5463 
5464   return {{
5465       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5466       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5467   }};
5468 }
5469 
5470 InstructionSelector::ComplexRendererFns
5471 AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5472   Register Src;
5473   unsigned Mods;
5474   bool Matched;
5475   std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5476 
5477   return {{
5478       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5479       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5480   }};
5481 }
5482 
5483 bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5484     MachineInstr &I, Intrinsic::ID IntrID) const {
5485   MachineBasicBlock *MBB = I.getParent();
5486   const DebugLoc &DL = I.getDebugLoc();
5487   Register CCReg = I.getOperand(0).getReg();
5488 
5489   bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
5490 
5491   if (HasM0) {
5492     auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5493                        .addReg(I.getOperand(2).getReg());
5494     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
5495     if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
5496       return false;
5497   } else {
5498     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5499         .addImm(I.getOperand(2).getImm());
5500   }
5501 
5502   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5503 
5504   I.eraseFromParent();
5505   return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5506                                       *MRI);
5507 }
5508 
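// Map a named barrier intrinsic to the _IMM or _M0 variant of its machine
// opcode, depending on whether the barrier id is a known constant
// (HasInlineConst).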
5509 unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5510   if (HasInlineConst) {
5511     switch (IntrID) {
5512     default:
5513       llvm_unreachable("not a named barrier op");
5514     case Intrinsic::amdgcn_s_barrier_init:
5515       return AMDGPU::S_BARRIER_INIT_IMM;
5516     case Intrinsic::amdgcn_s_barrier_join:
5517       return AMDGPU::S_BARRIER_JOIN_IMM;
5518     case Intrinsic::amdgcn_s_wakeup_barrier:
5519       return AMDGPU::S_WAKEUP_BARRIER_IMM;
5520     case Intrinsic::amdgcn_s_get_barrier_state:
5521       return AMDGPU::S_GET_BARRIER_STATE_IMM;
5522     }
5523   } else {
5524     switch (IntrID) {
5525     default:
5526       llvm_unreachable("not a named barrier op");
5527     case Intrinsic::amdgcn_s_barrier_init:
5528       return AMDGPU::S_BARRIER_INIT_M0;
5529     case Intrinsic::amdgcn_s_barrier_join:
5530       return AMDGPU::S_BARRIER_JOIN_M0;
5531     case Intrinsic::amdgcn_s_wakeup_barrier:
5532       return AMDGPU::S_WAKEUP_BARRIER_M0;
5533     case Intrinsic::amdgcn_s_get_barrier_state:
5534       return AMDGPU::S_GET_BARRIER_STATE_M0;
5535     }
5536   }
5537 }
5538 
5539 bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5540     MachineInstr &I, Intrinsic::ID IntrID) const {
5541   MachineBasicBlock *MBB = I.getParent();
5542   const DebugLoc &DL = I.getDebugLoc();
5543   MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
5544                              ? I.getOperand(2)
5545                              : I.getOperand(1);
5546   std::optional<int64_t> BarValImm =
5547       getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5548   Register M0Val;
5549   Register TmpReg0;
5550 
5551   // For S_BARRIER_INIT, the member count will always be read from M0[16:22].
5552   if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5553     Register MemberCount = I.getOperand(2).getReg();
5554     TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5555     // TODO: This should be expanded during legalization so that the S_LSHL
5556     // and S_OR can be constant-folded.
5557     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
5558         .addImm(16)
5559         .addReg(MemberCount);
5560     M0Val = TmpReg0;
5561   }
5562 
5563   // If not inlinable, get a reference to the barrier depending on the instruction.
5564   if (!BarValImm) {
5565     if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5566       // If the reference to the barrier id is not an inlinable constant, then it
5567       // must be referenced with M0[4:0]. Perform an OR with the member count to
5568       // include it in M0 for S_BARRIER_INIT.
5569       Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5570       BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
5571           .addReg(BarOp.getReg())
5572           .addReg(TmpReg0);
5573       M0Val = TmpReg1;
5574     } else {
5575       M0Val = BarOp.getReg();
5576     }
5577   }
5578 
5579   // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
5580   if (M0Val) {
5581     auto CopyMIB =
5582         BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
5583     constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5584   }
5585 
5586   MachineInstrBuilder MIB;
5587   unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5588   MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5589 
5590   if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
5591     MIB.addDef(I.getOperand(0).getReg());
5592 
5593   if (BarValImm)
5594     MIB.addImm(*BarValImm);
5595 
5596   I.eraseFromParent();
5597   return true;
5598 }
5599 
5600 bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
5601   MachineBasicBlock *BB = I.getParent();
5602   const DebugLoc &DL = I.getDebugLoc();
5603   Register CCReg = I.getOperand(0).getReg();
5604 
5605   BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
5606   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5607 
5608   I.eraseFromParent();
5609   return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5610                                       *MRI);
5611 }
5612 
5613 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5614                                                  const MachineInstr &MI,
5615                                                  int OpIdx) const {
5616   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5617          "Expected G_CONSTANT");
5618   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5619 }
5620 
5621 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5622                                                 const MachineInstr &MI,
5623                                                 int OpIdx) const {
5624   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5625          "Expected G_CONSTANT");
5626   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5627 }
5628 
5629 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
5630                                                  const MachineInstr &MI,
5631                                                  int OpIdx) const {
5632   assert(OpIdx == -1);
5633 
5634   const MachineOperand &Op = MI.getOperand(1);
5635   if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
5636     MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5637   else {
5638     assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
5639     MIB.addImm(Op.getCImm()->getSExtValue());
5640   }
5641 }
5642 
5643 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5644                                                 const MachineInstr &MI,
5645                                                 int OpIdx) const {
5646   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5647          "Expected G_CONSTANT");
5648   MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5649 }
5650 
5651 /// This only really exists to satisfy the DAG type checking machinery, so it is
5652 /// a no-op here.
5653 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
5654                                                 const MachineInstr &MI,
5655                                                 int OpIdx) const {
5656   MIB.addImm(MI.getOperand(OpIdx).getImm());
5657 }
5658 
5659 void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
5660                                                 const MachineInstr &MI,
5661                                                 int OpIdx) const {
5662   assert(OpIdx >= 0 && "expected to match an immediate operand");
5663   MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
5664 }
5665 
5666 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
5667                                                   const MachineInstr &MI,
5668                                                   int OpIdx) const {
5669   assert(OpIdx >= 0 && "expected to match an immediate operand");
5670   MIB.addImm(MI.getOperand(OpIdx).getImm() &
5671              (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5672                                        : AMDGPU::CPol::ALL_pregfx12));
5673 }
5674 
5675 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
5676                                                  const MachineInstr &MI,
5677                                                  int OpIdx) const {
5678   assert(OpIdx >= 0 && "expected to match an immediate operand");
5679   const bool Swizzle = MI.getOperand(OpIdx).getImm() &
5680                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
5681                                                  : AMDGPU::CPol::SWZ_pregfx12);
5682   MIB.addImm(Swizzle);
5683 }
5684 
5685 void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
5686     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
5687   assert(OpIdx >= 0 && "expected to match an immediate operand");
5688   const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
5689                         (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5690                                                   : AMDGPU::CPol::ALL_pregfx12);
5691   MIB.addImm(Cpol | AMDGPU::CPol::GLC);
5692 }
5693 
5694 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
5695                                                  const MachineInstr &MI,
5696                                                  int OpIdx) const {
5697   MIB.addFrameIndex(MI.getOperand(1).getIndex());
5698 }
5699 
5700 void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
5701                                                        const MachineInstr &MI,
5702                                                        int OpIdx) const {
5703   const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
5704   int ExpVal = APF.getExactLog2Abs();
5705   assert(ExpVal != INT_MIN);
5706   MIB.addImm(ExpVal);
5707 }
5708 
5709 bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
5710   return TII.isInlineConstant(Imm);
5711 }
5712 
5713 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
5714   return TII.isInlineConstant(Imm);
5715 }
5716