xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (revision 357378bbdedf24ce2b90e9bd831af4a9db3ec70a)
1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
23 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
24 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/MachineFrameInfo.h"
28 #include "llvm/IR/DiagnosticInfo.h"
29 #include "llvm/IR/IntrinsicsAMDGPU.h"
30 #include <optional>
31 
32 #define DEBUG_TYPE "amdgpu-isel"
33 
34 using namespace llvm;
35 using namespace MIPatternMatch;
36 
// Hidden command-line flag, off by default. When set, selectPHI() goes ahead
// and selects s1 (boolean) G_PHIs instead of rejecting them.
static cl::opt<bool> AllowRiskySelect(
  "amdgpu-global-isel-risky-select",
  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
  cl::init(false),
  cl::ReallyHidden);
42 
43 #define GET_GLOBALISEL_IMPL
44 #define AMDGPUSubtarget GCNSubtarget
45 #include "AMDGPUGenGlobalISel.inc"
46 #undef GET_GLOBALISEL_IMPL
47 #undef AMDGPUSubtarget
48 
/// Construct the selector for subtarget \p STI.
///
/// Caches the instruction info and register info obtained from \p STI, keeps
/// references to \p RBI and \p TM, and snapshots
/// AMDGPUTargetMachine::EnableLateStructurizeCFG. The remaining members are
/// initialized by the TableGen-generated fragments of AMDGPUGenGlobalISel.inc
/// that are textually included into the initializer list below.
AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}
63 
/// Returns the name of this selector, which is the DEBUG_TYPE string
/// ("amdgpu-isel").
const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
65 
66 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
67                                         CodeGenCoverage *CoverageInfo,
68                                         ProfileSummaryInfo *PSI,
69                                         BlockFrequencyInfo *BFI) {
70   MRI = &MF.getRegInfo();
71   Subtarget = &MF.getSubtarget<GCNSubtarget>();
72   InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
73 }
74 
75 // Return the wave level SGPR base address if this is a wave address.
76 static Register getWaveAddress(const MachineInstr *Def) {
77   return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
78              ? Def->getOperand(1).getReg()
79              : Register();
80 }
81 
82 bool AMDGPUInstructionSelector::isVCC(Register Reg,
83                                       const MachineRegisterInfo &MRI) const {
84   // The verifier is oblivious to s1 being a valid value for wavesize registers.
85   if (Reg.isPhysical())
86     return false;
87 
88   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
89   const TargetRegisterClass *RC =
90       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
91   if (RC) {
92     const LLT Ty = MRI.getType(Reg);
93     if (!Ty.isValid() || Ty.getSizeInBits() != 1)
94       return false;
95     // G_TRUNC s1 result is never vcc.
96     return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
97            RC->hasSuperClassEq(TRI.getBoolRC());
98   }
99 
100   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
101   return RB->getID() == AMDGPU::VCCRegBankID;
102 }
103 
104 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
105                                                         unsigned NewOpc) const {
106   MI.setDesc(TII.get(NewOpc));
107   MI.removeOperand(1); // Remove intrinsic ID.
108   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
109 
110   MachineOperand &Dst = MI.getOperand(0);
111   MachineOperand &Src = MI.getOperand(1);
112 
113   // TODO: This should be legalized to s32 if needed
114   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
115     return false;
116 
117   const TargetRegisterClass *DstRC
118     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
119   const TargetRegisterClass *SrcRC
120     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
121   if (!DstRC || DstRC != SrcRC)
122     return false;
123 
124   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
125          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
126 }
127 
128 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
129   const DebugLoc &DL = I.getDebugLoc();
130   MachineBasicBlock *BB = I.getParent();
131   I.setDesc(TII.get(TargetOpcode::COPY));
132 
133   const MachineOperand &Src = I.getOperand(1);
134   MachineOperand &Dst = I.getOperand(0);
135   Register DstReg = Dst.getReg();
136   Register SrcReg = Src.getReg();
137 
138   if (isVCC(DstReg, *MRI)) {
139     if (SrcReg == AMDGPU::SCC) {
140       const TargetRegisterClass *RC
141         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
142       if (!RC)
143         return true;
144       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
145     }
146 
147     if (!isVCC(SrcReg, *MRI)) {
148       // TODO: Should probably leave the copy and let copyPhysReg expand it.
149       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
150         return false;
151 
152       const TargetRegisterClass *SrcRC
153         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
154 
155       std::optional<ValueAndVReg> ConstVal =
156           getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
157       if (ConstVal) {
158         unsigned MovOpc =
159             STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
160         BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
161             .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
162       } else {
163         Register MaskedReg = MRI->createVirtualRegister(SrcRC);
164 
165         // We can't trust the high bits at this point, so clear them.
166 
167         // TODO: Skip masking high bits if def is known boolean.
168 
169         bool IsSGPR = TRI.isSGPRClass(SrcRC);
170         unsigned AndOpc =
171             IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
172         auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
173             .addImm(1)
174             .addReg(SrcReg);
175         if (IsSGPR)
176           And.setOperandDead(3); // Dead scc
177 
178         BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
179             .addImm(0)
180             .addReg(MaskedReg);
181       }
182 
183       if (!MRI->getRegClassOrNull(SrcReg))
184         MRI->setRegClass(SrcReg, SrcRC);
185       I.eraseFromParent();
186       return true;
187     }
188 
189     const TargetRegisterClass *RC =
190       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
191     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
192       return false;
193 
194     return true;
195   }
196 
197   for (const MachineOperand &MO : I.operands()) {
198     if (MO.getReg().isPhysical())
199       continue;
200 
201     const TargetRegisterClass *RC =
202             TRI.getConstrainedRegClassForOperand(MO, *MRI);
203     if (!RC)
204       continue;
205     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
206   }
207   return true;
208 }
209 
210 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
211   const Register DefReg = I.getOperand(0).getReg();
212   const LLT DefTy = MRI->getType(DefReg);
213   if (DefTy == LLT::scalar(1)) {
214     if (!AllowRiskySelect) {
215       LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
216       return false;
217     }
218 
219     LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
220   }
221 
222   // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
223 
224   const RegClassOrRegBank &RegClassOrBank =
225     MRI->getRegClassOrRegBank(DefReg);
226 
227   const TargetRegisterClass *DefRC
228     = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
229   if (!DefRC) {
230     if (!DefTy.isValid()) {
231       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
232       return false;
233     }
234 
235     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
236     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
237     if (!DefRC) {
238       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
239       return false;
240     }
241   }
242 
243   // TODO: Verify that all registers have the same bank
244   I.setDesc(TII.get(TargetOpcode::PHI));
245   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
246 }
247 
248 MachineOperand
249 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
250                                            const TargetRegisterClass &SubRC,
251                                            unsigned SubIdx) const {
252 
253   MachineInstr *MI = MO.getParent();
254   MachineBasicBlock *BB = MO.getParent()->getParent();
255   Register DstReg = MRI->createVirtualRegister(&SubRC);
256 
257   if (MO.isReg()) {
258     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
259     Register Reg = MO.getReg();
260     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
261             .addReg(Reg, 0, ComposedSubIdx);
262 
263     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
264                                      MO.isKill(), MO.isDead(), MO.isUndef(),
265                                      MO.isEarlyClobber(), 0, MO.isDebug(),
266                                      MO.isInternalRead());
267   }
268 
269   assert(MO.isImm());
270 
271   APInt Imm(64, MO.getImm());
272 
273   switch (SubIdx) {
274   default:
275     llvm_unreachable("do not know to split immediate with this sub index.");
276   case AMDGPU::sub0:
277     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
278   case AMDGPU::sub1:
279     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
280   }
281 }
282 
283 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
284   switch (Opc) {
285   case AMDGPU::G_AND:
286     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
287   case AMDGPU::G_OR:
288     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
289   case AMDGPU::G_XOR:
290     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
291   default:
292     llvm_unreachable("not a bit op");
293   }
294 }
295 
296 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
297   Register DstReg = I.getOperand(0).getReg();
298   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
299 
300   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
301   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
302       DstRB->getID() != AMDGPU::VCCRegBankID)
303     return false;
304 
305   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
306                             STI.isWave64());
307   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
308 
309   // Dead implicit-def of scc
310   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
311                                          true, // isImp
312                                          false, // isKill
313                                          true)); // isDead
314   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
315 }
316 
317 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
318   MachineBasicBlock *BB = I.getParent();
319   MachineFunction *MF = BB->getParent();
320   Register DstReg = I.getOperand(0).getReg();
321   const DebugLoc &DL = I.getDebugLoc();
322   LLT Ty = MRI->getType(DstReg);
323   if (Ty.isVector())
324     return false;
325 
326   unsigned Size = Ty.getSizeInBits();
327   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
328   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
329   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
330 
331   if (Size == 32) {
332     if (IsSALU) {
333       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
334       MachineInstr *Add =
335         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
336         .add(I.getOperand(1))
337         .add(I.getOperand(2))
338         .setOperandDead(3); // Dead scc
339       I.eraseFromParent();
340       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
341     }
342 
343     if (STI.hasAddNoCarry()) {
344       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
345       I.setDesc(TII.get(Opc));
346       I.addOperand(*MF, MachineOperand::CreateImm(0));
347       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
348       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
349     }
350 
351     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
352 
353     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
354     MachineInstr *Add
355       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
356       .addDef(UnusedCarry, RegState::Dead)
357       .add(I.getOperand(1))
358       .add(I.getOperand(2))
359       .addImm(0);
360     I.eraseFromParent();
361     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
362   }
363 
364   assert(!Sub && "illegal sub should not reach here");
365 
366   const TargetRegisterClass &RC
367     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
368   const TargetRegisterClass &HalfRC
369     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
370 
371   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
372   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
373   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
374   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
375 
376   Register DstLo = MRI->createVirtualRegister(&HalfRC);
377   Register DstHi = MRI->createVirtualRegister(&HalfRC);
378 
379   if (IsSALU) {
380     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
381       .add(Lo1)
382       .add(Lo2);
383     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
384       .add(Hi1)
385       .add(Hi2)
386       .setOperandDead(3); // Dead scc
387   } else {
388     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
389     Register CarryReg = MRI->createVirtualRegister(CarryRC);
390     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
391       .addDef(CarryReg)
392       .add(Lo1)
393       .add(Lo2)
394       .addImm(0);
395     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
396       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
397       .add(Hi1)
398       .add(Hi2)
399       .addReg(CarryReg, RegState::Kill)
400       .addImm(0);
401 
402     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
403       return false;
404   }
405 
406   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
407     .addReg(DstLo)
408     .addImm(AMDGPU::sub0)
409     .addReg(DstHi)
410     .addImm(AMDGPU::sub1);
411 
412 
413   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
414     return false;
415 
416   I.eraseFromParent();
417   return true;
418 }
419 
420 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
421   MachineInstr &I) const {
422   MachineBasicBlock *BB = I.getParent();
423   MachineFunction *MF = BB->getParent();
424   const DebugLoc &DL = I.getDebugLoc();
425   Register Dst0Reg = I.getOperand(0).getReg();
426   Register Dst1Reg = I.getOperand(1).getReg();
427   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
428                      I.getOpcode() == AMDGPU::G_UADDE;
429   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
430                           I.getOpcode() == AMDGPU::G_USUBE;
431 
432   if (isVCC(Dst1Reg, *MRI)) {
433     unsigned NoCarryOpc =
434         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
435     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
436     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
437     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
438     I.addOperand(*MF, MachineOperand::CreateImm(0));
439     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
440   }
441 
442   Register Src0Reg = I.getOperand(2).getReg();
443   Register Src1Reg = I.getOperand(3).getReg();
444 
445   if (HasCarryIn) {
446     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
447       .addReg(I.getOperand(4).getReg());
448   }
449 
450   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
451   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
452 
453   auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
454     .add(I.getOperand(2))
455     .add(I.getOperand(3));
456 
457   if (MRI->use_nodbg_empty(Dst1Reg)) {
458     CarryInst.setOperandDead(3); // Dead scc
459   } else {
460     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
461       .addReg(AMDGPU::SCC);
462     if (!MRI->getRegClassOrNull(Dst1Reg))
463       MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
464   }
465 
466   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
467       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
468       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
469     return false;
470 
471   if (HasCarryIn &&
472       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
473                                     AMDGPU::SReg_32RegClass, *MRI))
474     return false;
475 
476   I.eraseFromParent();
477   return true;
478 }
479 
480 bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
481     MachineInstr &I) const {
482   MachineBasicBlock *BB = I.getParent();
483   MachineFunction *MF = BB->getParent();
484   const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
485 
486   unsigned Opc;
487   if (Subtarget->hasMADIntraFwdBug())
488     Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
489                      : AMDGPU::V_MAD_I64_I32_gfx11_e64;
490   else
491     Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
492   I.setDesc(TII.get(Opc));
493   I.addOperand(*MF, MachineOperand::CreateImm(0));
494   I.addImplicitDefUseOperands(*MF);
495   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
496 }
497 
498 // TODO: We should probably legalize these to only using 32-bit results.
499 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
500   MachineBasicBlock *BB = I.getParent();
501   Register DstReg = I.getOperand(0).getReg();
502   Register SrcReg = I.getOperand(1).getReg();
503   LLT DstTy = MRI->getType(DstReg);
504   LLT SrcTy = MRI->getType(SrcReg);
505   const unsigned SrcSize = SrcTy.getSizeInBits();
506   unsigned DstSize = DstTy.getSizeInBits();
507 
508   // TODO: Should handle any multiple of 32 offset.
509   unsigned Offset = I.getOperand(2).getImm();
510   if (Offset % 32 != 0 || DstSize > 128)
511     return false;
512 
513   // 16-bit operations really use 32-bit registers.
514   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
515   if (DstSize == 16)
516     DstSize = 32;
517 
518   const TargetRegisterClass *DstRC =
519     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
520   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
521     return false;
522 
523   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
524   const TargetRegisterClass *SrcRC =
525       TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
526   if (!SrcRC)
527     return false;
528   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
529                                                          DstSize / 32);
530   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
531   if (!SrcRC)
532     return false;
533 
534   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
535                                     *SrcRC, I.getOperand(1));
536   const DebugLoc &DL = I.getDebugLoc();
537   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
538     .addReg(SrcReg, 0, SubReg);
539 
540   I.eraseFromParent();
541   return true;
542 }
543 
544 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
545   MachineBasicBlock *BB = MI.getParent();
546   Register DstReg = MI.getOperand(0).getReg();
547   LLT DstTy = MRI->getType(DstReg);
548   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
549 
550   const unsigned SrcSize = SrcTy.getSizeInBits();
551   if (SrcSize < 32)
552     return selectImpl(MI, *CoverageInfo);
553 
554   const DebugLoc &DL = MI.getDebugLoc();
555   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
556   const unsigned DstSize = DstTy.getSizeInBits();
557   const TargetRegisterClass *DstRC =
558       TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
559   if (!DstRC)
560     return false;
561 
562   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
563   MachineInstrBuilder MIB =
564     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
565   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
566     MachineOperand &Src = MI.getOperand(I + 1);
567     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
568     MIB.addImm(SubRegs[I]);
569 
570     const TargetRegisterClass *SrcRC
571       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
572     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
573       return false;
574   }
575 
576   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
577     return false;
578 
579   MI.eraseFromParent();
580   return true;
581 }
582 
583 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
584   MachineBasicBlock *BB = MI.getParent();
585   const int NumDst = MI.getNumOperands() - 1;
586 
587   MachineOperand &Src = MI.getOperand(NumDst);
588 
589   Register SrcReg = Src.getReg();
590   Register DstReg0 = MI.getOperand(0).getReg();
591   LLT DstTy = MRI->getType(DstReg0);
592   LLT SrcTy = MRI->getType(SrcReg);
593 
594   const unsigned DstSize = DstTy.getSizeInBits();
595   const unsigned SrcSize = SrcTy.getSizeInBits();
596   const DebugLoc &DL = MI.getDebugLoc();
597   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
598 
599   const TargetRegisterClass *SrcRC =
600       TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
601   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
602     return false;
603 
604   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
605   // source, and this relies on the fact that the same subregister indices are
606   // used for both.
607   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
608   for (int I = 0, E = NumDst; I != E; ++I) {
609     MachineOperand &Dst = MI.getOperand(I);
610     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
611       .addReg(SrcReg, 0, SubRegs[I]);
612 
613     // Make sure the subregister index is valid for the source register.
614     SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
615     if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
616       return false;
617 
618     const TargetRegisterClass *DstRC =
619       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
620     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
621       return false;
622   }
623 
624   MI.eraseFromParent();
625   return true;
626 }
627 
628 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
629   assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
630          MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
631 
632   Register Src0 = MI.getOperand(1).getReg();
633   Register Src1 = MI.getOperand(2).getReg();
634   LLT SrcTy = MRI->getType(Src0);
635   const unsigned SrcSize = SrcTy.getSizeInBits();
636 
637   // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
638   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
639     return selectG_MERGE_VALUES(MI);
640   }
641 
642   // Selection logic below is for V2S16 only.
643   // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
644   Register Dst = MI.getOperand(0).getReg();
645   if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
646       (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
647        SrcTy != LLT::scalar(32)))
648     return selectImpl(MI, *CoverageInfo);
649 
650   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
651   if (DstBank->getID() == AMDGPU::AGPRRegBankID)
652     return false;
653 
654   assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
655          DstBank->getID() == AMDGPU::VGPRRegBankID);
656   const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
657 
658   const DebugLoc &DL = MI.getDebugLoc();
659   MachineBasicBlock *BB = MI.getParent();
660 
661   // First, before trying TableGen patterns, check if both sources are
662   // constants. In those cases, we can trivially compute the final constant
663   // and emit a simple move.
664   auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
665   if (ConstSrc1) {
666     auto ConstSrc0 =
667         getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
668     if (ConstSrc0) {
669       const int64_t K0 = ConstSrc0->Value.getSExtValue();
670       const int64_t K1 = ConstSrc1->Value.getSExtValue();
671       uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
672       uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
673       uint32_t Imm = Lo16 | (Hi16 << 16);
674 
675       // VALU
676       if (IsVector) {
677         BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
678         MI.eraseFromParent();
679         return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
680       }
681 
682       // SALU
683       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
684       MI.eraseFromParent();
685       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
686     }
687   }
688 
689   // Now try TableGen patterns.
690   if (selectImpl(MI, *CoverageInfo))
691     return true;
692 
693   // TODO: This should probably be a combine somewhere
694   // (build_vector $src0, undef)  -> copy $src0
695   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
696   if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
697     MI.setDesc(TII.get(AMDGPU::COPY));
698     MI.removeOperand(2);
699     const auto &RC =
700         IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
701     return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
702            RBI.constrainGenericRegister(Src0, RC, *MRI);
703   }
704 
705   // TODO: Can be improved?
706   if (IsVector) {
707     Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
708     auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
709                    .addImm(0xFFFF)
710                    .addReg(Src0);
711     if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
712       return false;
713 
714     MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
715               .addReg(Src1)
716               .addImm(16)
717               .addReg(TmpReg);
718     if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
719       return false;
720 
721     MI.eraseFromParent();
722     return true;
723   }
724 
725   Register ShiftSrc0;
726   Register ShiftSrc1;
727 
728   // With multiple uses of the shift, this will duplicate the shift and
729   // increase register pressure.
730   //
731   // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
732   //  => (S_PACK_HH_B32_B16 $src0, $src1)
733   // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
734   //  => (S_PACK_HL_B32_B16 $src0, $src1)
735   // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
736   //  => (S_PACK_LH_B32_B16 $src0, $src1)
737   // (build_vector $src0, $src1)
738   //  => (S_PACK_LL_B32_B16 $src0, $src1)
739 
740   bool Shift0 = mi_match(
741       Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
742 
743   bool Shift1 = mi_match(
744       Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
745 
746   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
747   if (Shift0 && Shift1) {
748     Opc = AMDGPU::S_PACK_HH_B32_B16;
749     MI.getOperand(1).setReg(ShiftSrc0);
750     MI.getOperand(2).setReg(ShiftSrc1);
751   } else if (Shift1) {
752     Opc = AMDGPU::S_PACK_LH_B32_B16;
753     MI.getOperand(2).setReg(ShiftSrc1);
754   } else if (Shift0) {
755     auto ConstSrc1 =
756         getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
757     if (ConstSrc1 && ConstSrc1->Value == 0) {
758       // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
759       auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
760                      .addReg(ShiftSrc0)
761                      .addImm(16)
762                      .setOperandDead(3); // Dead scc
763 
764       MI.eraseFromParent();
765       return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
766     }
767     if (STI.hasSPackHL()) {
768       Opc = AMDGPU::S_PACK_HL_B32_B16;
769       MI.getOperand(1).setReg(ShiftSrc0);
770     }
771   }
772 
773   MI.setDesc(TII.get(Opc));
774   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
775 }
776 
/// Select G_PTR_ADD by delegating to selectG_ADD_SUB, which treats it as an
/// addition (its opcode is not G_SUB).
bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}
780 
781 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
782   const MachineOperand &MO = I.getOperand(0);
783 
784   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
785   // regbank check here is to know why getConstrainedRegClassForOperand failed.
786   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
787   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
788       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
789     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
790     return true;
791   }
792 
793   return false;
794 }
795 
796 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
797   MachineBasicBlock *BB = I.getParent();
798 
799   Register DstReg = I.getOperand(0).getReg();
800   Register Src0Reg = I.getOperand(1).getReg();
801   Register Src1Reg = I.getOperand(2).getReg();
802   LLT Src1Ty = MRI->getType(Src1Reg);
803 
804   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
805   unsigned InsSize = Src1Ty.getSizeInBits();
806 
807   int64_t Offset = I.getOperand(3).getImm();
808 
809   // FIXME: These cases should have been illegal and unnecessary to check here.
810   if (Offset % 32 != 0 || InsSize % 32 != 0)
811     return false;
812 
813   // Currently not handled by getSubRegFromChannel.
814   if (InsSize > 128)
815     return false;
816 
817   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
818   if (SubReg == AMDGPU::NoSubRegister)
819     return false;
820 
821   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
822   const TargetRegisterClass *DstRC =
823       TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
824   if (!DstRC)
825     return false;
826 
827   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
828   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
829   const TargetRegisterClass *Src0RC =
830       TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
831   const TargetRegisterClass *Src1RC =
832       TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
833 
834   // Deal with weird cases where the class only partially supports the subreg
835   // index.
836   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
837   if (!Src0RC || !Src1RC)
838     return false;
839 
840   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
841       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
842       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
843     return false;
844 
845   const DebugLoc &DL = I.getDebugLoc();
846   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
847     .addReg(Src0Reg)
848     .addReg(Src1Reg)
849     .addImm(SubReg);
850 
851   I.eraseFromParent();
852   return true;
853 }
854 
855 bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
856   Register DstReg = MI.getOperand(0).getReg();
857   Register SrcReg = MI.getOperand(1).getReg();
858   Register OffsetReg = MI.getOperand(2).getReg();
859   Register WidthReg = MI.getOperand(3).getReg();
860 
861   assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
862          "scalar BFX instructions are expanded in regbankselect");
863   assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
864          "64-bit vector BFX instructions are expanded in regbankselect");
865 
866   const DebugLoc &DL = MI.getDebugLoc();
867   MachineBasicBlock *MBB = MI.getParent();
868 
869   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
870   unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
871   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
872                  .addReg(SrcReg)
873                  .addReg(OffsetReg)
874                  .addReg(WidthReg);
875   MI.eraseFromParent();
876   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
877 }
878 
879 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
880   if (STI.getLDSBankCount() != 16)
881     return selectImpl(MI, *CoverageInfo);
882 
883   Register Dst = MI.getOperand(0).getReg();
884   Register Src0 = MI.getOperand(2).getReg();
885   Register M0Val = MI.getOperand(6).getReg();
886   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
887       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
888       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
889     return false;
890 
891   // This requires 2 instructions. It is possible to write a pattern to support
892   // this, but the generated isel emitter doesn't correctly deal with multiple
893   // output instructions using the same physical register input. The copy to m0
894   // is incorrectly placed before the second instruction.
895   //
896   // TODO: Match source modifiers.
897 
898   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
899   const DebugLoc &DL = MI.getDebugLoc();
900   MachineBasicBlock *MBB = MI.getParent();
901 
902   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
903     .addReg(M0Val);
904   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
905     .addImm(2)
906     .addImm(MI.getOperand(4).getImm())  // $attr
907     .addImm(MI.getOperand(3).getImm()); // $attrchan
908 
909   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
910     .addImm(0)                          // $src0_modifiers
911     .addReg(Src0)                       // $src0
912     .addImm(MI.getOperand(4).getImm())  // $attr
913     .addImm(MI.getOperand(3).getImm())  // $attrchan
914     .addImm(0)                          // $src2_modifiers
915     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
916     .addImm(MI.getOperand(5).getImm())  // $high
917     .addImm(0)                          // $clamp
918     .addImm(0);                         // $omod
919 
920   MI.eraseFromParent();
921   return true;
922 }
923 
924 // Writelane is special in that it can use SGPR and M0 (which would normally
925 // count as using the constant bus twice - but in this case it is allowed since
926 // the lane selector doesn't count as a use of the constant bus). However, it is
927 // still required to abide by the 1 SGPR rule. Fix this up if we might have
928 // multiple SGPRs.
929 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
930   // With a constant bus limit of at least 2, there's no issue.
931   if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
932     return selectImpl(MI, *CoverageInfo);
933 
934   MachineBasicBlock *MBB = MI.getParent();
935   const DebugLoc &DL = MI.getDebugLoc();
936   Register VDst = MI.getOperand(0).getReg();
937   Register Val = MI.getOperand(2).getReg();
938   Register LaneSelect = MI.getOperand(3).getReg();
939   Register VDstIn = MI.getOperand(4).getReg();
940 
941   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
942 
943   std::optional<ValueAndVReg> ConstSelect =
944       getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
945   if (ConstSelect) {
946     // The selector has to be an inline immediate, so we can use whatever for
947     // the other operands.
948     MIB.addReg(Val);
949     MIB.addImm(ConstSelect->Value.getSExtValue() &
950                maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
951   } else {
952     std::optional<ValueAndVReg> ConstVal =
953         getIConstantVRegValWithLookThrough(Val, *MRI);
954 
955     // If the value written is an inline immediate, we can get away without a
956     // copy to m0.
957     if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
958                                                  STI.hasInv2PiInlineImm())) {
959       MIB.addImm(ConstVal->Value.getSExtValue());
960       MIB.addReg(LaneSelect);
961     } else {
962       MIB.addReg(Val);
963 
964       // If the lane selector was originally in a VGPR and copied with
965       // readfirstlane, there's a hazard to read the same SGPR from the
966       // VALU. Constrain to a different SGPR to help avoid needing a nop later.
967       RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
968 
969       BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
970         .addReg(LaneSelect);
971       MIB.addReg(AMDGPU::M0);
972     }
973   }
974 
975   MIB.addReg(VDstIn);
976 
977   MI.eraseFromParent();
978   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
979 }
980 
981 // We need to handle this here because tablegen doesn't support matching
982 // instructions with multiple outputs.
983 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
984   Register Dst0 = MI.getOperand(0).getReg();
985   Register Dst1 = MI.getOperand(1).getReg();
986 
987   LLT Ty = MRI->getType(Dst0);
988   unsigned Opc;
989   if (Ty == LLT::scalar(32))
990     Opc = AMDGPU::V_DIV_SCALE_F32_e64;
991   else if (Ty == LLT::scalar(64))
992     Opc = AMDGPU::V_DIV_SCALE_F64_e64;
993   else
994     return false;
995 
996   // TODO: Match source modifiers.
997 
998   const DebugLoc &DL = MI.getDebugLoc();
999   MachineBasicBlock *MBB = MI.getParent();
1000 
1001   Register Numer = MI.getOperand(3).getReg();
1002   Register Denom = MI.getOperand(4).getReg();
1003   unsigned ChooseDenom = MI.getOperand(5).getImm();
1004 
1005   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1006 
1007   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1008     .addDef(Dst1)
1009     .addImm(0)     // $src0_modifiers
1010     .addUse(Src0)  // $src0
1011     .addImm(0)     // $src1_modifiers
1012     .addUse(Denom) // $src1
1013     .addImm(0)     // $src2_modifiers
1014     .addUse(Numer) // $src2
1015     .addImm(0)     // $clamp
1016     .addImm(0);    // $omod
1017 
1018   MI.eraseFromParent();
1019   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1020 }
1021 
1022 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1023   unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1024   switch (IntrinsicID) {
1025   case Intrinsic::amdgcn_if_break: {
1026     MachineBasicBlock *BB = I.getParent();
1027 
1028     // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1029     // SelectionDAG uses for wave32 vs wave64.
1030     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1031       .add(I.getOperand(0))
1032       .add(I.getOperand(2))
1033       .add(I.getOperand(3));
1034 
1035     Register DstReg = I.getOperand(0).getReg();
1036     Register Src0Reg = I.getOperand(2).getReg();
1037     Register Src1Reg = I.getOperand(3).getReg();
1038 
1039     I.eraseFromParent();
1040 
1041     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1042       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1043 
1044     return true;
1045   }
1046   case Intrinsic::amdgcn_interp_p1_f16:
1047     return selectInterpP1F16(I);
1048   case Intrinsic::amdgcn_wqm:
1049     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1050   case Intrinsic::amdgcn_softwqm:
1051     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1052   case Intrinsic::amdgcn_strict_wwm:
1053   case Intrinsic::amdgcn_wwm:
1054     return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1055   case Intrinsic::amdgcn_strict_wqm:
1056     return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1057   case Intrinsic::amdgcn_writelane:
1058     return selectWritelane(I);
1059   case Intrinsic::amdgcn_div_scale:
1060     return selectDivScale(I);
1061   case Intrinsic::amdgcn_icmp:
1062   case Intrinsic::amdgcn_fcmp:
1063     if (selectImpl(I, *CoverageInfo))
1064       return true;
1065     return selectIntrinsicCmp(I);
1066   case Intrinsic::amdgcn_ballot:
1067     return selectBallot(I);
1068   case Intrinsic::amdgcn_inverse_ballot:
1069     return selectInverseBallot(I);
1070   case Intrinsic::amdgcn_reloc_constant:
1071     return selectRelocConstant(I);
1072   case Intrinsic::amdgcn_groupstaticsize:
1073     return selectGroupStaticSize(I);
1074   case Intrinsic::returnaddress:
1075     return selectReturnAddress(I);
1076   case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1077   case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1078   case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1079   case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1080   case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1081   case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1082   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1083   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1084   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1085   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1086   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1087   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1088   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1089   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1090     return selectSMFMACIntrin(I);
1091   default:
1092     return selectImpl(I, *CoverageInfo);
1093   }
1094 }
1095 
1096 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1097                           const GCNSubtarget &ST) {
1098   if (Size != 16 && Size != 32 && Size != 64)
1099     return -1;
1100 
1101   if (Size == 16 && !ST.has16BitInsts())
1102     return -1;
1103 
1104   const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
1105                           unsigned S64Opc) {
1106     if (Size == 16)
1107       return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
1108     if (Size == 32)
1109       return S32Opc;
1110     return S64Opc;
1111   };
1112 
1113   switch (P) {
1114   default:
1115     llvm_unreachable("Unknown condition code!");
1116   case CmpInst::ICMP_NE:
1117     return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1118                   AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
1119   case CmpInst::ICMP_EQ:
1120     return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1121                   AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
1122   case CmpInst::ICMP_SGT:
1123     return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1124                   AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
1125   case CmpInst::ICMP_SGE:
1126     return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1127                   AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
1128   case CmpInst::ICMP_SLT:
1129     return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1130                   AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
1131   case CmpInst::ICMP_SLE:
1132     return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1133                   AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
1134   case CmpInst::ICMP_UGT:
1135     return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1136                   AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
1137   case CmpInst::ICMP_UGE:
1138     return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1139                   AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
1140   case CmpInst::ICMP_ULT:
1141     return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1142                   AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
1143   case CmpInst::ICMP_ULE:
1144     return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1145                   AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);
1146 
1147   case CmpInst::FCMP_OEQ:
1148     return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1149                   AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
1150   case CmpInst::FCMP_OGT:
1151     return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1152                   AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
1153   case CmpInst::FCMP_OGE:
1154     return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1155                   AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
1156   case CmpInst::FCMP_OLT:
1157     return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1158                   AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
1159   case CmpInst::FCMP_OLE:
1160     return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1161                   AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
1162   case CmpInst::FCMP_ONE:
1163     return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1164                   AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1165   case CmpInst::FCMP_ORD:
1166     return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1167                   AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
1168   case CmpInst::FCMP_UNO:
1169     return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1170                   AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
1171   case CmpInst::FCMP_UEQ:
1172     return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1173                   AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
1174   case CmpInst::FCMP_UGT:
1175     return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1176                   AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
1177   case CmpInst::FCMP_UGE:
1178     return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1179                   AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
1180   case CmpInst::FCMP_ULT:
1181     return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1182                   AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
1183   case CmpInst::FCMP_ULE:
1184     return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1185                   AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
1186   case CmpInst::FCMP_UNE:
1187     return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1188                   AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1189   case CmpInst::FCMP_TRUE:
1190     return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1191                   AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
1192   case CmpInst::FCMP_FALSE:
1193     return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1194                   AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
1195   }
1196 }
1197 
1198 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1199                                               unsigned Size) const {
1200   if (Size == 64) {
1201     if (!STI.hasScalarCompareEq64())
1202       return -1;
1203 
1204     switch (P) {
1205     case CmpInst::ICMP_NE:
1206       return AMDGPU::S_CMP_LG_U64;
1207     case CmpInst::ICMP_EQ:
1208       return AMDGPU::S_CMP_EQ_U64;
1209     default:
1210       return -1;
1211     }
1212   }
1213 
1214   if (Size == 32) {
1215     switch (P) {
1216     case CmpInst::ICMP_NE:
1217       return AMDGPU::S_CMP_LG_U32;
1218     case CmpInst::ICMP_EQ:
1219       return AMDGPU::S_CMP_EQ_U32;
1220     case CmpInst::ICMP_SGT:
1221       return AMDGPU::S_CMP_GT_I32;
1222     case CmpInst::ICMP_SGE:
1223       return AMDGPU::S_CMP_GE_I32;
1224     case CmpInst::ICMP_SLT:
1225       return AMDGPU::S_CMP_LT_I32;
1226     case CmpInst::ICMP_SLE:
1227       return AMDGPU::S_CMP_LE_I32;
1228     case CmpInst::ICMP_UGT:
1229       return AMDGPU::S_CMP_GT_U32;
1230     case CmpInst::ICMP_UGE:
1231       return AMDGPU::S_CMP_GE_U32;
1232     case CmpInst::ICMP_ULT:
1233       return AMDGPU::S_CMP_LT_U32;
1234     case CmpInst::ICMP_ULE:
1235       return AMDGPU::S_CMP_LE_U32;
1236     case CmpInst::FCMP_OEQ:
1237       return AMDGPU::S_CMP_EQ_F32;
1238     case CmpInst::FCMP_OGT:
1239       return AMDGPU::S_CMP_GT_F32;
1240     case CmpInst::FCMP_OGE:
1241       return AMDGPU::S_CMP_GE_F32;
1242     case CmpInst::FCMP_OLT:
1243       return AMDGPU::S_CMP_LT_F32;
1244     case CmpInst::FCMP_OLE:
1245       return AMDGPU::S_CMP_LE_F32;
1246     case CmpInst::FCMP_ONE:
1247       return AMDGPU::S_CMP_LG_F32;
1248     case CmpInst::FCMP_ORD:
1249       return AMDGPU::S_CMP_O_F32;
1250     case CmpInst::FCMP_UNO:
1251       return AMDGPU::S_CMP_U_F32;
1252     case CmpInst::FCMP_UEQ:
1253       return AMDGPU::S_CMP_NLG_F32;
1254     case CmpInst::FCMP_UGT:
1255       return AMDGPU::S_CMP_NLE_F32;
1256     case CmpInst::FCMP_UGE:
1257       return AMDGPU::S_CMP_NLT_F32;
1258     case CmpInst::FCMP_ULT:
1259       return AMDGPU::S_CMP_NGE_F32;
1260     case CmpInst::FCMP_ULE:
1261       return AMDGPU::S_CMP_NGT_F32;
1262     case CmpInst::FCMP_UNE:
1263       return AMDGPU::S_CMP_NEQ_F32;
1264     default:
1265       llvm_unreachable("Unknown condition code!");
1266     }
1267   }
1268 
1269   if (Size == 16) {
1270     if (!STI.hasSALUFloatInsts())
1271       return -1;
1272 
1273     switch (P) {
1274     case CmpInst::FCMP_OEQ:
1275       return AMDGPU::S_CMP_EQ_F16;
1276     case CmpInst::FCMP_OGT:
1277       return AMDGPU::S_CMP_GT_F16;
1278     case CmpInst::FCMP_OGE:
1279       return AMDGPU::S_CMP_GE_F16;
1280     case CmpInst::FCMP_OLT:
1281       return AMDGPU::S_CMP_LT_F16;
1282     case CmpInst::FCMP_OLE:
1283       return AMDGPU::S_CMP_LE_F16;
1284     case CmpInst::FCMP_ONE:
1285       return AMDGPU::S_CMP_LG_F16;
1286     case CmpInst::FCMP_ORD:
1287       return AMDGPU::S_CMP_O_F16;
1288     case CmpInst::FCMP_UNO:
1289       return AMDGPU::S_CMP_U_F16;
1290     case CmpInst::FCMP_UEQ:
1291       return AMDGPU::S_CMP_NLG_F16;
1292     case CmpInst::FCMP_UGT:
1293       return AMDGPU::S_CMP_NLE_F16;
1294     case CmpInst::FCMP_UGE:
1295       return AMDGPU::S_CMP_NLT_F16;
1296     case CmpInst::FCMP_ULT:
1297       return AMDGPU::S_CMP_NGE_F16;
1298     case CmpInst::FCMP_ULE:
1299       return AMDGPU::S_CMP_NGT_F16;
1300     case CmpInst::FCMP_UNE:
1301       return AMDGPU::S_CMP_NEQ_F16;
1302     default:
1303       llvm_unreachable("Unknown condition code!");
1304     }
1305   }
1306 
1307   return -1;
1308 }
1309 
1310 bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1311 
1312   MachineBasicBlock *BB = I.getParent();
1313   const DebugLoc &DL = I.getDebugLoc();
1314 
1315   Register SrcReg = I.getOperand(2).getReg();
1316   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1317 
1318   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1319 
1320   Register CCReg = I.getOperand(0).getReg();
1321   if (!isVCC(CCReg, *MRI)) {
1322     int Opcode = getS_CMPOpcode(Pred, Size);
1323     if (Opcode == -1)
1324       return false;
1325     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1326             .add(I.getOperand(2))
1327             .add(I.getOperand(3));
1328     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1329       .addReg(AMDGPU::SCC);
1330     bool Ret =
1331         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1332         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1333     I.eraseFromParent();
1334     return Ret;
1335   }
1336 
1337   if (I.getOpcode() == AMDGPU::G_FCMP)
1338     return false;
1339 
1340   int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1341   if (Opcode == -1)
1342     return false;
1343 
1344   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1345             I.getOperand(0).getReg())
1346             .add(I.getOperand(2))
1347             .add(I.getOperand(3));
1348   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1349                                *TRI.getBoolRC(), *MRI);
1350   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1351   I.eraseFromParent();
1352   return Ret;
1353 }
1354 
1355 bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1356   Register Dst = I.getOperand(0).getReg();
1357   if (isVCC(Dst, *MRI))
1358     return false;
1359 
1360   LLT DstTy = MRI->getType(Dst);
1361   if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1362     return false;
1363 
1364   MachineBasicBlock *BB = I.getParent();
1365   const DebugLoc &DL = I.getDebugLoc();
1366   Register SrcReg = I.getOperand(2).getReg();
1367   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1368 
1369   // i1 inputs are not supported in GlobalISel.
1370   if (Size == 1)
1371     return false;
1372 
1373   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1374   if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1375     BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1376     I.eraseFromParent();
1377     return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1378   }
1379 
1380   const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1381   if (Opcode == -1)
1382     return false;
1383 
1384   MachineInstrBuilder SelectedMI;
1385   MachineOperand &LHS = I.getOperand(2);
1386   MachineOperand &RHS = I.getOperand(3);
1387   auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
1388   auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
1389   Register Src0Reg =
1390       copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1391   Register Src1Reg =
1392       copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1393   SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1394   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1395     SelectedMI.addImm(Src0Mods);
1396   SelectedMI.addReg(Src0Reg);
1397   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1398     SelectedMI.addImm(Src1Mods);
1399   SelectedMI.addReg(Src1Reg);
1400   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1401     SelectedMI.addImm(0); // clamp
1402   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1403     SelectedMI.addImm(0); // op_sel
1404 
1405   RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1406   if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1407     return false;
1408 
1409   I.eraseFromParent();
1410   return true;
1411 }
1412 
1413 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1414   MachineBasicBlock *BB = I.getParent();
1415   const DebugLoc &DL = I.getDebugLoc();
1416   Register DstReg = I.getOperand(0).getReg();
1417   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1418   const bool Is64 = Size == 64;
1419   const bool IsWave32 = (STI.getWavefrontSize() == 32);
1420 
1421   // In the common case, the return type matches the wave size.
1422   // However we also support emitting i64 ballots in wave32 mode.
1423   if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1424     return false;
1425 
1426   std::optional<ValueAndVReg> Arg =
1427       getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1428 
1429   const auto BuildCopy = [&](Register SrcReg) {
1430     if (Size == STI.getWavefrontSize()) {
1431       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1432           .addReg(SrcReg);
1433       return;
1434     }
1435 
1436     // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
1437     Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1438     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1439     BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1440         .addReg(SrcReg)
1441         .addImm(AMDGPU::sub0)
1442         .addReg(HiReg)
1443         .addImm(AMDGPU::sub1);
1444   };
1445 
1446   if (Arg) {
1447     const int64_t Value = Arg->Value.getSExtValue();
1448     if (Value == 0) {
1449       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1450       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1451     } else if (Value == -1) // all ones
1452       BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1453     else
1454       return false;
1455   } else
1456     BuildCopy(I.getOperand(2).getReg());
1457 
1458   I.eraseFromParent();
1459   return true;
1460 }
1461 
1462 bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
1463   MachineBasicBlock *BB = I.getParent();
1464   const DebugLoc &DL = I.getDebugLoc();
1465   const Register DstReg = I.getOperand(0).getReg();
1466   const Register MaskReg = I.getOperand(2).getReg();
1467 
1468   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg);
1469   I.eraseFromParent();
1470   return true;
1471 }
1472 
1473 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1474   Register DstReg = I.getOperand(0).getReg();
1475   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1476   const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1477   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1478     return false;
1479 
1480   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1481 
1482   Module *M = MF->getFunction().getParent();
1483   const MDNode *Metadata = I.getOperand(2).getMetadata();
1484   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1485   auto RelocSymbol = cast<GlobalVariable>(
1486     M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1487 
1488   MachineBasicBlock *BB = I.getParent();
1489   BuildMI(*BB, &I, I.getDebugLoc(),
1490           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1491     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1492 
1493   I.eraseFromParent();
1494   return true;
1495 }
1496 
1497 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1498   Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1499 
1500   Register DstReg = I.getOperand(0).getReg();
1501   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1502   unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1503     AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1504 
1505   MachineBasicBlock *MBB = I.getParent();
1506   const DebugLoc &DL = I.getDebugLoc();
1507 
1508   auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1509 
1510   if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1511     const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1512     MIB.addImm(MFI->getLDSSize());
1513   } else {
1514     Module *M = MF->getFunction().getParent();
1515     const GlobalValue *GV
1516       = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1517     MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1518   }
1519 
1520   I.eraseFromParent();
1521   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1522 }
1523 
1524 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1525   MachineBasicBlock *MBB = I.getParent();
1526   MachineFunction &MF = *MBB->getParent();
1527   const DebugLoc &DL = I.getDebugLoc();
1528 
1529   MachineOperand &Dst = I.getOperand(0);
1530   Register DstReg = Dst.getReg();
1531   unsigned Depth = I.getOperand(2).getImm();
1532 
1533   const TargetRegisterClass *RC
1534     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1535   if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1536       !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1537     return false;
1538 
1539   // Check for kernel and shader functions
1540   if (Depth != 0 ||
1541       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1542     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1543       .addImm(0);
1544     I.eraseFromParent();
1545     return true;
1546   }
1547 
1548   MachineFrameInfo &MFI = MF.getFrameInfo();
1549   // There is a call to @llvm.returnaddress in this function
1550   MFI.setReturnAddressIsTaken(true);
1551 
1552   // Get the return address reg and mark it as an implicit live-in
1553   Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1554   Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1555                                              AMDGPU::SReg_64RegClass, DL);
1556   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1557     .addReg(LiveIn);
1558   I.eraseFromParent();
1559   return true;
1560 }
1561 
1562 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1563   // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1564   // SelectionDAG uses for wave32 vs wave64.
1565   MachineBasicBlock *BB = MI.getParent();
1566   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1567       .add(MI.getOperand(1));
1568 
1569   Register Reg = MI.getOperand(1).getReg();
1570   MI.eraseFromParent();
1571 
1572   if (!MRI->getRegClassOrNull(Reg))
1573     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1574   return true;
1575 }
1576 
1577 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1578   MachineInstr &MI, Intrinsic::ID IntrID) const {
1579   MachineBasicBlock *MBB = MI.getParent();
1580   MachineFunction *MF = MBB->getParent();
1581   const DebugLoc &DL = MI.getDebugLoc();
1582 
1583   unsigned IndexOperand = MI.getOperand(7).getImm();
1584   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1585   bool WaveDone = MI.getOperand(9).getImm() != 0;
1586 
1587   if (WaveDone && !WaveRelease)
1588     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1589 
1590   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1591   IndexOperand &= ~0x3f;
1592   unsigned CountDw = 0;
1593 
1594   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1595     CountDw = (IndexOperand >> 24) & 0xf;
1596     IndexOperand &= ~(0xf << 24);
1597 
1598     if (CountDw < 1 || CountDw > 4) {
1599       report_fatal_error(
1600         "ds_ordered_count: dword count must be between 1 and 4");
1601     }
1602   }
1603 
1604   if (IndexOperand)
1605     report_fatal_error("ds_ordered_count: bad index operand");
1606 
1607   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1608   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1609 
1610   unsigned Offset0 = OrderedCountIndex << 2;
1611   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1612 
1613   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1614     Offset1 |= (CountDw - 1) << 6;
1615 
1616   if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1617     Offset1 |= ShaderType << 2;
1618 
1619   unsigned Offset = Offset0 | (Offset1 << 8);
1620 
1621   Register M0Val = MI.getOperand(2).getReg();
1622   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1623     .addReg(M0Val);
1624 
1625   Register DstReg = MI.getOperand(0).getReg();
1626   Register ValReg = MI.getOperand(3).getReg();
1627   MachineInstrBuilder DS =
1628     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1629       .addReg(ValReg)
1630       .addImm(Offset)
1631       .cloneMemRefs(MI);
1632 
1633   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1634     return false;
1635 
1636   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1637   MI.eraseFromParent();
1638   return Ret;
1639 }
1640 
1641 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1642   switch (IntrID) {
1643   case Intrinsic::amdgcn_ds_gws_init:
1644     return AMDGPU::DS_GWS_INIT;
1645   case Intrinsic::amdgcn_ds_gws_barrier:
1646     return AMDGPU::DS_GWS_BARRIER;
1647   case Intrinsic::amdgcn_ds_gws_sema_v:
1648     return AMDGPU::DS_GWS_SEMA_V;
1649   case Intrinsic::amdgcn_ds_gws_sema_br:
1650     return AMDGPU::DS_GWS_SEMA_BR;
1651   case Intrinsic::amdgcn_ds_gws_sema_p:
1652     return AMDGPU::DS_GWS_SEMA_P;
1653   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1654     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1655   default:
1656     llvm_unreachable("not a gws intrinsic");
1657   }
1658 }
1659 
/// Manually select one of the llvm.amdgcn.ds.gws.* intrinsics into the
/// matching DS_GWS_* instruction (see gwsIntrinToOpcode).
///
/// The offset operand is split into a variable part, which is shifted left by
/// 16 and written to M0, and a constant part, which becomes the immediate
/// offset of the instruction. A purely constant offset writes 0 to M0.
///
/// \returns false if the subtarget lacks GWS (or lacks sema_release_all for
/// that intrinsic), if the offset is not on the SGPR bank, or if a register
/// could not be constrained.
bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
                        !STI.hasGWSSemaReleaseAll()))
    return false;

  // Operand layout: intrinsic ID, vsrc, offset. The vsrc operand is only
  // present in the three-operand form; the offset is always last.
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  // Assigned on both branches of the constant / non-constant split below.
  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.

    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    // Split the offset into a variable base and a constant part.
    std::tie(BaseOffset, ImmOffset) =
        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      // Re-point the existing readfirstlane at the variable base and use its
      // result as the base from here on.
      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    // M0 = BaseOffset << 16, so the variable base lands in the M0 bits the
    // hardware reads (see the M0[21:16] note below).
    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16)
      .setOperandDead(3); // Dead scc

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();
    MIB.addReg(VSrc);

    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .cloneMemRefs(MI);

  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);

  MI.eraseFromParent();
  return true;
}
1750 
1751 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1752                                                       bool IsAppend) const {
1753   Register PtrBase = MI.getOperand(2).getReg();
1754   LLT PtrTy = MRI->getType(PtrBase);
1755   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1756 
1757   unsigned Offset;
1758   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1759 
1760   // TODO: Should this try to look through readfirstlane like GWS?
1761   if (!isDSOffsetLegal(PtrBase, Offset)) {
1762     PtrBase = MI.getOperand(2).getReg();
1763     Offset = 0;
1764   }
1765 
1766   MachineBasicBlock *MBB = MI.getParent();
1767   const DebugLoc &DL = MI.getDebugLoc();
1768   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1769 
1770   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1771     .addReg(PtrBase);
1772   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1773     return false;
1774 
1775   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1776     .addImm(Offset)
1777     .addImm(IsGDS ? -1 : 0)
1778     .cloneMemRefs(MI);
1779   MI.eraseFromParent();
1780   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1781 }
1782 
1783 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1784   if (TM.getOptLevel() > CodeGenOptLevel::None) {
1785     unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1786     if (WGSize <= STI.getWavefrontSize()) {
1787       MachineBasicBlock *MBB = MI.getParent();
1788       const DebugLoc &DL = MI.getDebugLoc();
1789       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1790       MI.eraseFromParent();
1791       return true;
1792     }
1793   }
1794 
1795   // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1796   if (STI.hasSplitBarriers()) {
1797     MachineBasicBlock *MBB = MI.getParent();
1798     const DebugLoc &DL = MI.getDebugLoc();
1799     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1800         .addImm(AMDGPU::Barrier::WORKGROUP);
1801     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1802         .addImm(AMDGPU::Barrier::WORKGROUP);
1803     MI.eraseFromParent();
1804     return true;
1805   }
1806 
1807   return selectImpl(MI, *CoverageInfo);
1808 }
1809 
/// Decode a texfailctrl immediate.
///
/// \param TexFailCtrl the raw control value.
/// \param [out] TFE set to whether bit 0 is set.
/// \param [out] LWE set to whether bit 1 is set.
/// \param [in,out] IsTexFail set to true if \p TexFailCtrl is non-zero;
///        otherwise left untouched.
/// \returns true if no bits other than bit 0 and bit 1 are set.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  const uint64_t TFEBit = 0x1;
  const uint64_t LWEBit = 0x2;

  if (TexFailCtrl != 0)
    IsTexFail = true;

  TFE = (TexFailCtrl & TFEBit) != 0;
  LWE = (TexFailCtrl & LWEBit) != 0;

  const uint64_t UnknownBits = TexFailCtrl & ~(TFEBit | LWEBit);
  return UnknownBits == 0;
}
1822 
/// Manually select an image-dimension intrinsic \p MI into a MIMG instruction,
/// using the table entry \p Intr to locate the intrinsic's arguments.
///
/// The MIMG opcode is looked up from the (possibly G16-remapped) base opcode,
/// the encoding family chosen for the subtarget, the number of vdata dwords
/// and the number of vaddr dwords. Operands are then appended in the order the
/// selected instruction expects.
///
/// \returns false when the intrinsic cannot be selected: unknown texfailctrl
/// bits, A16 requested without the G16 flag on a subtarget lacking G16,
/// unsupported cache-policy bits, NSA needed on a subtarget without the NSA
/// encoding, no matching MIMG opcode, or TFE requested on a subtarget with
/// GFX90A instructions.
bool AMDGPUInstructionSelector::selectImageIntrinsic(
  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
    AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  unsigned IntrOpcode = Intr->BaseOpcode;
  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);

  // Indices stored in Intr are relative to this operand index, which skips the
  // explicit defs and one further operand (presumably the intrinsic ID --
  // TODO confirm).
  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;

  Register VDataIn, VDataOut;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
               MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;

  // Only sampler opcodes carry an unorm argument; everything else uses
  // unorm = true.
  bool Unorm;
  if (!BaseOpcode->Sampler)
    Unorm = true;
  else
    Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
                    TFE, LWE, IsTexFail))
    return false;

  // The operand following the intrinsic's own arguments holds the flags:
  // bit 0 = A16, bit 1 = G16.
  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients if subtarget doesn't support G16
  if (IsA16 && !STI.hasG16() && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    // Atomics have both a result and a data input; the dmask is derived from
    // the data width rather than read from an operand.
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);

    if (BaseOpcode->Store) {
      // Stores: the dword count comes from the type of the stored value.
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else {
      // Loads: one dword per enabled lane, halved (rounding up) for D16 on
      // subtargets without unpacked D16 memory instructions.
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      if (IsD16 && !STI.hasUnpackedD16VMem())
        NumVDataDwords = (DMaskLanes + 1) / 2;
    }
  }

  // Set G16 opcode
  if (Subtarget->hasG16() && IsG16) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
    assert(G16MappingInfo);
    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
  }

  // TODO: Check this in verifier.
  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
  if (BaseOpcode->Atomic)
    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
  // Reject any cache-policy bit outside the set accepted for this generation
  // (plus VOLATILE).
  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
               AMDGPU::CPol::VOLATILE))
    return false;

  // Count the address registers actually present and their total dword size.
  int NumVAddrRegs = 0;
  int NumVAddrDwords = 0;
  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
    // Skip the $noregs and 0s inserted during legalization.
    MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
    if (!AddrOp.isReg())
      continue; // XXX - Break?

    Register Addr = AddrOp.getReg();
    if (!Addr)
      break;

    ++NumVAddrRegs;
    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
  }

  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register
  const bool UseNSA =
      NumVAddrRegs != 1 &&
      (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
                                   : NumVAddrDwords == NumVAddrRegs);
  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
    return false;
  }

  // TFE/LWE results occupy one extra dword of vdata.
  if (IsTexFail)
    ++NumVDataDwords;

  // Pick the MIMG opcode for the newest encoding family the subtarget
  // supports.
  int Opcode = -1;
  if (IsGFX12Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (Subtarget->hasGFX90AInsts()) {
      // With GFX90A instructions there is no fallback to the older encodings.
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
                                     NumVDataDwords, NumVAddrDwords);
      if (Opcode == -1) {
        LLVM_DEBUG(
            dbgs()
            << "requested image instruction is not supported on this GPU\n");
        return false;
      }
    }
    if (Opcode == -1 &&
        STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  if (Opcode == -1)
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
    .cloneMemRefs(MI);

  if (VDataOut) {
    if (BaseOpcode->AtomicX2) {
      // The X2 atomic defines a register twice as wide as the intrinsic's
      // result; the result is copied out of its low half.
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      Register TmpReg = MRI->createVirtualRegister(
        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;

      MIB.addDef(TmpReg);
      if (!MRI->use_empty(VDataOut)) {
        BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
            .addReg(TmpReg, RegState::Kill, SubReg);
      }

    } else {
      MIB.addDef(VDataOut); // vdata output
    }
  }

  if (VDataIn)
    MIB.addReg(VDataIn); // vdata input

  // Address registers, in order.
  for (int I = 0; I != NumVAddrRegs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
    if (SrcOp.isReg()) {
      assert(SrcOp.getReg() != 0);
      MIB.addReg(SrcOp.getReg());
    }
  }

  // Resource descriptor, then sampler (sampler opcodes only).
  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
  if (BaseOpcode->Sampler)
    MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());

  MIB.addImm(DMask); // dmask

  if (IsGFX10Plus)
    MIB.addImm(DimInfo->Encoding);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
    MIB.addImm(Unorm);

  MIB.addImm(CPol);
  MIB.addImm(IsA16 &&  // a16 or r128
             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
  if (IsGFX10Plus)
    MIB.addImm(IsA16 ? -1 : 0);

  // NOTE(review): the TFE rejection below returns after MIB has already been
  // inserted into the block -- confirm the caller tolerates that.
  if (!Subtarget->hasGFX90AInsts()) {
    MIB.addImm(TFE); // tfe
  } else if (TFE) {
    LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
    return false;
  }

  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
    MIB.addImm(LWE); // lwe
  if (!IsGFX10Plus)
    MIB.addImm(DimInfo->DA ? -1 : 0);
  if (BaseOpcode->HasD16)
    MIB.addImm(IsD16 ? -1 : 0);

  if (IsTexFail) {
    // An image load instruction with TFE/LWE only conditionally writes to its
    // result registers. Initialize them to zero so that we always get well
    // defined result values.
    assert(VDataOut && !VDataIn);
    Register Tied = MRI->cloneVirtualRegister(VDataOut);
    Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
      .addImm(0);
    auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
    if (STI.usePRTStrictNull()) {
      // With enable-prt-strict-null enabled, initialize all result registers to
      // zero.
      auto RegSeq =
          BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
      for (auto Sub : Parts)
        RegSeq.addReg(Zero).addImm(Sub);
    } else {
      // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
      // result register.
      Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
      auto RegSeq =
          BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
      for (auto Sub : Parts.drop_back(1))
        RegSeq.addReg(Undef).addImm(Sub);
      RegSeq.addReg(Zero).addImm(Parts.back());
    }
    // Feed the initialized value in as an implicit use tied to the result
    // (operand 0).
    MIB.addReg(Tied, RegState::Implicit);
    MIB->tieOperands(0, MIB->getNumOperands() - 1);
  }

  MI.eraseFromParent();
  // NOTE(review): the result of constrainSelectedInstRegOperands is ignored
  // here.
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
  return true;
}
2092 
2093 // We need to handle this here because tablegen doesn't support matching
2094 // instructions with multiple outputs.
2095 bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2096     MachineInstr &MI) const {
2097   Register Dst0 = MI.getOperand(0).getReg();
2098   Register Dst1 = MI.getOperand(1).getReg();
2099 
2100   const DebugLoc &DL = MI.getDebugLoc();
2101   MachineBasicBlock *MBB = MI.getParent();
2102 
2103   Register Addr = MI.getOperand(3).getReg();
2104   Register Data0 = MI.getOperand(4).getReg();
2105   Register Data1 = MI.getOperand(5).getReg();
2106   unsigned Offset = MI.getOperand(6).getImm();
2107 
2108   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2109                  .addDef(Dst1)
2110                  .addUse(Addr)
2111                  .addUse(Data0)
2112                  .addUse(Data1)
2113                  .addImm(Offset)
2114                  .cloneMemRefs(MI);
2115 
2116   MI.eraseFromParent();
2117   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2118 }
2119 
2120 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2121     MachineInstr &I) const {
2122   unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2123   switch (IntrinsicID) {
2124   case Intrinsic::amdgcn_end_cf:
2125     return selectEndCfIntrinsic(I);
2126   case Intrinsic::amdgcn_ds_ordered_add:
2127   case Intrinsic::amdgcn_ds_ordered_swap:
2128     return selectDSOrderedIntrinsic(I, IntrinsicID);
2129   case Intrinsic::amdgcn_ds_gws_init:
2130   case Intrinsic::amdgcn_ds_gws_barrier:
2131   case Intrinsic::amdgcn_ds_gws_sema_v:
2132   case Intrinsic::amdgcn_ds_gws_sema_br:
2133   case Intrinsic::amdgcn_ds_gws_sema_p:
2134   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2135     return selectDSGWSIntrinsic(I, IntrinsicID);
2136   case Intrinsic::amdgcn_ds_append:
2137     return selectDSAppendConsume(I, true);
2138   case Intrinsic::amdgcn_ds_consume:
2139     return selectDSAppendConsume(I, false);
2140   case Intrinsic::amdgcn_s_barrier:
2141     return selectSBarrier(I);
2142   case Intrinsic::amdgcn_raw_buffer_load_lds:
2143   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2144   case Intrinsic::amdgcn_struct_buffer_load_lds:
2145   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2146     return selectBufferLoadLds(I);
2147   case Intrinsic::amdgcn_global_load_lds:
2148     return selectGlobalLoadLds(I);
2149   case Intrinsic::amdgcn_exp_compr:
2150     if (!STI.hasCompressedExport()) {
2151       Function &F = I.getMF()->getFunction();
2152       DiagnosticInfoUnsupported NoFpRet(
2153           F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2154       F.getContext().diagnose(NoFpRet);
2155       return false;
2156     }
2157     break;
2158   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2159     return selectDSBvhStackIntrinsic(I);
2160   case Intrinsic::amdgcn_s_barrier_init:
2161   case Intrinsic::amdgcn_s_barrier_join:
2162   case Intrinsic::amdgcn_s_wakeup_barrier:
2163   case Intrinsic::amdgcn_s_get_barrier_state:
2164     return selectNamedBarrierInst(I, IntrinsicID);
2165   case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2166   case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
2167     return selectSBarrierSignalIsfirst(I, IntrinsicID);
2168   case Intrinsic::amdgcn_s_barrier_leave:
2169     return selectSBarrierLeave(I);
2170   }
2171   return selectImpl(I, *CoverageInfo);
2172 }
2173 
2174 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2175   if (selectImpl(I, *CoverageInfo))
2176     return true;
2177 
2178   MachineBasicBlock *BB = I.getParent();
2179   const DebugLoc &DL = I.getDebugLoc();
2180 
2181   Register DstReg = I.getOperand(0).getReg();
2182   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2183   assert(Size <= 32 || Size == 64);
2184   const MachineOperand &CCOp = I.getOperand(1);
2185   Register CCReg = CCOp.getReg();
2186   if (!isVCC(CCReg, *MRI)) {
2187     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2188                                          AMDGPU::S_CSELECT_B32;
2189     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2190             .addReg(CCReg);
2191 
2192     // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2193     // bank, because it does not cover the register class that we used to represent
2194     // for it.  So we need to manually set the register class here.
2195     if (!MRI->getRegClassOrNull(CCReg))
2196         MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2197     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2198             .add(I.getOperand(2))
2199             .add(I.getOperand(3));
2200 
2201     bool Ret = false;
2202     Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2203     Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2204     I.eraseFromParent();
2205     return Ret;
2206   }
2207 
2208   // Wide VGPR select should have been split in RegBankSelect.
2209   if (Size > 32)
2210     return false;
2211 
2212   MachineInstr *Select =
2213       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2214               .addImm(0)
2215               .add(I.getOperand(3))
2216               .addImm(0)
2217               .add(I.getOperand(2))
2218               .add(I.getOperand(1));
2219 
2220   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2221   I.eraseFromParent();
2222   return Ret;
2223 }
2224 
2225 static int sizeToSubRegIndex(unsigned Size) {
2226   switch (Size) {
2227   case 32:
2228     return AMDGPU::sub0;
2229   case 64:
2230     return AMDGPU::sub0_sub1;
2231   case 96:
2232     return AMDGPU::sub0_sub1_sub2;
2233   case 128:
2234     return AMDGPU::sub0_sub1_sub2_sub3;
2235   case 256:
2236     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2237   default:
2238     if (Size < 32)
2239       return AMDGPU::sub0;
2240     if (Size > 256)
2241       return -1;
2242     return sizeToSubRegIndex(llvm::bit_ceil(Size));
2243   }
2244 }
2245 
/// Select G_TRUNC.
///
/// - <2 x s32> -> <2 x s16> is expanded into code that packs the low 16 bits
///   of both source elements into one 32-bit register.
/// - Scalar truncates become a COPY, reading a sub-register of the source
///   when the source is wider than 32 bits.
///
/// \returns false if source and destination are on different banks (except
/// for an s1 result), if no register class exists for either size, if a
/// register cannot be constrained, or if the destination is a vector other
/// than the <2 x s16> case above.
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  const LLT S1 = LLT::scalar(1);

  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *DstRB;
  if (DstTy == S1) {
    // This is a special case. We don't treat s1 for legalization artifacts as
    // vcc booleans.
    DstRB = SrcRB;
  } else {
    DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
    if (SrcRB != DstRB)
      return false;
  }

  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;

  unsigned DstSize = DstTy.getSizeInBits();
  unsigned SrcSize = SrcTy.getSizeInBits();

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
  if (!SrcRC || !DstRC)
    return false;

  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
    return false;
  }

  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
    MachineBasicBlock *MBB = I.getParent();
    const DebugLoc &DL = I.getDebugLoc();

    // Split the 64-bit source into its two 32-bit elements.
    Register LoReg = MRI->createVirtualRegister(DstRC);
    Register HiReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
      .addReg(SrcReg, 0, AMDGPU::sub0);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
      .addReg(SrcReg, 0, AMDGPU::sub1);

    if (IsVALU && STI.hasSDWA()) {
      // Write the low 16-bits of the high element into the high 16-bits of the
      // low element.
      MachineInstr *MovSDWA =
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
        .addImm(0)                             // $src0_modifiers
        .addReg(HiReg)                         // $src0
        .addImm(0)                             // $clamp
        .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
        .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
        .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
        .addReg(LoReg, RegState::Implicit);
      // Tie the implicit LoReg use to the result so the untouched low half is
      // preserved.
      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
    } else {
      // No SDWA: compute (Hi << 16) | (Lo & 0xffff) with plain ALU ops.
      Register TmpReg0 = MRI->createVirtualRegister(DstRC);
      Register TmpReg1 = MRI->createVirtualRegister(DstRC);
      Register ImmReg = MRI->createVirtualRegister(DstRC);
      if (IsVALU) {
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
          .addImm(16)
          .addReg(HiReg);
      } else {
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
          .addReg(HiReg)
          .addImm(16)
          .setOperandDead(3); // Dead scc
      }

      unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
      unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
      unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;

      // The 0xffff mask is materialized in a register first.
      BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
        .addImm(0xffff);
      auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
        .addReg(LoReg)
        .addReg(ImmReg);
      auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
        .addReg(TmpReg0)
        .addReg(TmpReg1);

      if (!IsVALU) {
        And.setOperandDead(3); // Dead scc
        Or.setOperandDead(3); // Dead scc
      }
    }

    I.eraseFromParent();
    return true;
  }

  // Only scalar results are handled from here on.
  if (!DstTy.isScalar())
    return false;

  if (SrcSize > 32) {
    // Read the truncated value through a sub-register of the wide source.
    int SubRegIdx = sizeToSubRegIndex(DstSize);
    if (SubRegIdx == -1)
      return false;

    // Deal with weird cases where the class only partially supports the subreg
    // index.
    const TargetRegisterClass *SrcWithSubRC
      = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
    if (!SrcWithSubRC)
      return false;

    if (SrcWithSubRC != SrcRC) {
      if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
        return false;
    }

    I.getOperand(1).setSubReg(SubRegIdx);
  }

  // The truncate itself is rewritten in place into a COPY.
  I.setDesc(TII.get(TargetOpcode::COPY));
  return true;
}
2371 
2372 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
2373 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2374   Mask = maskTrailingOnes<unsigned>(Size);
2375   int SignedMask = static_cast<int>(Mask);
2376   return SignedMask >= -16 && SignedMask <= 64;
2377 }
2378 
2379 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2380 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2381   Register Reg, const MachineRegisterInfo &MRI,
2382   const TargetRegisterInfo &TRI) const {
2383   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2384   if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2385     return RB;
2386 
2387   // Ignore the type, since we don't use vcc in artifacts.
2388   if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2389     return &RBI.getRegBankFromRegClass(*RC, LLT());
2390   return nullptr;
2391 }
2392 
2393 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2394   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2395   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2396   const DebugLoc &DL = I.getDebugLoc();
2397   MachineBasicBlock &MBB = *I.getParent();
2398   const Register DstReg = I.getOperand(0).getReg();
2399   const Register SrcReg = I.getOperand(1).getReg();
2400 
2401   const LLT DstTy = MRI->getType(DstReg);
2402   const LLT SrcTy = MRI->getType(SrcReg);
2403   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2404     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2405   const unsigned DstSize = DstTy.getSizeInBits();
2406   if (!DstTy.isScalar())
2407     return false;
2408 
2409   // Artifact casts should never use vcc.
2410   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2411 
2412   // FIXME: This should probably be illegal and split earlier.
2413   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2414     if (DstSize <= 32)
2415       return selectCOPY(I);
2416 
2417     const TargetRegisterClass *SrcRC =
2418         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2419     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2420     const TargetRegisterClass *DstRC =
2421         TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2422 
2423     Register UndefReg = MRI->createVirtualRegister(SrcRC);
2424     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2425     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2426       .addReg(SrcReg)
2427       .addImm(AMDGPU::sub0)
2428       .addReg(UndefReg)
2429       .addImm(AMDGPU::sub1);
2430     I.eraseFromParent();
2431 
2432     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2433            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2434   }
2435 
2436   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2437     // 64-bit should have been split up in RegBankSelect
2438 
2439     // Try to use an and with a mask if it will save code size.
2440     unsigned Mask;
2441     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2442       MachineInstr *ExtI =
2443       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2444         .addImm(Mask)
2445         .addReg(SrcReg);
2446       I.eraseFromParent();
2447       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2448     }
2449 
2450     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2451     MachineInstr *ExtI =
2452       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2453       .addReg(SrcReg)
2454       .addImm(0) // Offset
2455       .addImm(SrcSize); // Width
2456     I.eraseFromParent();
2457     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2458   }
2459 
2460   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2461     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2462       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2463     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2464       return false;
2465 
2466     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2467       const unsigned SextOpc = SrcSize == 8 ?
2468         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2469       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2470         .addReg(SrcReg);
2471       I.eraseFromParent();
2472       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2473     }
2474 
2475     // Using a single 32-bit SALU to calculate the high half is smaller than
2476     // S_BFE with a literal constant operand.
2477     if (DstSize > 32 && SrcSize == 32) {
2478       Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2479       unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2480       if (Signed) {
2481         BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2482           .addReg(SrcReg, 0, SubReg)
2483           .addImm(31)
2484           .setOperandDead(3); // Dead scc
2485       } else {
2486         BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2487           .addImm(0);
2488       }
2489       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2490         .addReg(SrcReg, 0, SubReg)
2491         .addImm(AMDGPU::sub0)
2492         .addReg(HiReg)
2493         .addImm(AMDGPU::sub1);
2494       I.eraseFromParent();
2495       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2496                                           *MRI);
2497     }
2498 
2499     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2500     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2501 
2502     // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2503     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2504       // We need a 64-bit register source, but the high bits don't matter.
2505       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2506       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2507       unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2508 
2509       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2510       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2511         .addReg(SrcReg, 0, SubReg)
2512         .addImm(AMDGPU::sub0)
2513         .addReg(UndefReg)
2514         .addImm(AMDGPU::sub1);
2515 
2516       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2517         .addReg(ExtReg)
2518         .addImm(SrcSize << 16);
2519 
2520       I.eraseFromParent();
2521       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2522     }
2523 
2524     unsigned Mask;
2525     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2526       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2527         .addReg(SrcReg)
2528         .addImm(Mask)
2529         .setOperandDead(3); // Dead scc
2530     } else {
2531       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2532         .addReg(SrcReg)
2533         .addImm(SrcSize << 16);
2534     }
2535 
2536     I.eraseFromParent();
2537     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2538   }
2539 
2540   return false;
2541 }
2542 
2543 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2544                            Register &Out) {
2545   Register LShlSrc;
2546   if (mi_match(In, MRI,
2547                m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2548     Out = LShlSrc;
2549     return true;
2550   }
2551   return false;
2552 }
2553 
2554 bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2555   if (!Subtarget->hasSALUFloatInsts())
2556     return false;
2557 
2558   Register Dst = I.getOperand(0).getReg();
2559   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2560   if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2561     return false;
2562 
2563   Register Src = I.getOperand(1).getReg();
2564 
2565   if (MRI->getType(Dst) == LLT::scalar(32) &&
2566       MRI->getType(Src) == LLT::scalar(16)) {
2567     if (isExtractHiElt(*MRI, Src, Src)) {
2568       MachineBasicBlock *BB = I.getParent();
2569       BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2570           .addUse(Src);
2571       I.eraseFromParent();
2572       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2573     }
2574   }
2575 
2576   return false;
2577 }
2578 
2579 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2580   MachineBasicBlock *BB = I.getParent();
2581   MachineOperand &ImmOp = I.getOperand(1);
2582   Register DstReg = I.getOperand(0).getReg();
2583   unsigned Size = MRI->getType(DstReg).getSizeInBits();
2584   bool IsFP = false;
2585 
2586   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2587   if (ImmOp.isFPImm()) {
2588     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2589     ImmOp.ChangeToImmediate(Imm.getZExtValue());
2590     IsFP = true;
2591   } else if (ImmOp.isCImm()) {
2592     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2593   } else {
2594     llvm_unreachable("Not supported by g_constants");
2595   }
2596 
2597   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2598   const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2599 
2600   unsigned Opcode;
2601   if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2602     Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2603   } else if (Size == 64 &&
2604              AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
2605     Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
2606     I.setDesc(TII.get(Opcode));
2607     I.addImplicitDefUseOperands(*MF);
2608     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2609   } else {
2610     Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2611 
2612     // We should never produce s1 values on banks other than VCC. If the user of
2613     // this already constrained the register, we may incorrectly think it's VCC
2614     // if it wasn't originally.
2615     if (Size == 1)
2616       return false;
2617   }
2618 
2619   if (Size != 64) {
2620     I.setDesc(TII.get(Opcode));
2621     I.addImplicitDefUseOperands(*MF);
2622     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2623   }
2624 
2625   const DebugLoc &DL = I.getDebugLoc();
2626 
2627   APInt Imm(Size, I.getOperand(1).getImm());
2628 
2629   MachineInstr *ResInst;
2630   if (IsSgpr && TII.isInlineConstant(Imm)) {
2631     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2632       .addImm(I.getOperand(1).getImm());
2633   } else {
2634     const TargetRegisterClass *RC = IsSgpr ?
2635       &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2636     Register LoReg = MRI->createVirtualRegister(RC);
2637     Register HiReg = MRI->createVirtualRegister(RC);
2638 
2639     BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2640       .addImm(Imm.trunc(32).getZExtValue());
2641 
2642     BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2643       .addImm(Imm.ashr(32).getZExtValue());
2644 
2645     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2646       .addReg(LoReg)
2647       .addImm(AMDGPU::sub0)
2648       .addReg(HiReg)
2649       .addImm(AMDGPU::sub1);
2650   }
2651 
2652   // We can't call constrainSelectedInstRegOperands here, because it doesn't
2653   // work for target independent opcodes
2654   I.eraseFromParent();
2655   const TargetRegisterClass *DstRC =
2656     TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2657   if (!DstRC)
2658     return true;
2659   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2660 }
2661 
2662 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2663   // Only manually handle the f64 SGPR case.
2664   //
2665   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2666   // the bit ops theoretically have a second result due to the implicit def of
2667   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2668   // that is easy by disabling the check. The result works, but uses a
2669   // nonsensical sreg32orlds_and_sreg_1 regclass.
2670   //
2671   // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2672   // the variadic REG_SEQUENCE operands.
2673 
2674   Register Dst = MI.getOperand(0).getReg();
2675   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2676   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2677       MRI->getType(Dst) != LLT::scalar(64))
2678     return false;
2679 
2680   Register Src = MI.getOperand(1).getReg();
2681   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2682   if (Fabs)
2683     Src = Fabs->getOperand(1).getReg();
2684 
2685   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2686       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2687     return false;
2688 
2689   MachineBasicBlock *BB = MI.getParent();
2690   const DebugLoc &DL = MI.getDebugLoc();
2691   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2692   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2693   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2694   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2695 
2696   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2697     .addReg(Src, 0, AMDGPU::sub0);
2698   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2699     .addReg(Src, 0, AMDGPU::sub1);
2700   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2701     .addImm(0x80000000);
2702 
2703   // Set or toggle sign bit.
2704   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2705   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2706     .addReg(HiReg)
2707     .addReg(ConstReg)
2708     .setOperandDead(3); // Dead scc
2709   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2710     .addReg(LoReg)
2711     .addImm(AMDGPU::sub0)
2712     .addReg(OpReg)
2713     .addImm(AMDGPU::sub1);
2714   MI.eraseFromParent();
2715   return true;
2716 }
2717 
2718 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2719 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2720   Register Dst = MI.getOperand(0).getReg();
2721   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2722   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2723       MRI->getType(Dst) != LLT::scalar(64))
2724     return false;
2725 
2726   Register Src = MI.getOperand(1).getReg();
2727   MachineBasicBlock *BB = MI.getParent();
2728   const DebugLoc &DL = MI.getDebugLoc();
2729   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2730   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2731   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2732   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2733 
2734   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2735       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2736     return false;
2737 
2738   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2739     .addReg(Src, 0, AMDGPU::sub0);
2740   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2741     .addReg(Src, 0, AMDGPU::sub1);
2742   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2743     .addImm(0x7fffffff);
2744 
2745   // Clear sign bit.
2746   // TODO: Should this used S_BITSET0_*?
2747   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2748     .addReg(HiReg)
2749     .addReg(ConstReg)
2750     .setOperandDead(3); // Dead scc
2751   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2752     .addReg(LoReg)
2753     .addImm(AMDGPU::sub0)
2754     .addReg(OpReg)
2755     .addImm(AMDGPU::sub1);
2756 
2757   MI.eraseFromParent();
2758   return true;
2759 }
2760 
2761 static bool isConstant(const MachineInstr &MI) {
2762   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2763 }
2764 
2765 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2766     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2767 
2768   unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2769   const MachineInstr *PtrMI =
2770       MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2771 
2772   assert(PtrMI);
2773 
2774   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2775     return;
2776 
2777   GEPInfo GEPInfo;
2778 
2779   for (unsigned i = 1; i != 3; ++i) {
2780     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2781     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2782     assert(OpDef);
2783     if (i == 2 && isConstant(*OpDef)) {
2784       // TODO: Could handle constant base + variable offset, but a combine
2785       // probably should have commuted it.
2786       assert(GEPInfo.Imm == 0);
2787       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2788       continue;
2789     }
2790     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2791     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2792       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2793     else
2794       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2795   }
2796 
2797   AddrInfo.push_back(GEPInfo);
2798   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2799 }
2800 
2801 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2802   return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2803 }
2804 
2805 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2806   if (!MI.hasOneMemOperand())
2807     return false;
2808 
2809   const MachineMemOperand *MMO = *MI.memoperands_begin();
2810   const Value *Ptr = MMO->getValue();
2811 
2812   // UndefValue means this is a load of a kernel input.  These are uniform.
2813   // Sometimes LDS instructions have constant pointers.
2814   // If Ptr is null, then that means this mem operand contains a
2815   // PseudoSourceValue like GOT.
2816   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2817       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2818     return true;
2819 
2820   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2821     return true;
2822 
2823   if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2824     return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2825            AMDGPU::SGPRRegBankID;
2826 
2827   const Instruction *I = dyn_cast<Instruction>(Ptr);
2828   return I && I->getMetadata("amdgpu.uniform");
2829 }
2830 
2831 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2832   for (const GEPInfo &GEPInfo : AddrInfo) {
2833     if (!GEPInfo.VgprParts.empty())
2834       return true;
2835   }
2836   return false;
2837 }
2838 
2839 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2840   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2841   unsigned AS = PtrTy.getAddressSpace();
2842   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2843       STI.ldsRequiresM0Init()) {
2844     MachineBasicBlock *BB = I.getParent();
2845 
2846     // If DS instructions require M0 initialization, insert it before selecting.
2847     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2848       .addImm(-1);
2849   }
2850 }
2851 
2852 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2853   MachineInstr &I) const {
2854   initM0(I);
2855   return selectImpl(I, *CoverageInfo);
2856 }
2857 
2858 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2859   if (Reg.isPhysical())
2860     return false;
2861 
2862   MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2863   const unsigned Opcode = MI.getOpcode();
2864 
2865   if (Opcode == AMDGPU::COPY)
2866     return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2867 
2868   if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2869       Opcode == AMDGPU::G_XOR)
2870     return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2871            isVCmpResult(MI.getOperand(2).getReg(), MRI);
2872 
2873   if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2874     return GI->is(Intrinsic::amdgcn_class);
2875 
2876   return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2877 }
2878 
2879 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2880   MachineBasicBlock *BB = I.getParent();
2881   MachineOperand &CondOp = I.getOperand(0);
2882   Register CondReg = CondOp.getReg();
2883   const DebugLoc &DL = I.getDebugLoc();
2884 
2885   unsigned BrOpcode;
2886   Register CondPhysReg;
2887   const TargetRegisterClass *ConstrainRC;
2888 
2889   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2890   // whether the branch is uniform when selecting the instruction. In
2891   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2892   // RegBankSelect knows what it's doing if the branch condition is scc, even
2893   // though it currently does not.
2894   if (!isVCC(CondReg, *MRI)) {
2895     if (MRI->getType(CondReg) != LLT::scalar(32))
2896       return false;
2897 
2898     CondPhysReg = AMDGPU::SCC;
2899     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2900     ConstrainRC = &AMDGPU::SReg_32RegClass;
2901   } else {
2902     // FIXME: Should scc->vcc copies and with exec?
2903 
2904     // Unless the value of CondReg is a result of a V_CMP* instruction then we
2905     // need to insert an and with exec.
2906     if (!isVCmpResult(CondReg, *MRI)) {
2907       const bool Is64 = STI.isWave64();
2908       const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2909       const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2910 
2911       Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2912       BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2913           .addReg(CondReg)
2914           .addReg(Exec)
2915           .setOperandDead(3); // Dead scc
2916       CondReg = TmpReg;
2917     }
2918 
2919     CondPhysReg = TRI.getVCC();
2920     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2921     ConstrainRC = TRI.getBoolRC();
2922   }
2923 
2924   if (!MRI->getRegClassOrNull(CondReg))
2925     MRI->setRegClass(CondReg, ConstrainRC);
2926 
2927   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2928     .addReg(CondReg);
2929   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2930     .addMBB(I.getOperand(1).getMBB());
2931 
2932   I.eraseFromParent();
2933   return true;
2934 }
2935 
2936 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2937   MachineInstr &I) const {
2938   Register DstReg = I.getOperand(0).getReg();
2939   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2940   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2941   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2942   if (IsVGPR)
2943     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2944 
2945   return RBI.constrainGenericRegister(
2946     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2947 }
2948 
2949 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2950   Register DstReg = I.getOperand(0).getReg();
2951   Register SrcReg = I.getOperand(1).getReg();
2952   Register MaskReg = I.getOperand(2).getReg();
2953   LLT Ty = MRI->getType(DstReg);
2954   LLT MaskTy = MRI->getType(MaskReg);
2955   MachineBasicBlock *BB = I.getParent();
2956   const DebugLoc &DL = I.getDebugLoc();
2957 
2958   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2959   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2960   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2961   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2962   if (DstRB != SrcRB) // Should only happen for hand written MIR.
2963     return false;
2964 
2965   // Try to avoid emitting a bit operation when we only need to touch half of
2966   // the 64-bit pointer.
2967   APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2968   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2969   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2970 
2971   const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2972   const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
2973 
2974   if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2975       !CanCopyLow32 && !CanCopyHi32) {
2976     auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2977       .addReg(SrcReg)
2978       .addReg(MaskReg)
2979       .setOperandDead(3); // Dead scc
2980     I.eraseFromParent();
2981     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2982   }
2983 
2984   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2985   const TargetRegisterClass &RegRC
2986     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2987 
2988   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2989   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2990   const TargetRegisterClass *MaskRC =
2991       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2992 
2993   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2994       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2995       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2996     return false;
2997 
2998   if (Ty.getSizeInBits() == 32) {
2999     assert(MaskTy.getSizeInBits() == 32 &&
3000            "ptrmask should have been narrowed during legalize");
3001 
3002     auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3003       .addReg(SrcReg)
3004       .addReg(MaskReg);
3005 
3006     if (!IsVGPR)
3007       NewOp.setOperandDead(3); // Dead scc
3008     I.eraseFromParent();
3009     return true;
3010   }
3011 
3012   Register HiReg = MRI->createVirtualRegister(&RegRC);
3013   Register LoReg = MRI->createVirtualRegister(&RegRC);
3014 
3015   // Extract the subregisters from the source pointer.
3016   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3017     .addReg(SrcReg, 0, AMDGPU::sub0);
3018   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3019     .addReg(SrcReg, 0, AMDGPU::sub1);
3020 
3021   Register MaskedLo, MaskedHi;
3022 
3023   if (CanCopyLow32) {
3024     // If all the bits in the low half are 1, we only need a copy for it.
3025     MaskedLo = LoReg;
3026   } else {
3027     // Extract the mask subregister and apply the and.
3028     Register MaskLo = MRI->createVirtualRegister(&RegRC);
3029     MaskedLo = MRI->createVirtualRegister(&RegRC);
3030 
3031     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3032       .addReg(MaskReg, 0, AMDGPU::sub0);
3033     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3034       .addReg(LoReg)
3035       .addReg(MaskLo);
3036   }
3037 
3038   if (CanCopyHi32) {
3039     // If all the bits in the high half are 1, we only need a copy for it.
3040     MaskedHi = HiReg;
3041   } else {
3042     Register MaskHi = MRI->createVirtualRegister(&RegRC);
3043     MaskedHi = MRI->createVirtualRegister(&RegRC);
3044 
3045     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3046       .addReg(MaskReg, 0, AMDGPU::sub1);
3047     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3048       .addReg(HiReg)
3049       .addReg(MaskHi);
3050   }
3051 
3052   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3053     .addReg(MaskedLo)
3054     .addImm(AMDGPU::sub0)
3055     .addReg(MaskedHi)
3056     .addImm(AMDGPU::sub1);
3057   I.eraseFromParent();
3058   return true;
3059 }
3060 
3061 /// Return the register to use for the index value, and the subregister to use
3062 /// for the indirectly accessed register.
3063 static std::pair<Register, unsigned>
3064 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3065                         const TargetRegisterClass *SuperRC, Register IdxReg,
3066                         unsigned EltSize, GISelKnownBits &KnownBits) {
3067   Register IdxBaseReg;
3068   int Offset;
3069 
3070   std::tie(IdxBaseReg, Offset) =
3071       AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3072   if (IdxBaseReg == AMDGPU::NoRegister) {
3073     // This will happen if the index is a known constant. This should ordinarily
3074     // be legalized out, but handle it as a register just in case.
3075     assert(Offset == 0);
3076     IdxBaseReg = IdxReg;
3077   }
3078 
3079   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3080 
3081   // Skip out of bounds offsets, or else we would end up using an undefined
3082   // register.
3083   if (static_cast<unsigned>(Offset) >= SubRegs.size())
3084     return std::pair(IdxReg, SubRegs[0]);
3085   return std::pair(IdxBaseReg, SubRegs[Offset]);
3086 }
3087 
3088 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3089   MachineInstr &MI) const {
3090   Register DstReg = MI.getOperand(0).getReg();
3091   Register SrcReg = MI.getOperand(1).getReg();
3092   Register IdxReg = MI.getOperand(2).getReg();
3093 
3094   LLT DstTy = MRI->getType(DstReg);
3095   LLT SrcTy = MRI->getType(SrcReg);
3096 
3097   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3098   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3099   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3100 
3101   // The index must be scalar. If it wasn't RegBankSelect should have moved this
3102   // into a waterfall loop.
3103   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3104     return false;
3105 
3106   const TargetRegisterClass *SrcRC =
3107       TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3108   const TargetRegisterClass *DstRC =
3109       TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3110   if (!SrcRC || !DstRC)
3111     return false;
3112   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3113       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3114       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3115     return false;
3116 
3117   MachineBasicBlock *BB = MI.getParent();
3118   const DebugLoc &DL = MI.getDebugLoc();
3119   const bool Is64 = DstTy.getSizeInBits() == 64;
3120 
3121   unsigned SubReg;
3122   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3123       *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3124 
3125   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3126     if (DstTy.getSizeInBits() != 32 && !Is64)
3127       return false;
3128 
3129     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3130       .addReg(IdxReg);
3131 
3132     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3133     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3134       .addReg(SrcReg, 0, SubReg)
3135       .addReg(SrcReg, RegState::Implicit);
3136     MI.eraseFromParent();
3137     return true;
3138   }
3139 
3140   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3141     return false;
3142 
3143   if (!STI.useVGPRIndexMode()) {
3144     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3145       .addReg(IdxReg);
3146     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3147       .addReg(SrcReg, 0, SubReg)
3148       .addReg(SrcReg, RegState::Implicit);
3149     MI.eraseFromParent();
3150     return true;
3151   }
3152 
3153   const MCInstrDesc &GPRIDXDesc =
3154       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3155   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3156       .addReg(SrcReg)
3157       .addReg(IdxReg)
3158       .addImm(SubReg);
3159 
3160   MI.eraseFromParent();
3161   return true;
3162 }
3163 
3164 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3165 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3166   MachineInstr &MI) const {
3167   Register DstReg = MI.getOperand(0).getReg();
3168   Register VecReg = MI.getOperand(1).getReg();
3169   Register ValReg = MI.getOperand(2).getReg();
3170   Register IdxReg = MI.getOperand(3).getReg();
3171 
3172   LLT VecTy = MRI->getType(DstReg);
3173   LLT ValTy = MRI->getType(ValReg);
3174   unsigned VecSize = VecTy.getSizeInBits();
3175   unsigned ValSize = ValTy.getSizeInBits();
3176 
3177   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3178   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3179   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3180 
3181   assert(VecTy.getElementType() == ValTy);
3182 
3183   // The index must be scalar. If it wasn't RegBankSelect should have moved this
3184   // into a waterfall loop.
3185   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3186     return false;
3187 
3188   const TargetRegisterClass *VecRC =
3189       TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3190   const TargetRegisterClass *ValRC =
3191       TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3192 
3193   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3194       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3195       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3196       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3197     return false;
3198 
3199   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3200     return false;
3201 
3202   unsigned SubReg;
3203   std::tie(IdxReg, SubReg) =
3204       computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3205 
3206   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3207                          STI.useVGPRIndexMode();
3208 
3209   MachineBasicBlock *BB = MI.getParent();
3210   const DebugLoc &DL = MI.getDebugLoc();
3211 
3212   if (!IndexMode) {
3213     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3214       .addReg(IdxReg);
3215 
3216     const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3217         VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3218     BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3219         .addReg(VecReg)
3220         .addReg(ValReg)
3221         .addImm(SubReg);
3222     MI.eraseFromParent();
3223     return true;
3224   }
3225 
3226   const MCInstrDesc &GPRIDXDesc =
3227       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3228   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3229       .addReg(VecReg)
3230       .addReg(ValReg)
3231       .addReg(IdxReg)
3232       .addImm(SubReg);
3233 
3234   MI.eraseFromParent();
3235   return true;
3236 }
3237 
3238 bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3239   assert(!AMDGPU::isGFX12Plus(STI));
3240   unsigned Opc;
3241   unsigned Size = MI.getOperand(3).getImm();
3242 
3243   // The struct intrinsic variants add one additional operand over raw.
3244   const bool HasVIndex = MI.getNumOperands() == 9;
3245   Register VIndex;
3246   int OpOffset = 0;
3247   if (HasVIndex) {
3248     VIndex = MI.getOperand(4).getReg();
3249     OpOffset = 1;
3250   }
3251 
3252   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3253   std::optional<ValueAndVReg> MaybeVOffset =
3254       getIConstantVRegValWithLookThrough(VOffset, *MRI);
3255   const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3256 
3257   switch (Size) {
3258   default:
3259     return false;
3260   case 1:
3261     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3262                                  : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3263                     : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3264                                  : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3265     break;
3266   case 2:
3267     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3268                                  : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3269                     : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3270                                  : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3271     break;
3272   case 4:
3273     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3274                                  : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3275                     : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3276                                  : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3277     break;
3278   }
3279 
3280   MachineBasicBlock *MBB = MI.getParent();
3281   const DebugLoc &DL = MI.getDebugLoc();
3282   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3283     .add(MI.getOperand(2));
3284 
3285   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3286 
3287   if (HasVIndex && HasVOffset) {
3288     Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3289     BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3290       .addReg(VIndex)
3291       .addImm(AMDGPU::sub0)
3292       .addReg(VOffset)
3293       .addImm(AMDGPU::sub1);
3294 
3295     MIB.addReg(IdxReg);
3296   } else if (HasVIndex) {
3297     MIB.addReg(VIndex);
3298   } else if (HasVOffset) {
3299     MIB.addReg(VOffset);
3300   }
3301 
3302   MIB.add(MI.getOperand(1));            // rsrc
3303   MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3304   MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3305   unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3306   MIB.addImm(Aux & AMDGPU::CPol::ALL);                  // cpol
3307   MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3308 
3309   MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3310   MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3311   LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3312   MachinePointerInfo StorePtrI = LoadPtrI;
3313   StorePtrI.V = nullptr;
3314   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3315 
3316   auto F = LoadMMO->getFlags() &
3317            ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3318   LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3319                                      Size, LoadMMO->getBaseAlign());
3320 
3321   MachineMemOperand *StoreMMO =
3322       MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3323                                sizeof(int32_t), LoadMMO->getBaseAlign());
3324 
3325   MIB.setMemRefs({LoadMMO, StoreMMO});
3326 
3327   MI.eraseFromParent();
3328   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3329 }
3330 
3331 /// Match a zero extend from a 32-bit value to 64-bits.
3332 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3333   Register ZExtSrc;
3334   if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3335     return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3336 
3337   // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3338   const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3339   if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3340     return Register();
3341 
3342   assert(Def->getNumOperands() == 3 &&
3343          MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3344   if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3345     return Def->getOperand(1).getReg();
3346   }
3347 
3348   return Register();
3349 }
3350 
3351 bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3352   unsigned Opc;
3353   unsigned Size = MI.getOperand(3).getImm();
3354 
3355   switch (Size) {
3356   default:
3357     return false;
3358   case 1:
3359     Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3360     break;
3361   case 2:
3362     Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3363     break;
3364   case 4:
3365     Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3366     break;
3367   }
3368 
3369   MachineBasicBlock *MBB = MI.getParent();
3370   const DebugLoc &DL = MI.getDebugLoc();
3371   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3372     .add(MI.getOperand(2));
3373 
3374   Register Addr = MI.getOperand(1).getReg();
3375   Register VOffset;
3376   // Try to split SAddr and VOffset. Global and LDS pointers share the same
3377   // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3378   if (!isSGPR(Addr)) {
3379     auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3380     if (isSGPR(AddrDef->Reg)) {
3381       Addr = AddrDef->Reg;
3382     } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3383       Register SAddr =
3384           getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3385       if (isSGPR(SAddr)) {
3386         Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3387         if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3388           Addr = SAddr;
3389           VOffset = Off;
3390         }
3391       }
3392     }
3393   }
3394 
3395   if (isSGPR(Addr)) {
3396     Opc = AMDGPU::getGlobalSaddrOp(Opc);
3397     if (!VOffset) {
3398       VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3399       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3400         .addImm(0);
3401     }
3402   }
3403 
3404   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3405     .addReg(Addr);
3406 
3407   if (isSGPR(Addr))
3408     MIB.addReg(VOffset);
3409 
3410   MIB.add(MI.getOperand(4))  // offset
3411      .add(MI.getOperand(5)); // cpol
3412 
3413   MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3414   MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3415   LoadPtrI.Offset = MI.getOperand(4).getImm();
3416   MachinePointerInfo StorePtrI = LoadPtrI;
3417   LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3418   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3419   auto F = LoadMMO->getFlags() &
3420            ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3421   LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3422                                      Size, LoadMMO->getBaseAlign());
3423   MachineMemOperand *StoreMMO =
3424       MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3425                                sizeof(int32_t), Align(4));
3426 
3427   MIB.setMemRefs({LoadMMO, StoreMMO});
3428 
3429   MI.eraseFromParent();
3430   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3431 }
3432 
3433 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
3434   MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3435   MI.removeOperand(1);
3436   MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3437   return true;
3438 }
3439 
3440 bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3441   unsigned Opc;
3442   switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3443   case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3444     Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3445     break;
3446   case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3447     Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3448     break;
3449   case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3450     Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3451     break;
3452   case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3453     Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3454     break;
3455   case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3456     Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3457     break;
3458   case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3459     Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3460     break;
3461   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3462     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3463     break;
3464   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3465     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3466     break;
3467   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3468     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3469     break;
3470   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3471     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3472     break;
3473   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3474     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3475     break;
3476   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3477     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3478     break;
3479   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3480     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3481     break;
3482   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3483     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3484     break;
3485   default:
3486     llvm_unreachable("unhandled smfmac intrinsic");
3487   }
3488 
3489   auto VDst_In = MI.getOperand(4);
3490 
3491   MI.setDesc(TII.get(Opc));
3492   MI.removeOperand(4); // VDst_In
3493   MI.removeOperand(1); // Intrinsic ID
3494   MI.addOperand(VDst_In); // Readd VDst_In to the end
3495   MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3496   return true;
3497 }
3498 
3499 bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3500   Register DstReg = MI.getOperand(0).getReg();
3501   Register SrcReg = MI.getOperand(1).getReg();
3502   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3503   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3504   MachineBasicBlock *MBB = MI.getParent();
3505   const DebugLoc &DL = MI.getDebugLoc();
3506 
3507   if (IsVALU) {
3508     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3509       .addImm(Subtarget->getWavefrontSizeLog2())
3510       .addReg(SrcReg);
3511   } else {
3512     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3513       .addReg(SrcReg)
3514       .addImm(Subtarget->getWavefrontSizeLog2())
3515       .setOperandDead(3); // Dead scc
3516   }
3517 
3518   const TargetRegisterClass &RC =
3519       IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3520   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3521     return false;
3522 
3523   MI.eraseFromParent();
3524   return true;
3525 }
3526 
3527 bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3528   Register SrcReg = MI.getOperand(0).getReg();
3529   if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3530     return false;
3531 
3532   MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3533   Register SP =
3534       Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3535   Register WaveAddr = getWaveAddress(DefMI);
3536   MachineBasicBlock *MBB = MI.getParent();
3537   const DebugLoc &DL = MI.getDebugLoc();
3538 
3539   if (!WaveAddr) {
3540     WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3541     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3542       .addReg(SrcReg)
3543       .addImm(Subtarget->getWavefrontSizeLog2())
3544       .setOperandDead(3); // Dead scc
3545   }
3546 
3547   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3548     .addReg(WaveAddr);
3549 
3550   MI.eraseFromParent();
3551   return true;
3552 }
3553 
3554 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3555   if (I.isPHI())
3556     return selectPHI(I);
3557 
3558   if (!I.isPreISelOpcode()) {
3559     if (I.isCopy())
3560       return selectCOPY(I);
3561     return true;
3562   }
3563 
3564   switch (I.getOpcode()) {
3565   case TargetOpcode::G_AND:
3566   case TargetOpcode::G_OR:
3567   case TargetOpcode::G_XOR:
3568     if (selectImpl(I, *CoverageInfo))
3569       return true;
3570     return selectG_AND_OR_XOR(I);
3571   case TargetOpcode::G_ADD:
3572   case TargetOpcode::G_SUB:
3573     if (selectImpl(I, *CoverageInfo))
3574       return true;
3575     return selectG_ADD_SUB(I);
3576   case TargetOpcode::G_UADDO:
3577   case TargetOpcode::G_USUBO:
3578   case TargetOpcode::G_UADDE:
3579   case TargetOpcode::G_USUBE:
3580     return selectG_UADDO_USUBO_UADDE_USUBE(I);
3581   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3582   case AMDGPU::G_AMDGPU_MAD_I64_I32:
3583     return selectG_AMDGPU_MAD_64_32(I);
3584   case TargetOpcode::G_INTTOPTR:
3585   case TargetOpcode::G_BITCAST:
3586   case TargetOpcode::G_PTRTOINT:
3587     return selectCOPY(I);
3588   case TargetOpcode::G_CONSTANT:
3589   case TargetOpcode::G_FCONSTANT:
3590     return selectG_CONSTANT(I);
3591   case TargetOpcode::G_FNEG:
3592     if (selectImpl(I, *CoverageInfo))
3593       return true;
3594     return selectG_FNEG(I);
3595   case TargetOpcode::G_FABS:
3596     if (selectImpl(I, *CoverageInfo))
3597       return true;
3598     return selectG_FABS(I);
3599   case TargetOpcode::G_EXTRACT:
3600     return selectG_EXTRACT(I);
3601   case TargetOpcode::G_MERGE_VALUES:
3602   case TargetOpcode::G_CONCAT_VECTORS:
3603     return selectG_MERGE_VALUES(I);
3604   case TargetOpcode::G_UNMERGE_VALUES:
3605     return selectG_UNMERGE_VALUES(I);
3606   case TargetOpcode::G_BUILD_VECTOR:
3607   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3608     return selectG_BUILD_VECTOR(I);
3609   case TargetOpcode::G_PTR_ADD:
3610     if (selectImpl(I, *CoverageInfo))
3611       return true;
3612     return selectG_PTR_ADD(I);
3613   case TargetOpcode::G_IMPLICIT_DEF:
3614     return selectG_IMPLICIT_DEF(I);
3615   case TargetOpcode::G_FREEZE:
3616     return selectCOPY(I);
3617   case TargetOpcode::G_INSERT:
3618     return selectG_INSERT(I);
3619   case TargetOpcode::G_INTRINSIC:
3620   case TargetOpcode::G_INTRINSIC_CONVERGENT:
3621     return selectG_INTRINSIC(I);
3622   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3623   case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3624     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3625   case TargetOpcode::G_ICMP:
3626   case TargetOpcode::G_FCMP:
3627     if (selectG_ICMP_or_FCMP(I))
3628       return true;
3629     return selectImpl(I, *CoverageInfo);
3630   case TargetOpcode::G_LOAD:
3631   case TargetOpcode::G_STORE:
3632   case TargetOpcode::G_ATOMIC_CMPXCHG:
3633   case TargetOpcode::G_ATOMICRMW_XCHG:
3634   case TargetOpcode::G_ATOMICRMW_ADD:
3635   case TargetOpcode::G_ATOMICRMW_SUB:
3636   case TargetOpcode::G_ATOMICRMW_AND:
3637   case TargetOpcode::G_ATOMICRMW_OR:
3638   case TargetOpcode::G_ATOMICRMW_XOR:
3639   case TargetOpcode::G_ATOMICRMW_MIN:
3640   case TargetOpcode::G_ATOMICRMW_MAX:
3641   case TargetOpcode::G_ATOMICRMW_UMIN:
3642   case TargetOpcode::G_ATOMICRMW_UMAX:
3643   case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3644   case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3645   case TargetOpcode::G_ATOMICRMW_FADD:
3646   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3647   case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3648     return selectG_LOAD_STORE_ATOMICRMW(I);
3649   case TargetOpcode::G_SELECT:
3650     return selectG_SELECT(I);
3651   case TargetOpcode::G_TRUNC:
3652     return selectG_TRUNC(I);
3653   case TargetOpcode::G_SEXT:
3654   case TargetOpcode::G_ZEXT:
3655   case TargetOpcode::G_ANYEXT:
3656   case TargetOpcode::G_SEXT_INREG:
3657     // This is a workaround. For extension from type i1, `selectImpl()` uses
3658     // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
3659     // i1 can only be hold in a SGPR class.
3660     if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3661         selectImpl(I, *CoverageInfo))
3662       return true;
3663     return selectG_SZA_EXT(I);
3664   case TargetOpcode::G_FPEXT:
3665     if (selectG_FPEXT(I))
3666       return true;
3667     return selectImpl(I, *CoverageInfo);
3668   case TargetOpcode::G_BRCOND:
3669     return selectG_BRCOND(I);
3670   case TargetOpcode::G_GLOBAL_VALUE:
3671     return selectG_GLOBAL_VALUE(I);
3672   case TargetOpcode::G_PTRMASK:
3673     return selectG_PTRMASK(I);
3674   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3675     return selectG_EXTRACT_VECTOR_ELT(I);
3676   case TargetOpcode::G_INSERT_VECTOR_ELT:
3677     return selectG_INSERT_VECTOR_ELT(I);
3678   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3679   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3680   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3681   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3682     const AMDGPU::ImageDimIntrinsicInfo *Intr =
3683         AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
3684     assert(Intr && "not an image intrinsic with image pseudo");
3685     return selectImageIntrinsic(I, Intr);
3686   }
3687   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3688     return selectBVHIntrinsic(I);
3689   case AMDGPU::G_SBFX:
3690   case AMDGPU::G_UBFX:
3691     return selectG_SBFX_UBFX(I);
3692   case AMDGPU::G_SI_CALL:
3693     I.setDesc(TII.get(AMDGPU::SI_CALL));
3694     return true;
3695   case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3696     return selectWaveAddress(I);
3697   case AMDGPU::G_STACKRESTORE:
3698     return selectStackRestore(I);
3699   default:
3700     return selectImpl(I, *CoverageInfo);
3701   }
3702   return false;
3703 }
3704 
3705 InstructionSelector::ComplexRendererFns
3706 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3707   return {{
3708       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3709   }};
3710 
3711 }
3712 
3713 std::pair<Register, unsigned>
3714 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3715                                               bool IsCanonicalizing,
3716                                               bool AllowAbs, bool OpSel) const {
3717   Register Src = Root.getReg();
3718   unsigned Mods = 0;
3719   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3720 
3721   if (MI->getOpcode() == AMDGPU::G_FNEG) {
3722     Src = MI->getOperand(1).getReg();
3723     Mods |= SISrcMods::NEG;
3724     MI = getDefIgnoringCopies(Src, *MRI);
3725   } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3726     // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3727     // denormal mode, but we're implicitly canonicalizing in a source operand.
3728     const ConstantFP *LHS =
3729         getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3730     if (LHS && LHS->isZero()) {
3731       Mods |= SISrcMods::NEG;
3732       Src = MI->getOperand(2).getReg();
3733     }
3734   }
3735 
3736   if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3737     Src = MI->getOperand(1).getReg();
3738     Mods |= SISrcMods::ABS;
3739   }
3740 
3741   if (OpSel)
3742     Mods |= SISrcMods::OP_SEL_0;
3743 
3744   return std::pair(Src, Mods);
3745 }
3746 
3747 Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3748     Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3749     bool ForceVGPR) const {
3750   if ((Mods != 0 || ForceVGPR) &&
3751       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3752 
3753     // If we looked through copies to find source modifiers on an SGPR operand,
3754     // we now have an SGPR register source. To avoid potentially violating the
3755     // constant bus restriction, we need to insert a copy to a VGPR.
3756     Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3757     BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3758             TII.get(AMDGPU::COPY), VGPRSrc)
3759         .addReg(Src);
3760     Src = VGPRSrc;
3761   }
3762 
3763   return Src;
3764 }
3765 
3766 ///
3767 /// This will select either an SGPR or VGPR operand and will save us from
3768 /// having to write an extra tablegen pattern.
3769 InstructionSelector::ComplexRendererFns
3770 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3771   return {{
3772       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3773   }};
3774 }
3775 
3776 InstructionSelector::ComplexRendererFns
3777 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3778   Register Src;
3779   unsigned Mods;
3780   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3781 
3782   return {{
3783       [=](MachineInstrBuilder &MIB) {
3784         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3785       },
3786       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3787       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3788       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3789   }};
3790 }
3791 
3792 InstructionSelector::ComplexRendererFns
3793 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3794   Register Src;
3795   unsigned Mods;
3796   std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3797                                            /*IsCanonicalizing=*/true,
3798                                            /*AllowAbs=*/false);
3799 
3800   return {{
3801       [=](MachineInstrBuilder &MIB) {
3802         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3803       },
3804       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3805       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3806       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3807   }};
3808 }
3809 
3810 InstructionSelector::ComplexRendererFns
3811 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3812   return {{
3813       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3814       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3815       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
3816   }};
3817 }
3818 
3819 InstructionSelector::ComplexRendererFns
3820 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3821   Register Src;
3822   unsigned Mods;
3823   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3824 
3825   return {{
3826       [=](MachineInstrBuilder &MIB) {
3827         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3828       },
3829       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3830   }};
3831 }
3832 
3833 InstructionSelector::ComplexRendererFns
3834 AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3835     MachineOperand &Root) const {
3836   Register Src;
3837   unsigned Mods;
3838   std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3839 
3840   return {{
3841       [=](MachineInstrBuilder &MIB) {
3842         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3843       },
3844       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3845   }};
3846 }
3847 
3848 InstructionSelector::ComplexRendererFns
3849 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3850   Register Src;
3851   unsigned Mods;
3852   std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3853                                            /*AllowAbs=*/false);
3854 
3855   return {{
3856       [=](MachineInstrBuilder &MIB) {
3857         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3858       },
3859       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3860   }};
3861 }
3862 
3863 InstructionSelector::ComplexRendererFns
3864 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3865   Register Reg = Root.getReg();
3866   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3867   if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3868     return {};
3869   return {{
3870       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3871   }};
3872 }
3873 
3874 std::pair<Register, unsigned>
3875 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3876   Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3877   unsigned Mods = 0;
3878   MachineInstr *MI = MRI.getVRegDef(Src);
3879 
3880   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3881       // It's possible to see an f32 fneg here, but unlikely.
3882       // TODO: Treat f32 fneg as only high bit.
3883       MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3884     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3885     Src = MI->getOperand(1).getReg();
3886     MI = MRI.getVRegDef(Src);
3887   }
3888 
3889   // TODO: Handle G_FSUB 0 as fneg
3890 
3891   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3892   (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3893 
3894   // Packed instructions do not have abs modifiers.
3895   Mods |= SISrcMods::OP_SEL_1;
3896 
3897   return std::pair(Src, Mods);
3898 }
3899 
3900 InstructionSelector::ComplexRendererFns
3901 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3902   MachineRegisterInfo &MRI
3903     = Root.getParent()->getParent()->getParent()->getRegInfo();
3904 
3905   Register Src;
3906   unsigned Mods;
3907   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3908 
3909   return {{
3910       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3911       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3912   }};
3913 }
3914 
3915 InstructionSelector::ComplexRendererFns
3916 AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3917   MachineRegisterInfo &MRI
3918     = Root.getParent()->getParent()->getParent()->getRegInfo();
3919 
3920   Register Src;
3921   unsigned Mods;
3922   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3923 
3924   return {{
3925       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3926       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3927   }};
3928 }
3929 
3930 InstructionSelector::ComplexRendererFns
3931 AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
3932   // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3933   // Value is in Imm operand as i1 sign extended to int64_t.
3934   // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
3935   assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3936          "expected i1 value");
3937   unsigned Mods = SISrcMods::OP_SEL_1;
3938   if (Root.getImm() == -1)
3939     Mods ^= SISrcMods::NEG;
3940   return {{
3941       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3942   }};
3943 }
3944 
3945 InstructionSelector::ComplexRendererFns
3946 AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3947     MachineOperand &Root) const {
3948   assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3949          "expected i1 value");
3950   unsigned Mods = SISrcMods::OP_SEL_1;
3951   if (Root.getImm() != 0)
3952     Mods |= SISrcMods::OP_SEL_0;
3953 
3954   return {{
3955       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3956   }};
3957 }
3958 
3959 static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
3960                                  MachineInstr *InsertPt,
3961                                  MachineRegisterInfo &MRI) {
3962   const TargetRegisterClass *DstRegClass;
3963   switch (Elts.size()) {
3964   case 8:
3965     DstRegClass = &AMDGPU::VReg_256RegClass;
3966     break;
3967   case 4:
3968     DstRegClass = &AMDGPU::VReg_128RegClass;
3969     break;
3970   case 2:
3971     DstRegClass = &AMDGPU::VReg_64RegClass;
3972     break;
3973   default:
3974     llvm_unreachable("unhandled Reg sequence size");
3975   }
3976 
3977   MachineIRBuilder B(*InsertPt);
3978   auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
3979                  .addDef(MRI.createVirtualRegister(DstRegClass));
3980   for (unsigned i = 0; i < Elts.size(); ++i) {
3981     MIB.addReg(Elts[i]);
3982     MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
3983   }
3984   return MIB->getOperand(0).getReg();
3985 }
3986 
3987 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3988                                  SmallVectorImpl<Register> &Elts, Register &Src,
3989                                  MachineInstr *InsertPt,
3990                                  MachineRegisterInfo &MRI) {
3991   if (ModOpcode == TargetOpcode::G_FNEG) {
3992     Mods |= SISrcMods::NEG;
3993     // Check if all elements also have abs modifier
3994     SmallVector<Register, 8> NegAbsElts;
3995     for (auto El : Elts) {
3996       Register FabsSrc;
3997       if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
3998         break;
3999       NegAbsElts.push_back(FabsSrc);
4000     }
4001     if (Elts.size() != NegAbsElts.size()) {
4002       // Neg
4003       Src = buildRegSequence(Elts, InsertPt, MRI);
4004     } else {
4005       // Neg and Abs
4006       Mods |= SISrcMods::NEG_HI;
4007       Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
4008     }
4009   } else {
4010     assert(ModOpcode == TargetOpcode::G_FABS);
4011     // Abs
4012     Mods |= SISrcMods::NEG_HI;
4013     Src = buildRegSequence(Elts, InsertPt, MRI);
4014   }
4015 }
4016 
4017 InstructionSelector::ComplexRendererFns
4018 AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
4019   Register Src = Root.getReg();
4020   unsigned Mods = SISrcMods::OP_SEL_1;
4021   unsigned ModOpcode;
4022   SmallVector<Register, 8> EltsF32;
4023 
4024   if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
4025     for (unsigned i = 0; i < BV->getNumSources(); ++i) {
4026       MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
4027       // Based on first element decide which mod we match, neg or abs
4028       if (EltsF32.empty())
4029         ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG
4030                                                            : AMDGPU::G_FABS;
4031       if (ElF32->getOpcode() != ModOpcode)
4032         break;
4033       EltsF32.push_back(ElF32->getOperand(1).getReg());
4034     }
4035 
4036     // All elements had ModOpcode modifier
4037     if (BV->getNumSources() == EltsF32.size()) {
4038       selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
4039                            *MRI);
4040     }
4041   }
4042 
4043   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4044            [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4045 }
4046 
4047 InstructionSelector::ComplexRendererFns
4048 AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
4049   Register Src = Root.getReg();
4050   unsigned Mods = SISrcMods::OP_SEL_1;
4051   SmallVector<Register, 8> EltsV2F16;
4052 
4053   if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4054     for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4055       Register FNegSrc;
4056       if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4057         break;
4058       EltsV2F16.push_back(FNegSrc);
4059     }
4060 
4061     // All elements had ModOpcode modifier
4062     if (CV->getNumSources() == EltsV2F16.size()) {
4063       Mods |= SISrcMods::NEG;
4064       Mods |= SISrcMods::NEG_HI;
4065       Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4066     }
4067   }
4068 
4069   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4070            [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4071 }
4072 
4073 InstructionSelector::ComplexRendererFns
4074 AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4075   Register Src = Root.getReg();
4076   unsigned Mods = SISrcMods::OP_SEL_1;
4077   unsigned ModOpcode;
4078   SmallVector<Register, 8> EltsV2F16;
4079 
4080   if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4081     for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4082       MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4083       // Based on first element decide which mod we match, neg or abs
4084       if (EltsV2F16.empty())
4085         ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG
4086                                                              : AMDGPU::G_FABS;
4087       if (ElV2F16->getOpcode() != ModOpcode)
4088         break;
4089       EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4090     }
4091 
4092     // All elements had ModOpcode modifier
4093     if (CV->getNumSources() == EltsV2F16.size()) {
4094       MachineIRBuilder B(*Root.getParent());
4095       selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4096                            *MRI);
4097     }
4098   }
4099 
4100   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4101            [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4102 }
4103 
4104 InstructionSelector::ComplexRendererFns
4105 AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4106   std::optional<FPValueAndVReg> FPValReg;
4107   if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4108     if (TII.isInlineConstant(FPValReg->Value.bitcastToAPInt())) {
4109       return {{[=](MachineInstrBuilder &MIB) {
4110         MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4111       }}};
4112     }
4113     // Non-inlineable splat floats should not fall-through for integer immediate
4114     // checks.
4115     return {};
4116   }
4117 
4118   APInt ICst;
4119   if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4120     if (TII.isInlineConstant(ICst)) {
4121       return {
4122           {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4123     }
4124   }
4125 
4126   return {};
4127 }
4128 
4129 InstructionSelector::ComplexRendererFns
4130 AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4131   Register Src =
4132       getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4133   unsigned Key = 0;
4134 
4135   Register ShiftSrc;
4136   std::optional<ValueAndVReg> ShiftAmt;
4137   if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4138       MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4139       ShiftAmt->Value.getZExtValue() % 8 == 0) {
4140     Key = ShiftAmt->Value.getZExtValue() / 8;
4141     Src = ShiftSrc;
4142   }
4143 
4144   return {{
4145       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4146       [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4147   }};
4148 }
4149 
4150 InstructionSelector::ComplexRendererFns
4151 AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4152 
4153   Register Src =
4154       getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4155   unsigned Key = 0;
4156 
4157   Register ShiftSrc;
4158   std::optional<ValueAndVReg> ShiftAmt;
4159   if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4160       MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4161       ShiftAmt->Value.getZExtValue() == 16) {
4162     Src = ShiftSrc;
4163     Key = 1;
4164   }
4165 
4166   return {{
4167       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4168       [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4169   }};
4170 }
4171 
4172 InstructionSelector::ComplexRendererFns
4173 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4174   Register Src;
4175   unsigned Mods;
4176   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4177 
4178   // FIXME: Handle op_sel
4179   return {{
4180       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4181       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4182   }};
4183 }
4184 
4185 InstructionSelector::ComplexRendererFns
4186 AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4187   Register Src;
4188   unsigned Mods;
4189   std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4190                                            /*IsCanonicalizing=*/true,
4191                                            /*AllowAbs=*/false,
4192                                            /*OpSel=*/false);
4193 
4194   return {{
4195       [=](MachineInstrBuilder &MIB) {
4196         MIB.addReg(
4197             copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4198       },
4199       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4200   }};
4201 }
4202 
4203 InstructionSelector::ComplexRendererFns
4204 AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4205   Register Src;
4206   unsigned Mods;
4207   std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4208                                            /*IsCanonicalizing=*/true,
4209                                            /*AllowAbs=*/false,
4210                                            /*OpSel=*/true);
4211 
4212   return {{
4213       [=](MachineInstrBuilder &MIB) {
4214         MIB.addReg(
4215             copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4216       },
4217       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4218   }};
4219 }
4220 
4221 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4222                                                  Register &Base,
4223                                                  Register *SOffset,
4224                                                  int64_t *Offset) const {
4225   MachineInstr *MI = Root.getParent();
4226   MachineBasicBlock *MBB = MI->getParent();
4227 
4228   // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4229   // then we can select all ptr + 32-bit offsets.
4230   SmallVector<GEPInfo, 4> AddrInfo;
4231   getAddrModeInfo(*MI, *MRI, AddrInfo);
4232 
4233   if (AddrInfo.empty())
4234     return false;
4235 
4236   const GEPInfo &GEPI = AddrInfo[0];
4237   std::optional<int64_t> EncodedImm =
4238       AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
4239 
4240   if (SOffset && Offset) {
4241     if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4242         AddrInfo.size() > 1) {
4243       const GEPInfo &GEPI2 = AddrInfo[1];
4244       if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4245         if (Register OffsetReg =
4246                 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4247           Base = GEPI2.SgprParts[0];
4248           *SOffset = OffsetReg;
4249           *Offset = *EncodedImm;
4250           return true;
4251         }
4252       }
4253     }
4254     return false;
4255   }
4256 
4257   if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4258     Base = GEPI.SgprParts[0];
4259     *Offset = *EncodedImm;
4260     return true;
4261   }
4262 
4263   // SGPR offset is unsigned.
4264   if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4265       GEPI.Imm != 0) {
4266     // If we make it this far we have a load with an 32-bit immediate offset.
4267     // It is OK to select this using a sgpr offset, because we have already
4268     // failed trying to select this load into one of the _IMM variants since
4269     // the _IMM Patterns are considered before the _SGPR patterns.
4270     Base = GEPI.SgprParts[0];
4271     *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4272     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4273         .addImm(GEPI.Imm);
4274     return true;
4275   }
4276 
4277   if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4278     if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4279       Base = GEPI.SgprParts[0];
4280       *SOffset = OffsetReg;
4281       return true;
4282     }
4283   }
4284 
4285   return false;
4286 }
4287 
4288 InstructionSelector::ComplexRendererFns
4289 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4290   Register Base;
4291   int64_t Offset;
4292   if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4293     return std::nullopt;
4294 
4295   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4296            [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4297 }
4298 
4299 InstructionSelector::ComplexRendererFns
4300 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4301   SmallVector<GEPInfo, 4> AddrInfo;
4302   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4303 
4304   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4305     return std::nullopt;
4306 
4307   const GEPInfo &GEPInfo = AddrInfo[0];
4308   Register PtrReg = GEPInfo.SgprParts[0];
4309   std::optional<int64_t> EncodedImm =
4310       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4311   if (!EncodedImm)
4312     return std::nullopt;
4313 
4314   return {{
4315     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4316     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4317   }};
4318 }
4319 
4320 InstructionSelector::ComplexRendererFns
4321 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4322   Register Base, SOffset;
4323   if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4324     return std::nullopt;
4325 
4326   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4327            [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4328 }
4329 
4330 InstructionSelector::ComplexRendererFns
4331 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4332   Register Base, SOffset;
4333   int64_t Offset;
4334   if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4335     return std::nullopt;
4336 
4337   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4338            [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4339            [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4340 }
4341 
4342 std::pair<Register, int>
4343 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4344                                                 uint64_t FlatVariant) const {
4345   MachineInstr *MI = Root.getParent();
4346 
4347   auto Default = std::pair(Root.getReg(), 0);
4348 
4349   if (!STI.hasFlatInstOffsets())
4350     return Default;
4351 
4352   Register PtrBase;
4353   int64_t ConstOffset;
4354   std::tie(PtrBase, ConstOffset) =
4355       getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4356 
4357   if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4358                            !isFlatScratchBaseLegal(Root.getReg())))
4359     return Default;
4360 
4361   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4362   if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4363     return Default;
4364 
4365   return std::pair(PtrBase, ConstOffset);
4366 }
4367 
4368 InstructionSelector::ComplexRendererFns
4369 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4370   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4371 
4372   return {{
4373       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4374       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4375     }};
4376 }
4377 
4378 InstructionSelector::ComplexRendererFns
4379 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4380   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4381 
4382   return {{
4383       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4384       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4385   }};
4386 }
4387 
4388 InstructionSelector::ComplexRendererFns
4389 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4390   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4391 
4392   return {{
4393       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4394       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4395     }};
4396 }
4397 
4398 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4399 InstructionSelector::ComplexRendererFns
4400 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4401   Register Addr = Root.getReg();
4402   Register PtrBase;
4403   int64_t ConstOffset;
4404   int64_t ImmOffset = 0;
4405 
4406   // Match the immediate offset first, which canonically is moved as low as
4407   // possible.
4408   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4409 
4410   if (ConstOffset != 0) {
4411     if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4412                               SIInstrFlags::FlatGlobal)) {
4413       Addr = PtrBase;
4414       ImmOffset = ConstOffset;
4415     } else {
4416       auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4417       if (isSGPR(PtrBaseDef->Reg)) {
4418         if (ConstOffset > 0) {
4419           // Offset is too large.
4420           //
4421           // saddr + large_offset -> saddr +
4422           //                         (voffset = large_offset & ~MaxOffset) +
4423           //                         (large_offset & MaxOffset);
4424           int64_t SplitImmOffset, RemainderOffset;
4425           std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4426               ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4427 
4428           if (isUInt<32>(RemainderOffset)) {
4429             MachineInstr *MI = Root.getParent();
4430             MachineBasicBlock *MBB = MI->getParent();
4431             Register HighBits =
4432                 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4433 
4434             BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4435                     HighBits)
4436                 .addImm(RemainderOffset);
4437 
4438             return {{
4439                 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4440                 [=](MachineInstrBuilder &MIB) {
4441                   MIB.addReg(HighBits);
4442                 }, // voffset
4443                 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4444             }};
4445           }
4446         }
4447 
4448         // We are adding a 64 bit SGPR and a constant. If constant bus limit
4449         // is 1 we would need to perform 1 or 2 extra moves for each half of
4450         // the constant and it is better to do a scalar add and then issue a
4451         // single VALU instruction to materialize zero. Otherwise it is less
4452         // instructions to perform VALU adds with immediates or inline literals.
4453         unsigned NumLiterals =
4454             !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4455             !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4456         if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4457           return std::nullopt;
4458       }
4459     }
4460   }
4461 
4462   // Match the variable offset.
4463   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4464   if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4465     // Look through the SGPR->VGPR copy.
4466     Register SAddr =
4467         getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4468 
4469     if (isSGPR(SAddr)) {
4470       Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4471 
4472       // It's possible voffset is an SGPR here, but the copy to VGPR will be
4473       // inserted later.
4474       if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4475         return {{[=](MachineInstrBuilder &MIB) { // saddr
4476                    MIB.addReg(SAddr);
4477                  },
4478                  [=](MachineInstrBuilder &MIB) { // voffset
4479                    MIB.addReg(VOffset);
4480                  },
4481                  [=](MachineInstrBuilder &MIB) { // offset
4482                    MIB.addImm(ImmOffset);
4483                  }}};
4484       }
4485     }
4486   }
4487 
4488   // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4489   // drop this.
4490   if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4491       AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4492     return std::nullopt;
4493 
4494   // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4495   // moves required to copy a 64-bit SGPR to VGPR.
4496   MachineInstr *MI = Root.getParent();
4497   MachineBasicBlock *MBB = MI->getParent();
4498   Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4499 
4500   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4501       .addImm(0);
4502 
4503   return {{
4504       [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4505       [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },      // voffset
4506       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }     // offset
4507   }};
4508 }
4509 
4510 InstructionSelector::ComplexRendererFns
4511 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4512   Register Addr = Root.getReg();
4513   Register PtrBase;
4514   int64_t ConstOffset;
4515   int64_t ImmOffset = 0;
4516 
4517   // Match the immediate offset first, which canonically is moved as low as
4518   // possible.
4519   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4520 
4521   if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4522       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4523                             SIInstrFlags::FlatScratch)) {
4524     Addr = PtrBase;
4525     ImmOffset = ConstOffset;
4526   }
4527 
4528   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4529   if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4530     int FI = AddrDef->MI->getOperand(1).getIndex();
4531     return {{
4532         [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4533         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4534     }};
4535   }
4536 
4537   Register SAddr = AddrDef->Reg;
4538 
4539   if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4540     Register LHS = AddrDef->MI->getOperand(1).getReg();
4541     Register RHS = AddrDef->MI->getOperand(2).getReg();
4542     auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4543     auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4544 
4545     if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4546         isSGPR(RHSDef->Reg)) {
4547       int FI = LHSDef->MI->getOperand(1).getIndex();
4548       MachineInstr &I = *Root.getParent();
4549       MachineBasicBlock *BB = I.getParent();
4550       const DebugLoc &DL = I.getDebugLoc();
4551       SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4552 
4553       BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4554           .addFrameIndex(FI)
4555           .addReg(RHSDef->Reg)
4556           .setOperandDead(3); // Dead scc
4557     }
4558   }
4559 
4560   if (!isSGPR(SAddr))
4561     return std::nullopt;
4562 
4563   return {{
4564       [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4565       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4566   }};
4567 }
4568 
4569 // Check whether the flat scratch SVS swizzle bug affects this access.
4570 bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4571     Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4572   if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4573     return false;
4574 
4575   // The bug affects the swizzling of SVS accesses if there is any carry out
4576   // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4577   // voffset to (soffset + inst_offset).
4578   auto VKnown = KB->getKnownBits(VAddr);
4579   auto SKnown = KnownBits::computeForAddSub(
4580       true, false, KB->getKnownBits(SAddr),
4581       KnownBits::makeConstant(APInt(32, ImmOffset)));
4582   uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4583   uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4584   return (VMax & 3) + (SMax & 3) >= 4;
4585 }
4586 
4587 InstructionSelector::ComplexRendererFns
4588 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4589   Register Addr = Root.getReg();
4590   Register PtrBase;
4591   int64_t ConstOffset;
4592   int64_t ImmOffset = 0;
4593 
4594   // Match the immediate offset first, which canonically is moved as low as
4595   // possible.
4596   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4597 
4598   Register OrigAddr = Addr;
4599   if (ConstOffset != 0 &&
4600       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4601     Addr = PtrBase;
4602     ImmOffset = ConstOffset;
4603   }
4604 
4605   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4606   if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4607     return std::nullopt;
4608 
4609   Register RHS = AddrDef->MI->getOperand(2).getReg();
4610   if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4611     return std::nullopt;
4612 
4613   Register LHS = AddrDef->MI->getOperand(1).getReg();
4614   auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4615 
4616   if (OrigAddr != Addr) {
4617     if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4618       return std::nullopt;
4619   } else {
4620     if (!isFlatScratchBaseLegalSV(OrigAddr))
4621       return std::nullopt;
4622   }
4623 
4624   if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4625     return std::nullopt;
4626 
4627   if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4628     int FI = LHSDef->MI->getOperand(1).getIndex();
4629     return {{
4630         [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4631         [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4632         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4633     }};
4634   }
4635 
4636   if (!isSGPR(LHS))
4637     return std::nullopt;
4638 
4639   return {{
4640       [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4641       [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4642       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4643   }};
4644 }
4645 
4646 InstructionSelector::ComplexRendererFns
4647 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4648   MachineInstr *MI = Root.getParent();
4649   MachineBasicBlock *MBB = MI->getParent();
4650   MachineFunction *MF = MBB->getParent();
4651   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4652 
4653   int64_t Offset = 0;
4654   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4655       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
4656     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4657 
4658     // TODO: Should this be inside the render function? The iterator seems to
4659     // move.
4660     const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4661     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4662             HighBits)
4663         .addImm(Offset & ~MaxOffset);
4664 
4665     return {{[=](MachineInstrBuilder &MIB) { // rsrc
4666                MIB.addReg(Info->getScratchRSrcReg());
4667              },
4668              [=](MachineInstrBuilder &MIB) { // vaddr
4669                MIB.addReg(HighBits);
4670              },
4671              [=](MachineInstrBuilder &MIB) { // soffset
4672                // Use constant zero for soffset and rely on eliminateFrameIndex
4673                // to choose the appropriate frame register if need be.
4674                MIB.addImm(0);
4675              },
4676              [=](MachineInstrBuilder &MIB) { // offset
4677                MIB.addImm(Offset & MaxOffset);
4678              }}};
4679   }
4680 
4681   assert(Offset == 0 || Offset == -1);
4682 
4683   // Try to fold a frame index directly into the MUBUF vaddr field, and any
4684   // offsets.
4685   std::optional<int> FI;
4686   Register VAddr = Root.getReg();
4687   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4688     Register PtrBase;
4689     int64_t ConstOffset;
4690     std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4691     if (ConstOffset != 0) {
4692       if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
4693           (!STI.privateMemoryResourceIsRangeChecked() ||
4694            KB->signBitIsZero(PtrBase))) {
4695         const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4696         if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4697           FI = PtrBaseDef->getOperand(1).getIndex();
4698         else
4699           VAddr = PtrBase;
4700         Offset = ConstOffset;
4701       }
4702     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4703       FI = RootDef->getOperand(1).getIndex();
4704     }
4705   }
4706 
4707   return {{[=](MachineInstrBuilder &MIB) { // rsrc
4708              MIB.addReg(Info->getScratchRSrcReg());
4709            },
4710            [=](MachineInstrBuilder &MIB) { // vaddr
4711              if (FI)
4712                MIB.addFrameIndex(*FI);
4713              else
4714                MIB.addReg(VAddr);
4715            },
4716            [=](MachineInstrBuilder &MIB) { // soffset
4717              // Use constant zero for soffset and rely on eliminateFrameIndex
4718              // to choose the appropriate frame register if need be.
4719              MIB.addImm(0);
4720            },
4721            [=](MachineInstrBuilder &MIB) { // offset
4722              MIB.addImm(Offset);
4723            }}};
4724 }
4725 
4726 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4727                                                 int64_t Offset) const {
4728   if (!isUInt<16>(Offset))
4729     return false;
4730 
4731   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4732     return true;
4733 
4734   // On Southern Islands instruction with a negative base value and an offset
4735   // don't seem to work.
4736   return KB->signBitIsZero(Base);
4737 }
4738 
4739 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4740                                                  int64_t Offset1,
4741                                                  unsigned Size) const {
4742   if (Offset0 % Size != 0 || Offset1 % Size != 0)
4743     return false;
4744   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4745     return false;
4746 
4747   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4748     return true;
4749 
4750   // On Southern Islands instruction with a negative base value and an offset
4751   // don't seem to work.
4752   return KB->signBitIsZero(Base);
4753 }
4754 
4755 // Return whether the operation has NoUnsignedWrap property.
4756 static bool isNoUnsignedWrap(MachineInstr *Addr) {
4757   return Addr->getOpcode() == TargetOpcode::G_OR ||
4758          (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4759           Addr->getFlag(MachineInstr::NoUWrap));
4760 }
4761 
4762 // Check that the base address of flat scratch load/store in the form of `base +
4763 // offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
4764 // requirement). We always treat the first operand as the base address here.
4765 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
4766   MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4767 
4768   if (isNoUnsignedWrap(AddrMI))
4769     return true;
4770 
4771   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4772   // values.
4773   if (STI.hasSignedScratchOffsets())
4774     return true;
4775 
4776   Register LHS = AddrMI->getOperand(1).getReg();
4777   Register RHS = AddrMI->getOperand(2).getReg();
4778 
4779   if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
4780     std::optional<ValueAndVReg> RhsValReg =
4781         getIConstantVRegValWithLookThrough(RHS, *MRI);
4782     // If the immediate offset is negative and within certain range, the base
4783     // address cannot also be negative. If the base is also negative, the sum
4784     // would be either negative or much larger than the valid range of scratch
4785     // memory a thread can access.
4786     if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
4787         RhsValReg->Value.getSExtValue() > -0x40000000)
4788       return true;
4789   }
4790 
4791   return KB->signBitIsZero(LHS);
4792 }
4793 
4794 // Check address value in SGPR/VGPR are legal for flat scratch in the form
4795 // of: SGPR + VGPR.
4796 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
4797   MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4798 
4799   if (isNoUnsignedWrap(AddrMI))
4800     return true;
4801 
4802   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4803   // values.
4804   if (STI.hasSignedScratchOffsets())
4805     return true;
4806 
4807   Register LHS = AddrMI->getOperand(1).getReg();
4808   Register RHS = AddrMI->getOperand(2).getReg();
4809   return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4810 }
4811 
4812 // Check address value in SGPR/VGPR are legal for flat scratch in the form
4813 // of: SGPR + VGPR + Imm.
4814 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
4815     Register Addr) const {
4816   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4817   // values.
4818   if (STI.hasSignedScratchOffsets())
4819     return true;
4820 
4821   MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4822   Register Base = AddrMI->getOperand(1).getReg();
4823   std::optional<DefinitionAndSourceRegister> BaseDef =
4824       getDefSrcRegIgnoringCopies(Base, *MRI);
4825   std::optional<ValueAndVReg> RHSOffset =
4826       getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
4827   assert(RHSOffset);
4828 
4829   // If the immediate offset is negative and within certain range, the base
4830   // address cannot also be negative. If the base is also negative, the sum
4831   // would be either negative or much larger than the valid range of scratch
4832   // memory a thread can access.
4833   if (isNoUnsignedWrap(BaseDef->MI) &&
4834       (isNoUnsignedWrap(AddrMI) ||
4835        (RHSOffset->Value.getSExtValue() < 0 &&
4836         RHSOffset->Value.getSExtValue() > -0x40000000)))
4837     return true;
4838 
4839   Register LHS = BaseDef->MI->getOperand(1).getReg();
4840   Register RHS = BaseDef->MI->getOperand(2).getReg();
4841   return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4842 }
4843 
4844 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4845                                                     unsigned ShAmtBits) const {
4846   assert(MI.getOpcode() == TargetOpcode::G_AND);
4847 
4848   std::optional<APInt> RHS =
4849       getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4850   if (!RHS)
4851     return false;
4852 
4853   if (RHS->countr_one() >= ShAmtBits)
4854     return true;
4855 
4856   const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
4857   return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
4858 }
4859 
4860 InstructionSelector::ComplexRendererFns
4861 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4862     MachineOperand &Root) const {
4863   Register Reg = Root.getReg();
4864   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4865 
4866   std::optional<DefinitionAndSourceRegister> Def =
4867     getDefSrcRegIgnoringCopies(Reg, *MRI);
4868   assert(Def && "this shouldn't be an optional result");
4869   Reg = Def->Reg;
4870 
4871   if (Register WaveBase = getWaveAddress(Def->MI)) {
4872     return {{
4873         [=](MachineInstrBuilder &MIB) { // rsrc
4874           MIB.addReg(Info->getScratchRSrcReg());
4875         },
4876         [=](MachineInstrBuilder &MIB) { // soffset
4877           MIB.addReg(WaveBase);
4878         },
4879         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4880     }};
4881   }
4882 
4883   int64_t Offset = 0;
4884 
4885   // FIXME: Copy check is a hack
4886   Register BasePtr;
4887   if (mi_match(Reg, *MRI,
4888                m_GPtrAdd(m_Reg(BasePtr),
4889                          m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
4890     if (!TII.isLegalMUBUFImmOffset(Offset))
4891       return {};
4892     MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
4893     Register WaveBase = getWaveAddress(BasePtrDef);
4894     if (!WaveBase)
4895       return {};
4896 
4897     return {{
4898         [=](MachineInstrBuilder &MIB) { // rsrc
4899           MIB.addReg(Info->getScratchRSrcReg());
4900         },
4901         [=](MachineInstrBuilder &MIB) { // soffset
4902           MIB.addReg(WaveBase);
4903         },
4904         [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4905     }};
4906   }
4907 
4908   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
4909       !TII.isLegalMUBUFImmOffset(Offset))
4910     return {};
4911 
4912   return {{
4913       [=](MachineInstrBuilder &MIB) { // rsrc
4914         MIB.addReg(Info->getScratchRSrcReg());
4915       },
4916       [=](MachineInstrBuilder &MIB) { // soffset
4917         MIB.addImm(0);
4918       },
4919       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4920   }};
4921 }
4922 
4923 std::pair<Register, unsigned>
4924 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4925   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4926   if (!RootDef)
4927     return std::pair(Root.getReg(), 0);
4928 
4929   int64_t ConstAddr = 0;
4930 
4931   Register PtrBase;
4932   int64_t Offset;
4933   std::tie(PtrBase, Offset) =
4934     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4935 
4936   if (Offset) {
4937     if (isDSOffsetLegal(PtrBase, Offset)) {
4938       // (add n0, c0)
4939       return std::pair(PtrBase, Offset);
4940     }
4941   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4942     // TODO
4943 
4944 
4945   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4946     // TODO
4947 
4948   }
4949 
4950   return std::pair(Root.getReg(), 0);
4951 }
4952 
4953 InstructionSelector::ComplexRendererFns
4954 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4955   Register Reg;
4956   unsigned Offset;
4957   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
4958   return {{
4959       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4960       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
4961     }};
4962 }
4963 
4964 InstructionSelector::ComplexRendererFns
4965 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4966   return selectDSReadWrite2(Root, 4);
4967 }
4968 
4969 InstructionSelector::ComplexRendererFns
4970 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4971   return selectDSReadWrite2(Root, 8);
4972 }
4973 
4974 InstructionSelector::ComplexRendererFns
4975 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4976                                               unsigned Size) const {
4977   Register Reg;
4978   unsigned Offset;
4979   std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
4980   return {{
4981       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4982       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4983       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4984     }};
4985 }
4986 
4987 std::pair<Register, unsigned>
4988 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4989                                                   unsigned Size) const {
4990   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4991   if (!RootDef)
4992     return std::pair(Root.getReg(), 0);
4993 
4994   int64_t ConstAddr = 0;
4995 
4996   Register PtrBase;
4997   int64_t Offset;
4998   std::tie(PtrBase, Offset) =
4999     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5000 
5001   if (Offset) {
5002     int64_t OffsetValue0 = Offset;
5003     int64_t OffsetValue1 = Offset + Size;
5004     if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
5005       // (add n0, c0)
5006       return std::pair(PtrBase, OffsetValue0 / Size);
5007     }
5008   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5009     // TODO
5010 
5011   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5012     // TODO
5013 
5014   }
5015 
5016   return std::pair(Root.getReg(), 0);
5017 }
5018 
5019 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
5020 /// the base value with the constant offset. There may be intervening copies
5021 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
5022 /// not match the pattern.
5023 std::pair<Register, int64_t>
5024 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
5025   Register Root, const MachineRegisterInfo &MRI) const {
5026   MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
5027   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
5028     return {Root, 0};
5029 
5030   MachineOperand &RHS = RootI->getOperand(2);
5031   std::optional<ValueAndVReg> MaybeOffset =
5032       getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5033   if (!MaybeOffset)
5034     return {Root, 0};
5035   return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
5036 }
5037 
5038 static void addZeroImm(MachineInstrBuilder &MIB) {
5039   MIB.addImm(0);
5040 }
5041 
5042 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5043 /// BasePtr is not valid, a null base pointer will be used.
5044 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5045                           uint32_t FormatLo, uint32_t FormatHi,
5046                           Register BasePtr) {
5047   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5048   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5049   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5050   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5051 
5052   B.buildInstr(AMDGPU::S_MOV_B32)
5053     .addDef(RSrc2)
5054     .addImm(FormatLo);
5055   B.buildInstr(AMDGPU::S_MOV_B32)
5056     .addDef(RSrc3)
5057     .addImm(FormatHi);
5058 
5059   // Build the half of the subregister with the constants before building the
5060   // full 128-bit register. If we are building multiple resource descriptors,
5061   // this will allow CSEing of the 2-component register.
5062   B.buildInstr(AMDGPU::REG_SEQUENCE)
5063     .addDef(RSrcHi)
5064     .addReg(RSrc2)
5065     .addImm(AMDGPU::sub0)
5066     .addReg(RSrc3)
5067     .addImm(AMDGPU::sub1);
5068 
5069   Register RSrcLo = BasePtr;
5070   if (!BasePtr) {
5071     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5072     B.buildInstr(AMDGPU::S_MOV_B64)
5073       .addDef(RSrcLo)
5074       .addImm(0);
5075   }
5076 
5077   B.buildInstr(AMDGPU::REG_SEQUENCE)
5078     .addDef(RSrc)
5079     .addReg(RSrcLo)
5080     .addImm(AMDGPU::sub0_sub1)
5081     .addReg(RSrcHi)
5082     .addImm(AMDGPU::sub2_sub3);
5083 
5084   return RSrc;
5085 }
5086 
5087 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5088                                 const SIInstrInfo &TII, Register BasePtr) {
5089   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5090 
5091   // FIXME: Why are half the "default" bits ignored based on the addressing
5092   // mode?
5093   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5094 }
5095 
5096 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5097                                const SIInstrInfo &TII, Register BasePtr) {
5098   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5099 
5100   // FIXME: Why are half the "default" bits ignored based on the addressing
5101   // mode?
5102   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5103 }
5104 
5105 AMDGPUInstructionSelector::MUBUFAddressData
5106 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5107   MUBUFAddressData Data;
5108   Data.N0 = Src;
5109 
5110   Register PtrBase;
5111   int64_t Offset;
5112 
5113   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5114   if (isUInt<32>(Offset)) {
5115     Data.N0 = PtrBase;
5116     Data.Offset = Offset;
5117   }
5118 
5119   if (MachineInstr *InputAdd
5120       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5121     Data.N2 = InputAdd->getOperand(1).getReg();
5122     Data.N3 = InputAdd->getOperand(2).getReg();
5123 
5124     // FIXME: Need to fix extra SGPR->VGPRcopies inserted
5125     // FIXME: Don't know this was defined by operand 0
5126     //
5127     // TODO: Remove this when we have copy folding optimizations after
5128     // RegBankSelect.
5129     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5130     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5131   }
5132 
5133   return Data;
5134 }
5135 
5136 /// Return if the addr64 mubuf mode should be used for the given address.
5137 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5138   // (ptr_add N2, N3) -> addr64, or
5139   // (ptr_add (ptr_add N2, N3), C1) -> addr64
5140   if (Addr.N2)
5141     return true;
5142 
5143   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5144   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5145 }
5146 
5147 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
5148 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5149 /// component.
5150 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5151   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5152   if (TII.isLegalMUBUFImmOffset(ImmOffset))
5153     return;
5154 
5155   // Illegal offset, store it in soffset.
5156   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5157   B.buildInstr(AMDGPU::S_MOV_B32)
5158     .addDef(SOffset)
5159     .addImm(ImmOffset);
5160   ImmOffset = 0;
5161 }
5162 
5163 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5164   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5165   Register &SOffset, int64_t &Offset) const {
5166   // FIXME: Predicates should stop this from reaching here.
5167   // addr64 bit was removed for volcanic islands.
5168   if (!STI.hasAddr64() || STI.useFlatForGlobal())
5169     return false;
5170 
5171   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5172   if (!shouldUseAddr64(AddrData))
5173     return false;
5174 
5175   Register N0 = AddrData.N0;
5176   Register N2 = AddrData.N2;
5177   Register N3 = AddrData.N3;
5178   Offset = AddrData.Offset;
5179 
5180   // Base pointer for the SRD.
5181   Register SRDPtr;
5182 
5183   if (N2) {
5184     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5185       assert(N3);
5186       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5187         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5188         // addr64, and construct the default resource from a 0 address.
5189         VAddr = N0;
5190       } else {
5191         SRDPtr = N3;
5192         VAddr = N2;
5193       }
5194     } else {
5195       // N2 is not divergent.
5196       SRDPtr = N2;
5197       VAddr = N3;
5198     }
5199   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5200     // Use the default null pointer in the resource
5201     VAddr = N0;
5202   } else {
5203     // N0 -> offset, or
5204     // (N0 + C1) -> offset
5205     SRDPtr = N0;
5206   }
5207 
5208   MachineIRBuilder B(*Root.getParent());
5209   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5210   splitIllegalMUBUFOffset(B, SOffset, Offset);
5211   return true;
5212 }
5213 
5214 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5215   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5216   int64_t &Offset) const {
5217 
5218   // FIXME: Pattern should not reach here.
5219   if (STI.useFlatForGlobal())
5220     return false;
5221 
5222   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5223   if (shouldUseAddr64(AddrData))
5224     return false;
5225 
5226   // N0 -> offset, or
5227   // (N0 + C1) -> offset
5228   Register SRDPtr = AddrData.N0;
5229   Offset = AddrData.Offset;
5230 
5231   // TODO: Look through extensions for 32-bit soffset.
5232   MachineIRBuilder B(*Root.getParent());
5233 
5234   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5235   splitIllegalMUBUFOffset(B, SOffset, Offset);
5236   return true;
5237 }
5238 
5239 InstructionSelector::ComplexRendererFns
5240 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5241   Register VAddr;
5242   Register RSrcReg;
5243   Register SOffset;
5244   int64_t Offset = 0;
5245 
5246   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5247     return {};
5248 
5249   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5250   // pattern.
5251   return {{
5252       [=](MachineInstrBuilder &MIB) {  // rsrc
5253         MIB.addReg(RSrcReg);
5254       },
5255       [=](MachineInstrBuilder &MIB) { // vaddr
5256         MIB.addReg(VAddr);
5257       },
5258       [=](MachineInstrBuilder &MIB) { // soffset
5259         if (SOffset)
5260           MIB.addReg(SOffset);
5261         else if (STI.hasRestrictedSOffset())
5262           MIB.addReg(AMDGPU::SGPR_NULL);
5263         else
5264           MIB.addImm(0);
5265       },
5266       [=](MachineInstrBuilder &MIB) { // offset
5267         MIB.addImm(Offset);
5268       },
5269       addZeroImm, //  cpol
5270       addZeroImm, //  tfe
5271       addZeroImm  //  swz
5272     }};
5273 }
5274 
5275 InstructionSelector::ComplexRendererFns
5276 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5277   Register RSrcReg;
5278   Register SOffset;
5279   int64_t Offset = 0;
5280 
5281   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5282     return {};
5283 
5284   return {{
5285       [=](MachineInstrBuilder &MIB) {  // rsrc
5286         MIB.addReg(RSrcReg);
5287       },
5288       [=](MachineInstrBuilder &MIB) { // soffset
5289         if (SOffset)
5290           MIB.addReg(SOffset);
5291         else if (STI.hasRestrictedSOffset())
5292           MIB.addReg(AMDGPU::SGPR_NULL);
5293         else
5294           MIB.addImm(0);
5295       },
5296       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5297       addZeroImm, //  cpol
5298       addZeroImm, //  tfe
5299       addZeroImm, //  swz
5300     }};
5301 }
5302 
5303 InstructionSelector::ComplexRendererFns
5304 AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5305 
5306   Register SOffset = Root.getReg();
5307 
5308   if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5309     SOffset = AMDGPU::SGPR_NULL;
5310 
5311   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5312 }
5313 
5314 /// Get an immediate that must be 32-bits, and treated as zero extended.
5315 static std::optional<uint64_t>
5316 getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5317   // getIConstantVRegVal sexts any values, so see if that matters.
5318   std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5319   if (!OffsetVal || !isInt<32>(*OffsetVal))
5320     return std::nullopt;
5321   return Lo_32(*OffsetVal);
5322 }
5323 
5324 InstructionSelector::ComplexRendererFns
5325 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5326   std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5327   if (!OffsetVal)
5328     return {};
5329 
5330   std::optional<int64_t> EncodedImm =
5331       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5332   if (!EncodedImm)
5333     return {};
5334 
5335   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
5336 }
5337 
5338 InstructionSelector::ComplexRendererFns
5339 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5340   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
5341 
5342   std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5343   if (!OffsetVal)
5344     return {};
5345 
5346   std::optional<int64_t> EncodedImm =
5347       AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5348   if (!EncodedImm)
5349     return {};
5350 
5351   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
5352 }
5353 
5354 InstructionSelector::ComplexRendererFns
5355 AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5356   // Match the (soffset + offset) pair as a 32-bit register base and
5357   // an immediate offset.
5358   Register SOffset;
5359   unsigned Offset;
5360   std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5361       *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5362   if (!SOffset)
5363     return std::nullopt;
5364 
5365   std::optional<int64_t> EncodedOffset =
5366       AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5367   if (!EncodedOffset)
5368     return std::nullopt;
5369 
5370   assert(MRI->getType(SOffset) == LLT::scalar(32));
5371   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5372            [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5373 }
5374 
5375 // Variant of stripBitCast that returns the instruction instead of a
5376 // MachineOperand.
5377 static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
5378   if (MI->getOpcode() == AMDGPU::G_BITCAST)
5379     return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
5380   return MI;
5381 }
5382 
5383 // Figure out if this is really an extract of the high 16-bits of a dword,
5384 // returns nullptr if it isn't.
5385 static MachineInstr *isExtractHiElt(MachineInstr *Inst,
5386                                     MachineRegisterInfo &MRI) {
5387   Inst = stripBitCast(Inst, MRI);
5388 
5389   if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5390     return nullptr;
5391 
5392   MachineInstr *TruncOp =
5393       getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
5394   TruncOp = stripBitCast(TruncOp, MRI);
5395 
5396   // G_LSHR x, (G_CONSTANT i32 16)
5397   if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5398     auto SrlAmount = getIConstantVRegValWithLookThrough(
5399         TruncOp->getOperand(2).getReg(), MRI);
5400     if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5401       MachineInstr *SrlOp =
5402           getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5403       return stripBitCast(SrlOp, MRI);
5404     }
5405   }
5406 
5407   // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
5408   //    1, 0 swaps the low/high 16 bits.
5409   //    1, 1 sets the high 16 bits to be the same as the low 16.
5410   // in any case, it selects the high elts.
5411   if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5412     assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5413            LLT::fixed_vector(2, 16));
5414 
5415     ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
5416     assert(Mask.size() == 2);
5417 
5418     if (Mask[0] == 1 && Mask[1] <= 1) {
5419       MachineInstr *LHS =
5420           getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5421       return stripBitCast(LHS, MRI);
5422     }
5423   }
5424 
5425   return nullptr;
5426 }
5427 
5428 std::pair<Register, unsigned>
5429 AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5430                                                      bool &Matched) const {
5431   Matched = false;
5432 
5433   Register Src;
5434   unsigned Mods;
5435   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
5436 
5437   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
5438   if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5439     MachineOperand *MO = &MI->getOperand(1);
5440     Src = MO->getReg();
5441     MI = getDefIgnoringCopies(Src, *MRI);
5442 
5443     assert(MRI->getType(Src) == LLT::scalar(16));
5444 
5445     // See through bitcasts.
5446     // FIXME: Would be nice to use stripBitCast here.
5447     if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5448       MO = &MI->getOperand(1);
5449       Src = MO->getReg();
5450       MI = getDefIgnoringCopies(Src, *MRI);
5451     }
5452 
5453     const auto CheckAbsNeg = [&]() {
5454       // Be careful about folding modifiers if we already have an abs. fneg is
5455       // applied last, so we don't want to apply an earlier fneg.
5456       if ((Mods & SISrcMods::ABS) == 0) {
5457         unsigned ModsTmp;
5458         std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
5459         MI = getDefIgnoringCopies(Src, *MRI);
5460 
5461         if ((ModsTmp & SISrcMods::NEG) != 0)
5462           Mods ^= SISrcMods::NEG;
5463 
5464         if ((ModsTmp & SISrcMods::ABS) != 0)
5465           Mods |= SISrcMods::ABS;
5466       }
5467     };
5468 
5469     CheckAbsNeg();
5470 
5471     // op_sel/op_sel_hi decide the source type and source.
5472     // If the source's op_sel_hi is set, it indicates to do a conversion from
5473     // fp16. If the sources's op_sel is set, it picks the high half of the
5474     // source register.
5475 
5476     Mods |= SISrcMods::OP_SEL_1;
5477 
5478     if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
5479       Mods |= SISrcMods::OP_SEL_0;
5480       MI = ExtractHiEltMI;
5481       MO = &MI->getOperand(0);
5482       Src = MO->getReg();
5483 
5484       CheckAbsNeg();
5485     }
5486 
5487     Matched = true;
5488   }
5489 
5490   return {Src, Mods};
5491 }
5492 
5493 InstructionSelector::ComplexRendererFns
5494 AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5495     MachineOperand &Root) const {
5496   Register Src;
5497   unsigned Mods;
5498   bool Matched;
5499   std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5500   if (!Matched)
5501     return {};
5502 
5503   return {{
5504       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5505       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5506   }};
5507 }
5508 
5509 InstructionSelector::ComplexRendererFns
5510 AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5511   Register Src;
5512   unsigned Mods;
5513   bool Matched;
5514   std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5515 
5516   return {{
5517       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5518       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5519   }};
5520 }
5521 
5522 bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5523     MachineInstr &I, Intrinsic::ID IntrID) const {
5524   MachineBasicBlock *MBB = I.getParent();
5525   const DebugLoc &DL = I.getDebugLoc();
5526   Register CCReg = I.getOperand(0).getReg();
5527 
5528   bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
5529 
5530   if (HasM0) {
5531     auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5532                        .addReg(I.getOperand(2).getReg());
5533     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
5534     if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
5535       return false;
5536   } else {
5537     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5538         .addImm(I.getOperand(2).getImm());
5539   }
5540 
5541   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5542 
5543   I.eraseFromParent();
5544   return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5545                                       *MRI);
5546 }
5547 
5548 unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5549   if (HasInlineConst) {
5550     switch (IntrID) {
5551     default:
5552       llvm_unreachable("not a named barrier op");
5553     case Intrinsic::amdgcn_s_barrier_init:
5554       return AMDGPU::S_BARRIER_INIT_IMM;
5555     case Intrinsic::amdgcn_s_barrier_join:
5556       return AMDGPU::S_BARRIER_JOIN_IMM;
5557     case Intrinsic::amdgcn_s_wakeup_barrier:
5558       return AMDGPU::S_WAKEUP_BARRIER_IMM;
5559     case Intrinsic::amdgcn_s_get_barrier_state:
5560       return AMDGPU::S_GET_BARRIER_STATE_IMM;
5561     };
5562   } else {
5563     switch (IntrID) {
5564     default:
5565       llvm_unreachable("not a named barrier op");
5566     case Intrinsic::amdgcn_s_barrier_init:
5567       return AMDGPU::S_BARRIER_INIT_M0;
5568     case Intrinsic::amdgcn_s_barrier_join:
5569       return AMDGPU::S_BARRIER_JOIN_M0;
5570     case Intrinsic::amdgcn_s_wakeup_barrier:
5571       return AMDGPU::S_WAKEUP_BARRIER_M0;
5572     case Intrinsic::amdgcn_s_get_barrier_state:
5573       return AMDGPU::S_GET_BARRIER_STATE_M0;
5574     };
5575   }
5576 }
5577 
5578 bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5579     MachineInstr &I, Intrinsic::ID IntrID) const {
5580   MachineBasicBlock *MBB = I.getParent();
5581   const DebugLoc &DL = I.getDebugLoc();
5582   MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
5583                              ? I.getOperand(2)
5584                              : I.getOperand(1);
5585   std::optional<int64_t> BarValImm =
5586       getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5587   Register M0Val;
5588   Register TmpReg0;
5589 
5590   // For S_BARRIER_INIT, member count will always be read from M0[16:22]
5591   if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5592     Register MemberCount = I.getOperand(2).getReg();
5593     TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5594     // TODO: This should be expanded during legalization so that the the S_LSHL
5595     // and S_OR can be constant-folded
5596     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
5597         .addImm(16)
5598         .addReg(MemberCount);
5599     M0Val = TmpReg0;
5600   }
5601 
5602   // If not inlinable, get reference to barrier depending on the instruction
5603   if (!BarValImm) {
5604     if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5605       // If reference to barrier id is not an inlinable constant then it must be
5606       // referenced with M0[4:0]. Perform an OR with the member count to include
5607       // it in M0 for S_BARRIER_INIT.
5608       Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5609       BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
5610           .addReg(BarOp.getReg())
5611           .addReg(TmpReg0);
5612       M0Val = TmpReg1;
5613     } else {
5614       M0Val = BarOp.getReg();
5615     }
5616   }
5617 
5618   // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
5619   if (M0Val) {
5620     auto CopyMIB =
5621         BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
5622     constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5623   }
5624 
5625   MachineInstrBuilder MIB;
5626   unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5627   MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5628 
5629   if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
5630     MIB.addDef(I.getOperand(0).getReg());
5631 
5632   if (BarValImm)
5633     MIB.addImm(*BarValImm);
5634 
5635   I.eraseFromParent();
5636   return true;
5637 }
5638 
5639 bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
5640   MachineBasicBlock *BB = I.getParent();
5641   const DebugLoc &DL = I.getDebugLoc();
5642   Register CCReg = I.getOperand(0).getReg();
5643 
5644   BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
5645   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5646 
5647   I.eraseFromParent();
5648   return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5649                                       *MRI);
5650 }
5651 
5652 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5653                                                  const MachineInstr &MI,
5654                                                  int OpIdx) const {
5655   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5656          "Expected G_CONSTANT");
5657   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5658 }
5659 
5660 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5661                                                 const MachineInstr &MI,
5662                                                 int OpIdx) const {
5663   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5664          "Expected G_CONSTANT");
5665   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5666 }
5667 
5668 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
5669                                                  const MachineInstr &MI,
5670                                                  int OpIdx) const {
5671   assert(OpIdx == -1);
5672 
5673   const MachineOperand &Op = MI.getOperand(1);
5674   if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
5675     MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5676   else {
5677     assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
5678     MIB.addImm(Op.getCImm()->getSExtValue());
5679   }
5680 }
5681 
5682 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5683                                                 const MachineInstr &MI,
5684                                                 int OpIdx) const {
5685   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5686          "Expected G_CONSTANT");
5687   MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5688 }
5689 
5690 /// This only really exists to satisfy DAG type checking machinery, so is a
5691 /// no-op here.
5692 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
5693                                                 const MachineInstr &MI,
5694                                                 int OpIdx) const {
5695   MIB.addImm(MI.getOperand(OpIdx).getImm());
5696 }
5697 
5698 void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
5699                                                 const MachineInstr &MI,
5700                                                 int OpIdx) const {
5701   assert(OpIdx >= 0 && "expected to match an immediate operand");
5702   MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
5703 }
5704 
5705 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
5706                                                   const MachineInstr &MI,
5707                                                   int OpIdx) const {
5708   assert(OpIdx >= 0 && "expected to match an immediate operand");
5709   MIB.addImm(MI.getOperand(OpIdx).getImm() &
5710              (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5711                                        : AMDGPU::CPol::ALL_pregfx12));
5712 }
5713 
5714 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
5715                                                  const MachineInstr &MI,
5716                                                  int OpIdx) const {
5717   assert(OpIdx >= 0 && "expected to match an immediate operand");
5718   const bool Swizzle = MI.getOperand(OpIdx).getImm() &
5719                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
5720                                                  : AMDGPU::CPol::SWZ_pregfx12);
5721   MIB.addImm(Swizzle);
5722 }
5723 
5724 void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
5725     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
5726   assert(OpIdx >= 0 && "expected to match an immediate operand");
5727   const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
5728                         (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5729                                                   : AMDGPU::CPol::ALL_pregfx12);
5730   MIB.addImm(Cpol | AMDGPU::CPol::GLC);
5731 }
5732 
5733 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
5734                                                  const MachineInstr &MI,
5735                                                  int OpIdx) const {
5736   MIB.addFrameIndex(MI.getOperand(1).getIndex());
5737 }
5738 
5739 void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
5740                                                        const MachineInstr &MI,
5741                                                        int OpIdx) const {
5742   const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
5743   int ExpVal = APF.getExactLog2Abs();
5744   assert(ExpVal != INT_MIN);
5745   MIB.addImm(ExpVal);
5746 }
5747 
5748 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
5749   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
5750 }
5751 
5752 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
5753   return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
5754 }
5755 
5756 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
5757   return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
5758 }
5759 
5760 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
5761   return TII.isInlineConstant(Imm);
5762 }
5763