xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (revision 77013d11e6483b970af25e13c9b892075742f7e5)
1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
22 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
25 #include "llvm/IR/DiagnosticInfo.h"
26 
27 #define DEBUG_TYPE "amdgpu-isel"
28 
29 using namespace llvm;
30 using namespace MIPatternMatch;
31 
32 static cl::opt<bool> AllowRiskySelect(
33   "amdgpu-global-isel-risky-select",
34   cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
35   cl::init(false),
36   cl::ReallyHidden);
37 
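   // Pull in the TableGen-generated matcher. AMDGPUSubtarget is temporarily
   // #defined to GCNSubtarget so the generated predicates and temporaries bind
   // to the GCN subtarget used by this selector.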
38 #define GET_GLOBALISEL_IMPL
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenGlobalISel.inc"
41 #undef GET_GLOBALISEL_IMPL
42 #undef AMDGPUSubtarget
43 
44 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
45     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
46     const AMDGPUTargetMachine &TM)
47     : InstructionSelector(), TII(*STI.getInstrInfo()),
48       TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
49       STI(STI),
50       EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
51 #define GET_GLOBALISEL_PREDICATES_INIT
52 #include "AMDGPUGenGlobalISel.inc"
53 #undef GET_GLOBALISEL_PREDICATES_INIT
54 #define GET_GLOBALISEL_TEMPORARIES_INIT
55 #include "AMDGPUGenGlobalISel.inc"
56 #undef GET_GLOBALISEL_TEMPORARIES_INIT
57 {
58 }
59 
60 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
61 
62 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
63                                         CodeGenCoverage &CoverageInfo) {
64   MRI = &MF.getRegInfo();
65   Subtarget = &MF.getSubtarget<GCNSubtarget>();
66   InstructionSelector::setupMF(MF, KB, CoverageInfo);
67 }
68 
69 bool AMDGPUInstructionSelector::isVCC(Register Reg,
70                                       const MachineRegisterInfo &MRI) const {
71   // The verifier is oblivious to s1 being a valid value for wavesize registers.
72   if (Reg.isPhysical())
73     return false;
74 
75   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
76   const TargetRegisterClass *RC =
77       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
78   if (RC) {
79     const LLT Ty = MRI.getType(Reg);
80     return RC->hasSuperClassEq(TRI.getBoolRC()) &&
81            Ty.isValid() && Ty.getSizeInBits() == 1;
82   }
83 
84   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
85   return RB->getID() == AMDGPU::VCCRegBankID;
86 }
87 
88 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
89                                                         unsigned NewOpc) const {
90   MI.setDesc(TII.get(NewOpc));
91   MI.RemoveOperand(1); // Remove intrinsic ID.
92   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
93 
94   MachineOperand &Dst = MI.getOperand(0);
95   MachineOperand &Src = MI.getOperand(1);
96 
97   // TODO: This should be legalized to s32 if needed
98   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
99     return false;
100 
101   const TargetRegisterClass *DstRC
102     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
103   const TargetRegisterClass *SrcRC
104     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
105   if (!DstRC || DstRC != SrcRC)
106     return false;
107 
108   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
109          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
110 }
111 
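    // Select a generic COPY. Copies producing a VCC (wave-mask boolean) value
    // get special handling: a copy from SCC only needs its class constrained,
    // while a copy from a 32-bit value masks off the untrusted high bits and
    // compares the result against zero.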
112 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
113   const DebugLoc &DL = I.getDebugLoc();
114   MachineBasicBlock *BB = I.getParent();
115   I.setDesc(TII.get(TargetOpcode::COPY));
116 
117   const MachineOperand &Src = I.getOperand(1);
118   MachineOperand &Dst = I.getOperand(0);
119   Register DstReg = Dst.getReg();
120   Register SrcReg = Src.getReg();
121 
122   if (isVCC(DstReg, *MRI)) {
123     if (SrcReg == AMDGPU::SCC) {
124       const TargetRegisterClass *RC
125         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
126       if (!RC)
127         return true;
128       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
129     }
130 
131     if (!isVCC(SrcReg, *MRI)) {
132       // TODO: Should probably leave the copy and let copyPhysReg expand it.
133       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
134         return false;
135 
136       const TargetRegisterClass *SrcRC
137         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
138 
139       Register MaskedReg = MRI->createVirtualRegister(SrcRC);
140 
141       // We can't trust the high bits at this point, so clear them.
142 
143       // TODO: Skip masking high bits if def is known boolean.
144 
145       unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
146         AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
147       BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
148         .addImm(1)
149         .addReg(SrcReg);
150       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
151         .addImm(0)
152         .addReg(MaskedReg);
153 
154       if (!MRI->getRegClassOrNull(SrcReg))
155         MRI->setRegClass(SrcReg, SrcRC);
156       I.eraseFromParent();
157       return true;
158     }
159 
160     const TargetRegisterClass *RC =
161       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
162     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
163       return false;
164 
165     return true;
166   }
167 
168   for (const MachineOperand &MO : I.operands()) {
169     if (MO.getReg().isPhysical())
170       continue;
171 
172     const TargetRegisterClass *RC =
173             TRI.getConstrainedRegClassForOperand(MO, *MRI);
174     if (!RC)
175       continue;
176     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
177   }
178   return true;
179 }
180 
181 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
182   const Register DefReg = I.getOperand(0).getReg();
183   const LLT DefTy = MRI->getType(DefReg);
184   if (DefTy == LLT::scalar(1)) {
185     if (!AllowRiskySelect) {
186       LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
187       return false;
188     }
189 
190     LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
191   }
192 
193   // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
194 
195   const RegClassOrRegBank &RegClassOrBank =
196     MRI->getRegClassOrRegBank(DefReg);
197 
198   const TargetRegisterClass *DefRC
199     = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
200   if (!DefRC) {
201     if (!DefTy.isValid()) {
202       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
203       return false;
204     }
205 
206     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
207     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
208     if (!DefRC) {
209       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
210       return false;
211     }
212   }
213 
214   // TODO: Verify that all registers have the same bank
215   I.setDesc(TII.get(TargetOpcode::PHI));
216   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
217 }
218 
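    // Return the requested 32-bit half of a 64-bit operand: a subregister COPY
    // for register operands, or the low/high 32 bits for immediates.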
219 MachineOperand
220 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
221                                            const TargetRegisterClass &SubRC,
222                                            unsigned SubIdx) const {
223 
224   MachineInstr *MI = MO.getParent();
225   MachineBasicBlock *BB = MO.getParent()->getParent();
226   Register DstReg = MRI->createVirtualRegister(&SubRC);
227 
228   if (MO.isReg()) {
229     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
230     Register Reg = MO.getReg();
231     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
232             .addReg(Reg, 0, ComposedSubIdx);
233 
234     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
235                                      MO.isKill(), MO.isDead(), MO.isUndef(),
236                                      MO.isEarlyClobber(), 0, MO.isDebug(),
237                                      MO.isInternalRead());
238   }
239 
240   assert(MO.isImm());
241 
242   APInt Imm(64, MO.getImm());
243 
244   switch (SubIdx) {
245   default:
246     llvm_unreachable("do not know how to split immediate with this sub index.");
247   case AMDGPU::sub0:
248     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
249   case AMDGPU::sub1:
250     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
251   }
252 }
253 
254 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
255   switch (Opc) {
256   case AMDGPU::G_AND:
257     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
258   case AMDGPU::G_OR:
259     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
260   case AMDGPU::G_XOR:
261     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
262   default:
263     llvm_unreachable("not a bit op");
264   }
265 }
266 
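    // Scalar bitwise ops: only SGPR and VCC results are handled here, using the
    // 64-bit opcodes for wide values and for wave64 lane masks.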
267 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
268   Register DstReg = I.getOperand(0).getReg();
269   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
270 
271   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
272   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
273       DstRB->getID() != AMDGPU::VCCRegBankID)
274     return false;
275 
276   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
277                             STI.isWave64());
278   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
279 
280   // Dead implicit-def of scc
281   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
282                                          true, // isImp
283                                          false, // isKill
284                                          true)); // isDead
285   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
286 }
287 
288 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
289   MachineBasicBlock *BB = I.getParent();
290   MachineFunction *MF = BB->getParent();
291   Register DstReg = I.getOperand(0).getReg();
292   const DebugLoc &DL = I.getDebugLoc();
293   LLT Ty = MRI->getType(DstReg);
294   if (Ty.isVector())
295     return false;
296 
297   unsigned Size = Ty.getSizeInBits();
298   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
299   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
300   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
301 
302   if (Size == 32) {
303     if (IsSALU) {
304       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
305       MachineInstr *Add =
306         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
307         .add(I.getOperand(1))
308         .add(I.getOperand(2));
309       I.eraseFromParent();
310       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
311     }
312 
313     if (STI.hasAddNoCarry()) {
314       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
315       I.setDesc(TII.get(Opc));
316       I.addOperand(*MF, MachineOperand::CreateImm(0));
317       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
318       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
319     }
320 
321     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
322 
323     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
324     MachineInstr *Add
325       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
326       .addDef(UnusedCarry, RegState::Dead)
327       .add(I.getOperand(1))
328       .add(I.getOperand(2))
329       .addImm(0);
330     I.eraseFromParent();
331     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
332   }
333 
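      // 64-bit case: split both operands into 32-bit halves, add them with an
      // explicit carry chain, and recombine the result with a REG_SEQUENCE.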
334   assert(!Sub && "illegal sub should not reach here");
335 
336   const TargetRegisterClass &RC
337     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
338   const TargetRegisterClass &HalfRC
339     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
340 
341   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
342   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
343   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
344   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
345 
346   Register DstLo = MRI->createVirtualRegister(&HalfRC);
347   Register DstHi = MRI->createVirtualRegister(&HalfRC);
348 
349   if (IsSALU) {
350     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
351       .add(Lo1)
352       .add(Lo2);
353     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
354       .add(Hi1)
355       .add(Hi2);
356   } else {
357     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
358     Register CarryReg = MRI->createVirtualRegister(CarryRC);
359     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
360       .addDef(CarryReg)
361       .add(Lo1)
362       .add(Lo2)
363       .addImm(0);
364     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
365       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
366       .add(Hi1)
367       .add(Hi2)
368       .addReg(CarryReg, RegState::Kill)
369       .addImm(0);
370 
371     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
372       return false;
373   }
374 
375   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
376     .addReg(DstLo)
377     .addImm(AMDGPU::sub0)
378     .addReg(DstHi)
379     .addImm(AMDGPU::sub1);
380 
381 
382   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
383     return false;
384 
385   I.eraseFromParent();
386   return true;
387 }
388 
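    // Add/sub with carry-out (and optional carry-in): use the VALU carry
    // instructions when the carry is a VCC lane mask, otherwise the SALU forms
    // with the carry routed through SCC.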
389 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
390   MachineInstr &I) const {
391   MachineBasicBlock *BB = I.getParent();
392   MachineFunction *MF = BB->getParent();
393   const DebugLoc &DL = I.getDebugLoc();
394   Register Dst0Reg = I.getOperand(0).getReg();
395   Register Dst1Reg = I.getOperand(1).getReg();
396   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
397                      I.getOpcode() == AMDGPU::G_UADDE;
398   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
399                           I.getOpcode() == AMDGPU::G_USUBE;
400 
401   if (isVCC(Dst1Reg, *MRI)) {
402     unsigned NoCarryOpc =
403         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
404     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
405     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
406     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
407     I.addOperand(*MF, MachineOperand::CreateImm(0));
408     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
409   }
410 
411   Register Src0Reg = I.getOperand(2).getReg();
412   Register Src1Reg = I.getOperand(3).getReg();
413 
414   if (HasCarryIn) {
415     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
416       .addReg(I.getOperand(4).getReg());
417   }
418 
419   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
420   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
421 
422   BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
423     .add(I.getOperand(2))
424     .add(I.getOperand(3));
425   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
426     .addReg(AMDGPU::SCC);
427 
428   if (!MRI->getRegClassOrNull(Dst1Reg))
429     MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
430 
431   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
432       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
433       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
434     return false;
435 
436   if (HasCarryIn &&
437       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
438                                     AMDGPU::SReg_32RegClass, *MRI))
439     return false;
440 
441   I.eraseFromParent();
442   return true;
443 }
444 
445 // TODO: We should probably legalize these to only using 32-bit results.
446 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
447   MachineBasicBlock *BB = I.getParent();
448   Register DstReg = I.getOperand(0).getReg();
449   Register SrcReg = I.getOperand(1).getReg();
450   LLT DstTy = MRI->getType(DstReg);
451   LLT SrcTy = MRI->getType(SrcReg);
452   const unsigned SrcSize = SrcTy.getSizeInBits();
453   unsigned DstSize = DstTy.getSizeInBits();
454 
455   // TODO: Should handle any multiple of 32 offset.
456   unsigned Offset = I.getOperand(2).getImm();
457   if (Offset % 32 != 0 || DstSize > 128)
458     return false;
459 
460   // 16-bit operations really use 32-bit registers.
461   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
462   if (DstSize == 16)
463     DstSize = 32;
464 
465   const TargetRegisterClass *DstRC =
466     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
467   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
468     return false;
469 
470   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
471   const TargetRegisterClass *SrcRC =
472     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
473   if (!SrcRC)
474     return false;
475   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
476                                                          DstSize / 32);
477   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
478   if (!SrcRC)
479     return false;
480 
481   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
482                                     *SrcRC, I.getOperand(1));
483   const DebugLoc &DL = I.getDebugLoc();
484   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
485     .addReg(SrcReg, 0, SubReg);
486 
487   I.eraseFromParent();
488   return true;
489 }
490 
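    // Merge 32-bit-or-wider pieces into one register with a REG_SEQUENCE;
    // narrower sources are left to the generated patterns.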
491 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
492   MachineBasicBlock *BB = MI.getParent();
493   Register DstReg = MI.getOperand(0).getReg();
494   LLT DstTy = MRI->getType(DstReg);
495   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
496 
497   const unsigned SrcSize = SrcTy.getSizeInBits();
498   if (SrcSize < 32)
499     return selectImpl(MI, *CoverageInfo);
500 
501   const DebugLoc &DL = MI.getDebugLoc();
502   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
503   const unsigned DstSize = DstTy.getSizeInBits();
504   const TargetRegisterClass *DstRC =
505     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
506   if (!DstRC)
507     return false;
508 
509   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
510   MachineInstrBuilder MIB =
511     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
512   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
513     MachineOperand &Src = MI.getOperand(I + 1);
514     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
515     MIB.addImm(SubRegs[I]);
516 
517     const TargetRegisterClass *SrcRC
518       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
519     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
520       return false;
521   }
522 
523   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
524     return false;
525 
526   MI.eraseFromParent();
527   return true;
528 }
529 
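    // Split a register into its pieces with one subregister COPY per
    // destination.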
530 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
531   MachineBasicBlock *BB = MI.getParent();
532   const int NumDst = MI.getNumOperands() - 1;
533 
534   MachineOperand &Src = MI.getOperand(NumDst);
535 
536   Register SrcReg = Src.getReg();
537   Register DstReg0 = MI.getOperand(0).getReg();
538   LLT DstTy = MRI->getType(DstReg0);
539   LLT SrcTy = MRI->getType(SrcReg);
540 
541   const unsigned DstSize = DstTy.getSizeInBits();
542   const unsigned SrcSize = SrcTy.getSizeInBits();
543   const DebugLoc &DL = MI.getDebugLoc();
544   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
545 
546   const TargetRegisterClass *SrcRC =
547     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
548   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
549     return false;
550 
551   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
552   // source, and this relies on the fact that the same subregister indices are
553   // used for both.
554   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
555   for (int I = 0, E = NumDst; I != E; ++I) {
556     MachineOperand &Dst = MI.getOperand(I);
557     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
558       .addReg(SrcReg, 0, SubRegs[I]);
559 
560     // Make sure the subregister index is valid for the source register.
561     SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
562     if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
563       return false;
564 
565     const TargetRegisterClass *DstRC =
566       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
567     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
568       return false;
569   }
570 
571   MI.eraseFromParent();
572   return true;
573 }
574 
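    // Select an SGPR v2s16 build_vector_trunc: fold two constants into a single
    // S_MOV_B32, otherwise use the S_PACK_* forms matched below.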
575 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
576   MachineInstr &MI) const {
577   if (selectImpl(MI, *CoverageInfo))
578     return true;
579 
580   const LLT S32 = LLT::scalar(32);
581   const LLT V2S16 = LLT::vector(2, 16);
582 
583   Register Dst = MI.getOperand(0).getReg();
584   if (MRI->getType(Dst) != V2S16)
585     return false;
586 
587   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
588   if (DstBank->getID() != AMDGPU::SGPRRegBankID)
589     return false;
590 
591   Register Src0 = MI.getOperand(1).getReg();
592   Register Src1 = MI.getOperand(2).getReg();
593   if (MRI->getType(Src0) != S32)
594     return false;
595 
596   const DebugLoc &DL = MI.getDebugLoc();
597   MachineBasicBlock *BB = MI.getParent();
598 
599   auto ConstSrc1 =
600       getConstantVRegValWithLookThrough(Src1, *MRI, true, true, true);
601   if (ConstSrc1) {
602     auto ConstSrc0 =
603         getConstantVRegValWithLookThrough(Src0, *MRI, true, true, true);
604     if (ConstSrc0) {
605       const int64_t K0 = ConstSrc0->Value.getSExtValue();
606       const int64_t K1 = ConstSrc1->Value.getSExtValue();
607       uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
608       uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
609 
610       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
611         .addImm(Lo16 | (Hi16 << 16));
612       MI.eraseFromParent();
613       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
614     }
615   }
616 
617   // TODO: This should probably be a combine somewhere
618   // (build_vector_trunc $src0, undef) -> copy $src0
619   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
620   if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
621     MI.setDesc(TII.get(AMDGPU::COPY));
622     MI.RemoveOperand(2);
623     return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
624            RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
625   }
626 
627   Register ShiftSrc0;
628   Register ShiftSrc1;
629 
630   // With multiple uses of the shift, this will duplicate the shift and
631   // increase register pressure.
632   //
633   // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
634   //  => (S_PACK_HH_B32_B16 $src0, $src1)
635   // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
636   //  => (S_PACK_LH_B32_B16 $src0, $src1)
637   // (build_vector_trunc $src0, $src1)
638   //  => (S_PACK_LL_B32_B16 $src0, $src1)
639 
640   bool Shift0 = mi_match(
641       Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
642 
643   bool Shift1 = mi_match(
644       Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
645 
646   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
647   if (Shift0 && Shift1) {
648     Opc = AMDGPU::S_PACK_HH_B32_B16;
649     MI.getOperand(1).setReg(ShiftSrc0);
650     MI.getOperand(2).setReg(ShiftSrc1);
651   } else if (Shift1) {
652     Opc = AMDGPU::S_PACK_LH_B32_B16;
653     MI.getOperand(2).setReg(ShiftSrc1);
654   } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
655     // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
656     auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
657       .addReg(ShiftSrc0)
658       .addImm(16);
659 
660     MI.eraseFromParent();
661     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
662   }
663 
664   MI.setDesc(TII.get(Opc));
665   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
666 }
667 
668 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
669   return selectG_ADD_SUB(I);
670 }
671 
672 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
673   const MachineOperand &MO = I.getOperand(0);
674 
675   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
676   // regbank check here is to know why getConstrainedRegClassForOperand failed.
677   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
678   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
679       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
680     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
681     return true;
682   }
683 
684   return false;
685 }
686 
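    // G_INSERT of a 32-bit-aligned chunk lowers to INSERT_SUBREG once a
    // matching subregister index is found.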
687 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
688   MachineBasicBlock *BB = I.getParent();
689 
690   Register DstReg = I.getOperand(0).getReg();
691   Register Src0Reg = I.getOperand(1).getReg();
692   Register Src1Reg = I.getOperand(2).getReg();
693   LLT Src1Ty = MRI->getType(Src1Reg);
694 
695   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
696   unsigned InsSize = Src1Ty.getSizeInBits();
697 
698   int64_t Offset = I.getOperand(3).getImm();
699 
700   // FIXME: These cases should have been illegal and unnecessary to check here.
701   if (Offset % 32 != 0 || InsSize % 32 != 0)
702     return false;
703 
704   // Currently not handled by getSubRegFromChannel.
705   if (InsSize > 128)
706     return false;
707 
708   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
709   if (SubReg == AMDGPU::NoSubRegister)
710     return false;
711 
712   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
713   const TargetRegisterClass *DstRC =
714     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
715   if (!DstRC)
716     return false;
717 
718   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
719   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
720   const TargetRegisterClass *Src0RC =
721     TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
722   const TargetRegisterClass *Src1RC =
723     TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
724 
725   // Deal with weird cases where the class only partially supports the subreg
726   // index.
727   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
728   if (!Src0RC || !Src1RC)
729     return false;
730 
731   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
732       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
733       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
734     return false;
735 
736   const DebugLoc &DL = I.getDebugLoc();
737   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
738     .addReg(Src0Reg)
739     .addReg(Src1Reg)
740     .addImm(SubReg);
741 
742   I.eraseFromParent();
743   return true;
744 }
745 
746 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
747   if (STI.getLDSBankCount() != 16)
748     return selectImpl(MI, *CoverageInfo);
749 
750   Register Dst = MI.getOperand(0).getReg();
751   Register Src0 = MI.getOperand(2).getReg();
752   Register M0Val = MI.getOperand(6).getReg();
753   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
754       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
755       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
756     return false;
757 
758   // This requires 2 instructions. It is possible to write a pattern to support
759   // this, but the generated isel emitter doesn't correctly deal with multiple
760   // output instructions using the same physical register input. The copy to m0
761   // is incorrectly placed before the second instruction.
762   //
763   // TODO: Match source modifiers.
764 
765   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
766   const DebugLoc &DL = MI.getDebugLoc();
767   MachineBasicBlock *MBB = MI.getParent();
768 
769   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
770     .addReg(M0Val);
771   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
772     .addImm(2)
773     .addImm(MI.getOperand(4).getImm())  // $attr
774     .addImm(MI.getOperand(3).getImm()); // $attrchan
775 
776   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
777     .addImm(0)                          // $src0_modifiers
778     .addReg(Src0)                       // $src0
779     .addImm(MI.getOperand(4).getImm())  // $attr
780     .addImm(MI.getOperand(3).getImm())  // $attrchan
781     .addImm(0)                          // $src2_modifiers
782     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
783     .addImm(MI.getOperand(5).getImm())  // $high
784     .addImm(0)                          // $clamp
785     .addImm(0);                         // $omod
786 
787   MI.eraseFromParent();
788   return true;
789 }
790 
791 // Writelane is special in that it can use SGPR and M0 (which would normally
792 // count as using the constant bus twice - but in this case it is allowed since
793 // the lane selector doesn't count as a use of the constant bus). However, it is
794 // still required to abide by the 1 SGPR rule. Fix this up if we might have
795 // multiple SGPRs.
796 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
797   // With a constant bus limit of at least 2, there's no issue.
798   if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
799     return selectImpl(MI, *CoverageInfo);
800 
801   MachineBasicBlock *MBB = MI.getParent();
802   const DebugLoc &DL = MI.getDebugLoc();
803   Register VDst = MI.getOperand(0).getReg();
804   Register Val = MI.getOperand(2).getReg();
805   Register LaneSelect = MI.getOperand(3).getReg();
806   Register VDstIn = MI.getOperand(4).getReg();
807 
808   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
809 
810   Optional<ValueAndVReg> ConstSelect =
811     getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
812   if (ConstSelect) {
813     // The selector has to be an inline immediate, so we can use whatever for
814     // the other operands.
815     MIB.addReg(Val);
816     MIB.addImm(ConstSelect->Value.getSExtValue() &
817                maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
818   } else {
819     Optional<ValueAndVReg> ConstVal =
820       getConstantVRegValWithLookThrough(Val, *MRI, true, true);
821 
822     // If the value written is an inline immediate, we can get away without a
823     // copy to m0.
824     if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
825                                                  STI.hasInv2PiInlineImm())) {
826       MIB.addImm(ConstVal->Value.getSExtValue());
827       MIB.addReg(LaneSelect);
828     } else {
829       MIB.addReg(Val);
830 
831       // If the lane selector was originally in a VGPR and copied with
832       // readfirstlane, there's a hazard to read the same SGPR from the
833       // VALU. Constrain to a different SGPR to help avoid needing a nop later.
834       RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
835 
836       BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
837         .addReg(LaneSelect);
838       MIB.addReg(AMDGPU::M0);
839     }
840   }
841 
842   MIB.addReg(VDstIn);
843 
844   MI.eraseFromParent();
845   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
846 }
847 
848 // We need to handle this here because tablegen doesn't support matching
849 // instructions with multiple outputs.
850 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
851   Register Dst0 = MI.getOperand(0).getReg();
852   Register Dst1 = MI.getOperand(1).getReg();
853 
854   LLT Ty = MRI->getType(Dst0);
855   unsigned Opc;
856   if (Ty == LLT::scalar(32))
857     Opc = AMDGPU::V_DIV_SCALE_F32_e64;
858   else if (Ty == LLT::scalar(64))
859     Opc = AMDGPU::V_DIV_SCALE_F64_e64;
860   else
861     return false;
862 
863   // TODO: Match source modifiers.
864 
865   const DebugLoc &DL = MI.getDebugLoc();
866   MachineBasicBlock *MBB = MI.getParent();
867 
868   Register Numer = MI.getOperand(3).getReg();
869   Register Denom = MI.getOperand(4).getReg();
870   unsigned ChooseDenom = MI.getOperand(5).getImm();
871 
872   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
873 
874   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
875     .addDef(Dst1)
876     .addImm(0)     // $src0_modifiers
877     .addUse(Src0)  // $src0
878     .addImm(0)     // $src1_modifiers
879     .addUse(Denom) // $src1
880     .addImm(0)     // $src2_modifiers
881     .addUse(Numer) // $src2
882     .addImm(0)     // $clamp
883     .addImm(0);    // $omod
884 
885   MI.eraseFromParent();
886   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
887 }
888 
889 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
890   unsigned IntrinsicID = I.getIntrinsicID();
891   switch (IntrinsicID) {
892   case Intrinsic::amdgcn_if_break: {
893     MachineBasicBlock *BB = I.getParent();
894 
895     // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
896     // SelectionDAG uses for wave32 vs wave64.
897     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
898       .add(I.getOperand(0))
899       .add(I.getOperand(2))
900       .add(I.getOperand(3));
901 
902     Register DstReg = I.getOperand(0).getReg();
903     Register Src0Reg = I.getOperand(2).getReg();
904     Register Src1Reg = I.getOperand(3).getReg();
905 
906     I.eraseFromParent();
907 
908     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
909       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
910 
911     return true;
912   }
913   case Intrinsic::amdgcn_interp_p1_f16:
914     return selectInterpP1F16(I);
915   case Intrinsic::amdgcn_wqm:
916     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
917   case Intrinsic::amdgcn_softwqm:
918     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
919   case Intrinsic::amdgcn_wwm:
920     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
921   case Intrinsic::amdgcn_writelane:
922     return selectWritelane(I);
923   case Intrinsic::amdgcn_div_scale:
924     return selectDivScale(I);
925   case Intrinsic::amdgcn_icmp:
926     return selectIntrinsicIcmp(I);
927   case Intrinsic::amdgcn_ballot:
928     return selectBallot(I);
929   case Intrinsic::amdgcn_reloc_constant:
930     return selectRelocConstant(I);
931   case Intrinsic::amdgcn_groupstaticsize:
932     return selectGroupStaticSize(I);
933   case Intrinsic::returnaddress:
934     return selectReturnAddress(I);
935   default:
936     return selectImpl(I, *CoverageInfo);
937   }
938 }
939 
940 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
941   if (Size != 32 && Size != 64)
942     return -1;
943   switch (P) {
944   default:
945     llvm_unreachable("Unknown condition code!");
946   case CmpInst::ICMP_NE:
947     return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
948   case CmpInst::ICMP_EQ:
949     return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
950   case CmpInst::ICMP_SGT:
951     return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
952   case CmpInst::ICMP_SGE:
953     return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
954   case CmpInst::ICMP_SLT:
955     return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
956   case CmpInst::ICMP_SLE:
957     return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
958   case CmpInst::ICMP_UGT:
959     return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
960   case CmpInst::ICMP_UGE:
961     return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
962   case CmpInst::ICMP_ULT:
963     return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
964   case CmpInst::ICMP_ULE:
965     return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
966   }
967 }
968 
969 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
970                                               unsigned Size) const {
971   if (Size == 64) {
972     if (!STI.hasScalarCompareEq64())
973       return -1;
974 
975     switch (P) {
976     case CmpInst::ICMP_NE:
977       return AMDGPU::S_CMP_LG_U64;
978     case CmpInst::ICMP_EQ:
979       return AMDGPU::S_CMP_EQ_U64;
980     default:
981       return -1;
982     }
983   }
984 
985   if (Size != 32)
986     return -1;
987 
988   switch (P) {
989   case CmpInst::ICMP_NE:
990     return AMDGPU::S_CMP_LG_U32;
991   case CmpInst::ICMP_EQ:
992     return AMDGPU::S_CMP_EQ_U32;
993   case CmpInst::ICMP_SGT:
994     return AMDGPU::S_CMP_GT_I32;
995   case CmpInst::ICMP_SGE:
996     return AMDGPU::S_CMP_GE_I32;
997   case CmpInst::ICMP_SLT:
998     return AMDGPU::S_CMP_LT_I32;
999   case CmpInst::ICMP_SLE:
1000     return AMDGPU::S_CMP_LE_I32;
1001   case CmpInst::ICMP_UGT:
1002     return AMDGPU::S_CMP_GT_U32;
1003   case CmpInst::ICMP_UGE:
1004     return AMDGPU::S_CMP_GE_U32;
1005   case CmpInst::ICMP_ULT:
1006     return AMDGPU::S_CMP_LT_U32;
1007   case CmpInst::ICMP_ULE:
1008     return AMDGPU::S_CMP_LE_U32;
1009   default:
1010     llvm_unreachable("Unknown condition code!");
1011   }
1012 }
1013 
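     // Integer compare: scalar results use S_CMP_* plus a copy of SCC, while
     // VCC results use the VOPC V_CMP_*_e64 forms.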
1014 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1015   MachineBasicBlock *BB = I.getParent();
1016   const DebugLoc &DL = I.getDebugLoc();
1017 
1018   Register SrcReg = I.getOperand(2).getReg();
1019   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1020 
1021   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1022 
1023   Register CCReg = I.getOperand(0).getReg();
1024   if (!isVCC(CCReg, *MRI)) {
1025     int Opcode = getS_CMPOpcode(Pred, Size);
1026     if (Opcode == -1)
1027       return false;
1028     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1029             .add(I.getOperand(2))
1030             .add(I.getOperand(3));
1031     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1032       .addReg(AMDGPU::SCC);
1033     bool Ret =
1034         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1035         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1036     I.eraseFromParent();
1037     return Ret;
1038   }
1039 
1040   int Opcode = getV_CMPOpcode(Pred, Size);
1041   if (Opcode == -1)
1042     return false;
1043 
1044   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1045             I.getOperand(0).getReg())
1046             .add(I.getOperand(2))
1047             .add(I.getOperand(3));
1048   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1049                                *TRI.getBoolRC(), *MRI);
1050   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1051   I.eraseFromParent();
1052   return Ret;
1053 }
1054 
1055 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
1056   Register Dst = I.getOperand(0).getReg();
1057   if (isVCC(Dst, *MRI))
1058     return false;
1059 
1060   if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1061     return false;
1062 
1063   MachineBasicBlock *BB = I.getParent();
1064   const DebugLoc &DL = I.getDebugLoc();
1065   Register SrcReg = I.getOperand(2).getReg();
1066   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1067   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1068 
1069   int Opcode = getV_CMPOpcode(Pred, Size);
1070   if (Opcode == -1)
1071     return false;
1072 
1073   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1074                            .add(I.getOperand(2))
1075                            .add(I.getOperand(3));
1076   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1077                                *MRI);
1078   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1079   I.eraseFromParent();
1080   return Ret;
1081 }
1082 
1083 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1084   MachineBasicBlock *BB = I.getParent();
1085   const DebugLoc &DL = I.getDebugLoc();
1086   Register DstReg = I.getOperand(0).getReg();
1087   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1088   const bool Is64 = Size == 64;
1089 
1090   if (Size != STI.getWavefrontSize())
1091     return false;
1092 
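       // Fold constant arguments: ballot(false) is 0 and ballot(true) is a copy
       // of EXEC; other constants are rejected.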
1093   Optional<ValueAndVReg> Arg =
1094       getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1095 
1096   if (Arg.hasValue()) {
1097     const int64_t Value = Arg.getValue().Value.getSExtValue();
1098     if (Value == 0) {
1099       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1100       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1101     } else if (Value == -1) { // all ones
1102       Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1103       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1104     } else
1105       return false;
1106   } else {
1107     Register SrcReg = I.getOperand(2).getReg();
1108     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1109   }
1110 
1111   I.eraseFromParent();
1112   return true;
1113 }
1114 
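     // Materialize the relocation: create (or reuse) an external i32 global
     // named by the metadata string and mov its abs32_lo address.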
1115 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1116   Register DstReg = I.getOperand(0).getReg();
1117   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1118   const TargetRegisterClass *DstRC =
1119     TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1120   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1121     return false;
1122 
1123   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1124 
1125   Module *M = MF->getFunction().getParent();
1126   const MDNode *Metadata = I.getOperand(2).getMetadata();
1127   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1128   auto RelocSymbol = cast<GlobalVariable>(
1129     M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1130 
1131   MachineBasicBlock *BB = I.getParent();
1132   BuildMI(*BB, &I, I.getDebugLoc(),
1133           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1134     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1135 
1136   I.eraseFromParent();
1137   return true;
1138 }
1139 
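     // On AMDHSA/AMDPAL the LDS size is known here; other targets emit an
     // abs32_lo reference to the groupstaticsize symbol instead.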
1140 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1141   Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1142 
1143   Register DstReg = I.getOperand(0).getReg();
1144   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1145   unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1146     AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1147 
1148   MachineBasicBlock *MBB = I.getParent();
1149   const DebugLoc &DL = I.getDebugLoc();
1150 
1151   auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1152 
1153   if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1154     const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1155     MIB.addImm(MFI->getLDSSize());
1156   } else {
1157     Module *M = MF->getFunction().getParent();
1158     const GlobalValue *GV
1159       = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1160     MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1161   }
1162 
1163   I.eraseFromParent();
1164   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1165 }
1166 
1167 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1168   MachineBasicBlock *MBB = I.getParent();
1169   MachineFunction &MF = *MBB->getParent();
1170   const DebugLoc &DL = I.getDebugLoc();
1171 
1172   MachineOperand &Dst = I.getOperand(0);
1173   Register DstReg = Dst.getReg();
1174   unsigned Depth = I.getOperand(2).getImm();
1175 
1176   const TargetRegisterClass *RC
1177     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1178   if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1179       !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1180     return false;
1181 
1182   // Check for kernel and shader functions
1183   if (Depth != 0 ||
1184       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1185     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1186       .addImm(0);
1187     I.eraseFromParent();
1188     return true;
1189   }
1190 
1191   MachineFrameInfo &MFI = MF.getFrameInfo();
1192   // There is a call to @llvm.returnaddress in this function
1193   MFI.setReturnAddressIsTaken(true);
1194 
1195   // Get the return address reg and mark it as an implicit live-in
1196   Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1197   Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1198                                              AMDGPU::SReg_64RegClass);
1199   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1200     .addReg(LiveIn);
1201   I.eraseFromParent();
1202   return true;
1203 }
1204 
1205 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1206   // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1207   // SelectionDAG uses for wave32 vs wave64.
1208   MachineBasicBlock *BB = MI.getParent();
1209   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1210       .add(MI.getOperand(1));
1211 
1212   Register Reg = MI.getOperand(1).getReg();
1213   MI.eraseFromParent();
1214 
1215   if (!MRI->getRegClassOrNull(Reg))
1216     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1217   return true;
1218 }
1219 
1220 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1221   MachineInstr &MI, Intrinsic::ID IntrID) const {
1222   MachineBasicBlock *MBB = MI.getParent();
1223   MachineFunction *MF = MBB->getParent();
1224   const DebugLoc &DL = MI.getDebugLoc();
1225 
1226   unsigned IndexOperand = MI.getOperand(7).getImm();
1227   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1228   bool WaveDone = MI.getOperand(9).getImm() != 0;
1229 
1230   if (WaveDone && !WaveRelease)
1231     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1232 
1233   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1234   IndexOperand &= ~0x3f;
1235   unsigned CountDw = 0;
1236 
1237   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1238     CountDw = (IndexOperand >> 24) & 0xf;
1239     IndexOperand &= ~(0xf << 24);
1240 
1241     if (CountDw < 1 || CountDw > 4) {
1242       report_fatal_error(
1243         "ds_ordered_count: dword count must be between 1 and 4");
1244     }
1245   }
1246 
1247   if (IndexOperand)
1248     report_fatal_error("ds_ordered_count: bad index operand");
1249 
1250   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1251   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1252 
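       // Pack the DS_ORDERED_COUNT control word: offset0 holds the ordered-count
       // index (as a byte offset), offset1 holds wave_release, wave_done, the
       // shader type, the add/swap selector and, on GFX10+, the dword count.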
1253   unsigned Offset0 = OrderedCountIndex << 2;
1254   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1255                      (Instruction << 4);
1256 
1257   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1258     Offset1 |= (CountDw - 1) << 6;
1259 
1260   unsigned Offset = Offset0 | (Offset1 << 8);
1261 
1262   Register M0Val = MI.getOperand(2).getReg();
1263   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1264     .addReg(M0Val);
1265 
1266   Register DstReg = MI.getOperand(0).getReg();
1267   Register ValReg = MI.getOperand(3).getReg();
1268   MachineInstrBuilder DS =
1269     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1270       .addReg(ValReg)
1271       .addImm(Offset)
1272       .cloneMemRefs(MI);
1273 
1274   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1275     return false;
1276 
1277   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1278   MI.eraseFromParent();
1279   return Ret;
1280 }
1281 
1282 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1283   switch (IntrID) {
1284   case Intrinsic::amdgcn_ds_gws_init:
1285     return AMDGPU::DS_GWS_INIT;
1286   case Intrinsic::amdgcn_ds_gws_barrier:
1287     return AMDGPU::DS_GWS_BARRIER;
1288   case Intrinsic::amdgcn_ds_gws_sema_v:
1289     return AMDGPU::DS_GWS_SEMA_V;
1290   case Intrinsic::amdgcn_ds_gws_sema_br:
1291     return AMDGPU::DS_GWS_SEMA_BR;
1292   case Intrinsic::amdgcn_ds_gws_sema_p:
1293     return AMDGPU::DS_GWS_SEMA_P;
1294   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1295     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1296   default:
1297     llvm_unreachable("not a gws intrinsic");
1298   }
1299 }
1300 
1301 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1302                                                      Intrinsic::ID IID) const {
1303   if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1304       !STI.hasGWSSemaReleaseAll())
1305     return false;
1306 
1307   // intrinsic ID, vsrc, offset
1308   const bool HasVSrc = MI.getNumOperands() == 3;
1309   assert(HasVSrc || MI.getNumOperands() == 2);
1310 
1311   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1312   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1313   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1314     return false;
1315 
1316   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1317   assert(OffsetDef);
1318 
1319   unsigned ImmOffset;
1320 
1321   MachineBasicBlock *MBB = MI.getParent();
1322   const DebugLoc &DL = MI.getDebugLoc();
1323 
1324   MachineInstr *Readfirstlane = nullptr;
1325 
1326   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1327   // incoming offset, in case there's an add of a constant. We'll have to put it
1328   // back later.
1329   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1330     Readfirstlane = OffsetDef;
1331     BaseOffset = OffsetDef->getOperand(1).getReg();
1332     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1333   }
1334 
1335   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1336     // If we have a constant offset, try to use the 0 in m0 as the base.
1337     // TODO: Look into changing the default m0 initialization value. If the
1338     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1339     // the immediate offset.
1340 
1341     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1342     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1343       .addImm(0);
1344   } else {
1345     std::tie(BaseOffset, ImmOffset) =
1346         AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1347 
1348     if (Readfirstlane) {
1349       // We have the constant offset now, so put the readfirstlane back on the
1350       // variable component.
1351       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1352         return false;
1353 
1354       Readfirstlane->getOperand(1).setReg(BaseOffset);
1355       BaseOffset = Readfirstlane->getOperand(0).getReg();
1356     } else {
1357       if (!RBI.constrainGenericRegister(BaseOffset,
1358                                         AMDGPU::SReg_32RegClass, *MRI))
1359         return false;
1360     }
1361 
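         // The hardware adds M0[21:16] to the offset, so move the variable base
         // into the high half of m0.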
1362     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1363     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1364       .addReg(BaseOffset)
1365       .addImm(16);
1366 
1367     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1368       .addReg(M0Base);
1369   }
1370 
1371   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1372   // offset field) % 64. Some versions of the programming guide omit the m0
1373   // part, or claim it's from offset 0.
1374   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1375 
1376   if (HasVSrc) {
1377     Register VSrc = MI.getOperand(1).getReg();
1378     MIB.addReg(VSrc);
1379     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1380       return false;
1381   }
1382 
1383   MIB.addImm(ImmOffset)
1384      .cloneMemRefs(MI);
1385 
1386   MI.eraseFromParent();
1387   return true;
1388 }
1389 
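     // ds_append/ds_consume take their base address in m0; a legal constant
     // offset is folded into the instruction's offset field.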
1390 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1391                                                       bool IsAppend) const {
1392   Register PtrBase = MI.getOperand(2).getReg();
1393   LLT PtrTy = MRI->getType(PtrBase);
1394   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1395 
1396   unsigned Offset;
1397   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1398 
1399   // TODO: Should this try to look through readfirstlane like GWS?
1400   if (!isDSOffsetLegal(PtrBase, Offset)) {
1401     PtrBase = MI.getOperand(2).getReg();
1402     Offset = 0;
1403   }
1404 
1405   MachineBasicBlock *MBB = MI.getParent();
1406   const DebugLoc &DL = MI.getDebugLoc();
1407   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1408 
1409   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1410     .addReg(PtrBase);
1411   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1412     return false;
1413 
1414   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1415     .addImm(Offset)
1416     .addImm(IsGDS ? -1 : 0)
1417     .cloneMemRefs(MI);
1418   MI.eraseFromParent();
1419   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1420 }
1421 
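     // If the whole workgroup fits in a single wave, s_barrier degenerates to a
     // scheduling-only WAVE_BARRIER.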
1422 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1423   if (TM.getOptLevel() > CodeGenOpt::None) {
1424     unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1425     if (WGSize <= STI.getWavefrontSize()) {
1426       MachineBasicBlock *MBB = MI.getParent();
1427       const DebugLoc &DL = MI.getDebugLoc();
1428       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1429       MI.eraseFromParent();
1430       return true;
1431     }
1432   }
1433   return selectImpl(MI, *CoverageInfo);
1434 }
1435 
1436 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1437                          bool &IsTexFail) {
1438   if (TexFailCtrl)
1439     IsTexFail = true;
1440 
1441   TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1442   TexFailCtrl &= ~(uint64_t)0x1;
1443   LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1444   TexFailCtrl &= ~(uint64_t)0x2;
1445 
1446   return TexFailCtrl == 0;
1447 }
1448 
1449 static bool parseCachePolicy(uint64_t Value,
1450                              bool *GLC, bool *SLC, bool *DLC) {
1451   if (GLC) {
1452     *GLC = (Value & 0x1) ? 1 : 0;
1453     Value &= ~(uint64_t)0x1;
1454   }
1455   if (SLC) {
1456     *SLC = (Value & 0x2) ? 1 : 0;
1457     Value &= ~(uint64_t)0x2;
1458   }
1459   if (DLC) {
1460     *DLC = (Value & 0x4) ? 1 : 0;
1461     Value &= ~(uint64_t)0x4;
1462   }
1463 
1464   return Value == 0;
1465 }
1466 
1467 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1468   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1469   MachineBasicBlock *MBB = MI.getParent();
1470   const DebugLoc &DL = MI.getDebugLoc();
1471 
1472   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1473     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1474 
1475   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1476   const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1477       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1478   const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1479       AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1480   unsigned IntrOpcode = Intr->BaseOpcode;
1481   const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1482 
1483   const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1484 
1485   Register VDataIn, VDataOut;
1486   LLT VDataTy;
1487   int NumVDataDwords = -1;
1488   bool IsD16 = false;
1489 
1490   bool Unorm;
1491   if (!BaseOpcode->Sampler)
1492     Unorm = true;
1493   else
1494     Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1495 
1496   bool TFE;
1497   bool LWE;
1498   bool IsTexFail = false;
1499   if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1500                     TFE, LWE, IsTexFail))
1501     return false;
1502 
1503   const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1504   const bool IsA16 = (Flags & 1) != 0;
1505   const bool IsG16 = (Flags & 2) != 0;
1506 
1507   // A16 implies 16-bit gradients.
1508   if (IsA16 && !IsG16)
1509     return false;
1510 
1511   unsigned DMask = 0;
1512   unsigned DMaskLanes = 0;
1513 
1514   if (BaseOpcode->Atomic) {
1515     VDataOut = MI.getOperand(0).getReg();
1516     VDataIn = MI.getOperand(2).getReg();
1517     LLT Ty = MRI->getType(VDataIn);
1518 
1519     // Be careful to allow atomic swap on 16-bit element vectors.
1520     const bool Is64Bit = BaseOpcode->AtomicX2 ?
1521       Ty.getSizeInBits() == 128 :
1522       Ty.getSizeInBits() == 64;
1523 
1524     if (BaseOpcode->AtomicX2) {
1525       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1526 
1527       DMask = Is64Bit ? 0xf : 0x3;
1528       NumVDataDwords = Is64Bit ? 4 : 2;
1529     } else {
1530       DMask = Is64Bit ? 0x3 : 0x1;
1531       NumVDataDwords = Is64Bit ? 2 : 1;
1532     }
1533   } else {
1534     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1535     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1536 
1537     // One memoperand is mandatory, except for getresinfo.
1538     // FIXME: Check this in verifier.
1539     if (!MI.memoperands_empty()) {
1540       const MachineMemOperand *MMO = *MI.memoperands_begin();
1541 
1542       // Infer d16 from the memory size, as the register type will be mangled by
1543       // unpacked subtargets, or by TFE.
1544       IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
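           // Illustrative: a 4-byte access with DMaskLanes == 2 gives (8 * 4) / 2 == 16
           // bits per component, which is below 32, so the data is treated as d16.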
1545     }
1546 
1547     if (BaseOpcode->Store) {
1548       VDataIn = MI.getOperand(1).getReg();
1549       VDataTy = MRI->getType(VDataIn);
1550       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1551     } else {
1552       VDataOut = MI.getOperand(0).getReg();
1553       VDataTy = MRI->getType(VDataOut);
1554       NumVDataDwords = DMaskLanes;
1555 
1556       if (IsD16 && !STI.hasUnpackedD16VMem())
1557         NumVDataDwords = (DMaskLanes + 1) / 2;
1558     }
1559   }
1560 
1561   // Optimize _L to _LZ when _L is zero
1562   if (LZMappingInfo) {
1563     // The legalizer replaced the register with an immediate 0 if we need to
1564     // change the opcode.
1565     const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
1566     if (Lod.isImm()) {
1567       assert(Lod.getImm() == 0);
1568       IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
1569     }
1570   }
1571 
1572   // Optimize _mip away, when 'lod' is zero
1573   if (MIPMappingInfo) {
1574     const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
1575     if (Lod.isImm()) {
1576       assert(Lod.getImm() == 0);
1577       IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
1578     }
1579   }
1580 
1581   // Set G16 opcode
1582   if (IsG16 && !IsA16) {
1583     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1584         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1585     assert(G16MappingInfo);
1586     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1587   }
1588 
1589   // TODO: Check this in verifier.
1590   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1591 
1592   bool GLC = false;
1593   bool SLC = false;
1594   bool DLC = false;
1595   if (BaseOpcode->Atomic) {
1596     GLC = true; // TODO no-return optimization
1597     if (!parseCachePolicy(
1598             MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), nullptr,
1599             &SLC, IsGFX10Plus ? &DLC : nullptr))
1600       return false;
1601   } else {
1602     if (!parseCachePolicy(
1603             MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), &GLC,
1604             &SLC, IsGFX10Plus ? &DLC : nullptr))
1605       return false;
1606   }
1607 
1608   int NumVAddrRegs = 0;
1609   int NumVAddrDwords = 0;
1610   for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1611     // Skip the $noregs and 0s inserted during legalization.
1612     MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1613     if (!AddrOp.isReg())
1614       continue; // XXX - Break?
1615 
1616     Register Addr = AddrOp.getReg();
1617     if (!Addr)
1618       break;
1619 
1620     ++NumVAddrRegs;
1621     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1622   }
1623 
1624   // The legalizer preprocessed the intrinsic arguments. If we aren't using
1625   // NSA, these should have been packed into a single value in the first
1626   // address register.
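       // Illustrative: three separate 32-bit address registers (NumVAddrRegs == 3,
       // NumVAddrDwords == 3) can use the NSA form; a single packed 96-bit register
       // (1 register, 3 dwords) uses the non-NSA encoding.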
1627   const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1628   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1629     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1630     return false;
1631   }
1632 
1633   if (IsTexFail)
1634     ++NumVDataDwords;
1635 
1636   int Opcode = -1;
1637   if (IsGFX10Plus) {
1638     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1639                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
1640                                           : AMDGPU::MIMGEncGfx10Default,
1641                                    NumVDataDwords, NumVAddrDwords);
1642   } else {
1643     if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1644       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1645                                      NumVDataDwords, NumVAddrDwords);
1646     if (Opcode == -1)
1647       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1648                                      NumVDataDwords, NumVAddrDwords);
1649   }
1650   assert(Opcode != -1);
1651 
1652   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1653     .cloneMemRefs(MI);
1654 
1655   if (VDataOut) {
1656     if (BaseOpcode->AtomicX2) {
1657       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1658 
1659       Register TmpReg = MRI->createVirtualRegister(
1660         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1661       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1662 
1663       MIB.addDef(TmpReg);
1664       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1665         .addReg(TmpReg, RegState::Kill, SubReg);
1666 
1667     } else {
1668       MIB.addDef(VDataOut); // vdata output
1669     }
1670   }
1671 
1672   if (VDataIn)
1673     MIB.addReg(VDataIn); // vdata input
1674 
1675   for (int I = 0; I != NumVAddrRegs; ++I) {
1676     MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1677     if (SrcOp.isReg()) {
1678       assert(SrcOp.getReg() != 0);
1679       MIB.addReg(SrcOp.getReg());
1680     }
1681   }
1682 
1683   MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
1684   if (BaseOpcode->Sampler)
1685     MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
1686 
1687   MIB.addImm(DMask); // dmask
1688 
1689   if (IsGFX10Plus)
1690     MIB.addImm(DimInfo->Encoding);
1691   MIB.addImm(Unorm);
1692   if (IsGFX10Plus)
1693     MIB.addImm(DLC);
1694 
1695   MIB.addImm(GLC);
1696   MIB.addImm(SLC);
1697   MIB.addImm(IsA16 &&  // a16 or r128
1698              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1699   if (IsGFX10Plus)
1700     MIB.addImm(IsA16 ? -1 : 0);
1701 
1702   MIB.addImm(TFE); // tfe
1703   MIB.addImm(LWE); // lwe
1704   if (!IsGFX10Plus)
1705     MIB.addImm(DimInfo->DA ? -1 : 0);
1706   if (BaseOpcode->HasD16)
1707     MIB.addImm(IsD16 ? -1 : 0);
1708 
1709   MI.eraseFromParent();
1710   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1711 }
1712 
1713 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1714     MachineInstr &I) const {
1715   unsigned IntrinsicID = I.getIntrinsicID();
1716   switch (IntrinsicID) {
1717   case Intrinsic::amdgcn_end_cf:
1718     return selectEndCfIntrinsic(I);
1719   case Intrinsic::amdgcn_ds_ordered_add:
1720   case Intrinsic::amdgcn_ds_ordered_swap:
1721     return selectDSOrderedIntrinsic(I, IntrinsicID);
1722   case Intrinsic::amdgcn_ds_gws_init:
1723   case Intrinsic::amdgcn_ds_gws_barrier:
1724   case Intrinsic::amdgcn_ds_gws_sema_v:
1725   case Intrinsic::amdgcn_ds_gws_sema_br:
1726   case Intrinsic::amdgcn_ds_gws_sema_p:
1727   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1728     return selectDSGWSIntrinsic(I, IntrinsicID);
1729   case Intrinsic::amdgcn_ds_append:
1730     return selectDSAppendConsume(I, true);
1731   case Intrinsic::amdgcn_ds_consume:
1732     return selectDSAppendConsume(I, false);
1733   case Intrinsic::amdgcn_s_barrier:
1734     return selectSBarrier(I);
1735   case Intrinsic::amdgcn_global_atomic_fadd:
1736     return selectGlobalAtomicFaddIntrinsic(I);
1737   default: {
1738     return selectImpl(I, *CoverageInfo);
1739   }
1740   }
1741 }
1742 
1743 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1744   if (selectImpl(I, *CoverageInfo))
1745     return true;
1746 
1747   MachineBasicBlock *BB = I.getParent();
1748   const DebugLoc &DL = I.getDebugLoc();
1749 
1750   Register DstReg = I.getOperand(0).getReg();
1751   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1752   assert(Size <= 32 || Size == 64);
1753   const MachineOperand &CCOp = I.getOperand(1);
1754   Register CCReg = CCOp.getReg();
1755   if (!isVCC(CCReg, *MRI)) {
1756     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1757                                          AMDGPU::S_CSELECT_B32;
1758     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1759             .addReg(CCReg);
1760 
1761     // The generic constrainSelectedInstRegOperands doesn't work for the scc register
1762     // bank, because it does not cover the register class that we use to represent
1763     // it. So we need to manually set the register class here.
1764     if (!MRI->getRegClassOrNull(CCReg))
1765         MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1766     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1767             .add(I.getOperand(2))
1768             .add(I.getOperand(3));
1769 
1770     bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1771                constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1772     I.eraseFromParent();
1773     return Ret;
1774   }
1775 
1776   // Wide VGPR select should have been split in RegBankSelect.
1777   if (Size > 32)
1778     return false;
1779 
1780   MachineInstr *Select =
1781       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1782               .addImm(0)
1783               .add(I.getOperand(3))
1784               .addImm(0)
1785               .add(I.getOperand(2))
1786               .add(I.getOperand(1));
1787 
1788   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1789   I.eraseFromParent();
1790   return Ret;
1791 }
1792 
1793 static int sizeToSubRegIndex(unsigned Size) {
1794   switch (Size) {
1795   case 32:
1796     return AMDGPU::sub0;
1797   case 64:
1798     return AMDGPU::sub0_sub1;
1799   case 96:
1800     return AMDGPU::sub0_sub1_sub2;
1801   case 128:
1802     return AMDGPU::sub0_sub1_sub2_sub3;
1803   case 256:
1804     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1805   default:
1806     if (Size < 32)
1807       return AMDGPU::sub0;
1808     if (Size > 256)
1809       return -1;
1810     return sizeToSubRegIndex(PowerOf2Ceil(Size));
1811   }
1812 }
1813 
1814 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1815   Register DstReg = I.getOperand(0).getReg();
1816   Register SrcReg = I.getOperand(1).getReg();
1817   const LLT DstTy = MRI->getType(DstReg);
1818   const LLT SrcTy = MRI->getType(SrcReg);
1819   const LLT S1 = LLT::scalar(1);
1820 
1821   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1822   const RegisterBank *DstRB;
1823   if (DstTy == S1) {
1824     // This is a special case. We don't treat s1 for legalization artifacts as
1825     // vcc booleans.
1826     DstRB = SrcRB;
1827   } else {
1828     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1829     if (SrcRB != DstRB)
1830       return false;
1831   }
1832 
1833   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1834 
1835   unsigned DstSize = DstTy.getSizeInBits();
1836   unsigned SrcSize = SrcTy.getSizeInBits();
1837 
1838   const TargetRegisterClass *SrcRC
1839     = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1840   const TargetRegisterClass *DstRC
1841     = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1842   if (!SrcRC || !DstRC)
1843     return false;
1844 
1845   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1846       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1847     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1848     return false;
1849   }
1850 
1851   if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1852     MachineBasicBlock *MBB = I.getParent();
1853     const DebugLoc &DL = I.getDebugLoc();
1854 
1855     Register LoReg = MRI->createVirtualRegister(DstRC);
1856     Register HiReg = MRI->createVirtualRegister(DstRC);
1857     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1858       .addReg(SrcReg, 0, AMDGPU::sub0);
1859     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1860       .addReg(SrcReg, 0, AMDGPU::sub1);
1861 
1862     if (IsVALU && STI.hasSDWA()) {
1863       // Write the low 16 bits of the high element into the high 16 bits of the
1864       // low element.
1865       MachineInstr *MovSDWA =
1866         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1867         .addImm(0)                             // $src0_modifiers
1868         .addReg(HiReg)                         // $src0
1869         .addImm(0)                             // $clamp
1870         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
1871         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1872         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
1873         .addReg(LoReg, RegState::Implicit);
1874       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1875     } else {
1876       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1877       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1878       Register ImmReg = MRI->createVirtualRegister(DstRC);
1879       if (IsVALU) {
1880         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1881           .addImm(16)
1882           .addReg(HiReg);
1883       } else {
1884         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1885           .addReg(HiReg)
1886           .addImm(16);
1887       }
1888 
1889       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1890       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1891       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1892 
1893       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1894         .addImm(0xffff);
1895       BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1896         .addReg(LoReg)
1897         .addReg(ImmReg);
1898       BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1899         .addReg(TmpReg0)
1900         .addReg(TmpReg1);
1901     }
1902 
1903     I.eraseFromParent();
1904     return true;
1905   }
1906 
1907   if (!DstTy.isScalar())
1908     return false;
1909 
1910   if (SrcSize > 32) {
1911     int SubRegIdx = sizeToSubRegIndex(DstSize);
1912     if (SubRegIdx == -1)
1913       return false;
1914 
1915     // Deal with weird cases where the class only partially supports the subreg
1916     // index.
1917     const TargetRegisterClass *SrcWithSubRC
1918       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1919     if (!SrcWithSubRC)
1920       return false;
1921 
1922     if (SrcWithSubRC != SrcRC) {
1923       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1924         return false;
1925     }
1926 
1927     I.getOperand(1).setSubReg(SubRegIdx);
1928   }
1929 
1930   I.setDesc(TII.get(TargetOpcode::COPY));
1931   return true;
1932 }
1933 
1934 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
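     /// Illustrative: Size == 4 gives Mask == 0xf, which falls in the inline range
     /// [-16, 64] checked below, so an AND is preferred; Size == 16 gives 0xffff,
     /// which does not, and callers fall back to a BFE.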
1935 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1936   Mask = maskTrailingOnes<unsigned>(Size);
1937   int SignedMask = static_cast<int>(Mask);
1938   return SignedMask >= -16 && SignedMask <= 64;
1939 }
1940 
1941 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1942 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1943   Register Reg, const MachineRegisterInfo &MRI,
1944   const TargetRegisterInfo &TRI) const {
1945   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1946   if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1947     return RB;
1948 
1949   // Ignore the type, since we don't use vcc in artifacts.
1950   if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1951     return &RBI.getRegBankFromRegClass(*RC, LLT());
1952   return nullptr;
1953 }
1954 
1955 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1956   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1957   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1958   const DebugLoc &DL = I.getDebugLoc();
1959   MachineBasicBlock &MBB = *I.getParent();
1960   const Register DstReg = I.getOperand(0).getReg();
1961   const Register SrcReg = I.getOperand(1).getReg();
1962 
1963   const LLT DstTy = MRI->getType(DstReg);
1964   const LLT SrcTy = MRI->getType(SrcReg);
1965   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1966     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1967   const unsigned DstSize = DstTy.getSizeInBits();
1968   if (!DstTy.isScalar())
1969     return false;
1970 
1971   // Artifact casts should never use vcc.
1972   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1973 
1974   // FIXME: This should probably be illegal and split earlier.
1975   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1976     if (DstSize <= 32)
1977       return selectCOPY(I);
1978 
1979     const TargetRegisterClass *SrcRC =
1980         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1981     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1982     const TargetRegisterClass *DstRC =
1983         TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
1984 
1985     Register UndefReg = MRI->createVirtualRegister(SrcRC);
1986     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1987     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1988       .addReg(SrcReg)
1989       .addImm(AMDGPU::sub0)
1990       .addReg(UndefReg)
1991       .addImm(AMDGPU::sub1);
1992     I.eraseFromParent();
1993 
1994     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
1995            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
1996   }
1997 
1998   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
1999     // The 64-bit case should have been split up in RegBankSelect.
2000 
2001     // Try to use an and with a mask if it will save code size.
2002     unsigned Mask;
2003     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2004       MachineInstr *ExtI =
2005       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2006         .addImm(Mask)
2007         .addReg(SrcReg);
2008       I.eraseFromParent();
2009       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2010     }
2011 
2012     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2013     MachineInstr *ExtI =
2014       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2015       .addReg(SrcReg)
2016       .addImm(0) // Offset
2017       .addImm(SrcSize); // Width
2018     I.eraseFromParent();
2019     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2020   }
2021 
2022   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2023     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2024       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2025     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2026       return false;
2027 
2028     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2029       const unsigned SextOpc = SrcSize == 8 ?
2030         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2031       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2032         .addReg(SrcReg);
2033       I.eraseFromParent();
2034       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2035     }
2036 
2037     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2038     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2039 
2040     // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
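         // Illustrative: SrcSize == 16 yields the immediate (16 << 16) == 0x100000,
         // i.e. offset 0 in bits [5:0] and width 16 in bits [22:16].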
2041     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2042       // We need a 64-bit register source, but the high bits don't matter.
2043       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2044       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2045       unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
2046 
2047       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2048       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2049         .addReg(SrcReg, 0, SubReg)
2050         .addImm(AMDGPU::sub0)
2051         .addReg(UndefReg)
2052         .addImm(AMDGPU::sub1);
2053 
2054       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2055         .addReg(ExtReg)
2056         .addImm(SrcSize << 16);
2057 
2058       I.eraseFromParent();
2059       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2060     }
2061 
2062     unsigned Mask;
2063     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2064       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2065         .addReg(SrcReg)
2066         .addImm(Mask);
2067     } else {
2068       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2069         .addReg(SrcReg)
2070         .addImm(SrcSize << 16);
2071     }
2072 
2073     I.eraseFromParent();
2074     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2075   }
2076 
2077   return false;
2078 }
2079 
2080 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2081   MachineBasicBlock *BB = I.getParent();
2082   MachineOperand &ImmOp = I.getOperand(1);
2083   Register DstReg = I.getOperand(0).getReg();
2084   unsigned Size = MRI->getType(DstReg).getSizeInBits();
2085 
2086   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2087   if (ImmOp.isFPImm()) {
2088     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2089     ImmOp.ChangeToImmediate(Imm.getZExtValue());
2090   } else if (ImmOp.isCImm()) {
2091     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2092   } else {
2093     llvm_unreachable("Not supported by g_constants");
2094   }
2095 
2096   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2097   const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2098 
2099   unsigned Opcode;
2100   if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2101     Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2102   } else {
2103     Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2104 
2105     // We should never produce s1 values on banks other than VCC. If the user of
2106     // this already constrained the register, we may incorrectly think it's VCC
2107     // if it wasn't originally.
2108     if (Size == 1)
2109       return false;
2110   }
2111 
2112   if (Size != 64) {
2113     I.setDesc(TII.get(Opcode));
2114     I.addImplicitDefUseOperands(*MF);
2115     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2116   }
2117 
2118   const DebugLoc &DL = I.getDebugLoc();
2119 
2120   APInt Imm(Size, I.getOperand(1).getImm());
2121 
2122   MachineInstr *ResInst;
2123   if (IsSgpr && TII.isInlineConstant(Imm)) {
2124     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2125       .addImm(I.getOperand(1).getImm());
2126   } else {
2127     const TargetRegisterClass *RC = IsSgpr ?
2128       &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2129     Register LoReg = MRI->createVirtualRegister(RC);
2130     Register HiReg = MRI->createVirtualRegister(RC);
2131 
2132     BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2133       .addImm(Imm.trunc(32).getZExtValue());
2134 
2135     BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2136       .addImm(Imm.ashr(32).getZExtValue());
2137 
2138     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2139       .addReg(LoReg)
2140       .addImm(AMDGPU::sub0)
2141       .addReg(HiReg)
2142       .addImm(AMDGPU::sub1);
2143   }
2144 
2145   // We can't call constrainSelectedInstRegOperands here, because it doesn't
2146   // work for target-independent opcodes.
2147   I.eraseFromParent();
2148   const TargetRegisterClass *DstRC =
2149     TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2150   if (!DstRC)
2151     return true;
2152   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2153 }
2154 
2155 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2156   // Only manually handle the f64 SGPR case.
2157   //
2158   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2159   // the bit ops theoretically have a second result due to the implicit def of
2160   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2161   // that is easy by disabling the check. The result works, but uses a
2162   // nonsensical sreg32orlds_and_sreg_1 regclass.
2163   //
2164   // The DAG emitter is more problematic, and incorrectly adds both results of the
2165   // S_XOR_B32 to the variadic REG_SEQUENCE operands.
2166 
2167   Register Dst = MI.getOperand(0).getReg();
2168   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2169   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2170       MRI->getType(Dst) != LLT::scalar(64))
2171     return false;
2172 
2173   Register Src = MI.getOperand(1).getReg();
2174   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2175   if (Fabs)
2176     Src = Fabs->getOperand(1).getReg();
2177 
2178   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2179       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2180     return false;
2181 
2182   MachineBasicBlock *BB = MI.getParent();
2183   const DebugLoc &DL = MI.getDebugLoc();
2184   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2185   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2186   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2187   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2188 
2189   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2190     .addReg(Src, 0, AMDGPU::sub0);
2191   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2192     .addReg(Src, 0, AMDGPU::sub1);
2193   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2194     .addImm(0x80000000);
2195 
2196   // Set or toggle sign bit.
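       // (0x80000000 is the sign bit of the high dword of the f64: OR forces it on
       // for the folded fneg(fabs(x)) case, XOR flips it for a plain fneg.)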
2197   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2198   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2199     .addReg(HiReg)
2200     .addReg(ConstReg);
2201   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2202     .addReg(LoReg)
2203     .addImm(AMDGPU::sub0)
2204     .addReg(OpReg)
2205     .addImm(AMDGPU::sub1);
2206   MI.eraseFromParent();
2207   return true;
2208 }
2209 
2210 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2211 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2212   Register Dst = MI.getOperand(0).getReg();
2213   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2214   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2215       MRI->getType(Dst) != LLT::scalar(64))
2216     return false;
2217 
2218   Register Src = MI.getOperand(1).getReg();
2219   MachineBasicBlock *BB = MI.getParent();
2220   const DebugLoc &DL = MI.getDebugLoc();
2221   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2222   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2223   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2224   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2225 
2226   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2227       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2228     return false;
2229 
2230   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2231     .addReg(Src, 0, AMDGPU::sub0);
2232   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2233     .addReg(Src, 0, AMDGPU::sub1);
2234   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2235     .addImm(0x7fffffff);
2236 
2237   // Clear sign bit.
2238   // TODO: Should this use S_BITSET0_*?
2239   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2240     .addReg(HiReg)
2241     .addReg(ConstReg);
2242   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2243     .addReg(LoReg)
2244     .addImm(AMDGPU::sub0)
2245     .addReg(OpReg)
2246     .addImm(AMDGPU::sub1);
2247 
2248   MI.eraseFromParent();
2249   return true;
2250 }
2251 
2252 static bool isConstant(const MachineInstr &MI) {
2253   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2254 }
2255 
2256 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2257     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2258 
2259   const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2260 
2261   assert(PtrMI);
2262 
2263   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2264     return;
2265 
2266   GEPInfo GEPInfo(*PtrMI);
2267 
2268   for (unsigned i = 1; i != 3; ++i) {
2269     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2270     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2271     assert(OpDef);
2272     if (i == 2 && isConstant(*OpDef)) {
2273       // TODO: Could handle constant base + variable offset, but a combine
2274       // probably should have commuted it.
2275       assert(GEPInfo.Imm == 0);
2276       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2277       continue;
2278     }
2279     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2280     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2281       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2282     else
2283       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2284   }
2285 
2286   AddrInfo.push_back(GEPInfo);
2287   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2288 }
2289 
2290 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2291   return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2292 }
2293 
2294 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2295   if (!MI.hasOneMemOperand())
2296     return false;
2297 
2298   const MachineMemOperand *MMO = *MI.memoperands_begin();
2299   const Value *Ptr = MMO->getValue();
2300 
2301   // UndefValue means this is a load of a kernel input.  These are uniform.
2302   // Sometimes LDS instructions have constant pointers.
2303   // If Ptr is null, then that means this mem operand contains a
2304   // PseudoSourceValue like GOT.
2305   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2306       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2307     return true;
2308 
2309   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2310     return true;
2311 
2312   const Instruction *I = dyn_cast<Instruction>(Ptr);
2313   return I && I->getMetadata("amdgpu.uniform");
2314 }
2315 
2316 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2317   for (const GEPInfo &GEPInfo : AddrInfo) {
2318     if (!GEPInfo.VgprParts.empty())
2319       return true;
2320   }
2321   return false;
2322 }
2323 
2324 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2325   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2326   unsigned AS = PtrTy.getAddressSpace();
2327   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2328       STI.ldsRequiresM0Init()) {
2329     MachineBasicBlock *BB = I.getParent();
2330 
2331     // If DS instructions require M0 initialization, insert it before selecting.
2332     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2333       .addImm(-1);
2334   }
2335 }
2336 
2337 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2338   MachineInstr &I) const {
2339   initM0(I);
2340   return selectImpl(I, *CoverageInfo);
2341 }
2342 
2343 // TODO: No rtn optimization.
2344 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2345   MachineInstr &MI) const {
2346   Register PtrReg = MI.getOperand(1).getReg();
2347   const LLT PtrTy = MRI->getType(PtrReg);
2348   if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2349       STI.useFlatForGlobal())
2350     return selectImpl(MI, *CoverageInfo);
2351 
2352   Register DstReg = MI.getOperand(0).getReg();
2353   const LLT Ty = MRI->getType(DstReg);
2354   const bool Is64 = Ty.getSizeInBits() == 64;
2355   const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2356   Register TmpReg = MRI->createVirtualRegister(
2357     Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
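       // The returning (RTN) cmpswap uses a destination twice the value width; the
       // previous memory value is then extracted from its low half by the SubReg
       // copy below.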
2358 
2359   const DebugLoc &DL = MI.getDebugLoc();
2360   MachineBasicBlock *BB = MI.getParent();
2361 
2362   Register VAddr, RSrcReg, SOffset;
2363   int64_t Offset = 0;
2364 
2365   unsigned Opcode;
2366   if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2367     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2368                     AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2369   } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2370                                    RSrcReg, SOffset, Offset)) {
2371     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2372                     AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2373   } else
2374     return selectImpl(MI, *CoverageInfo);
2375 
2376   auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2377     .addReg(MI.getOperand(2).getReg());
2378 
2379   if (VAddr)
2380     MIB.addReg(VAddr);
2381 
2382   MIB.addReg(RSrcReg);
2383   if (SOffset)
2384     MIB.addReg(SOffset);
2385   else
2386     MIB.addImm(0);
2387 
2388   MIB.addImm(Offset);
2389   MIB.addImm(1); // glc
2390   MIB.addImm(0); // slc
2391   MIB.cloneMemRefs(MI);
2392 
2393   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2394     .addReg(TmpReg, RegState::Kill, SubReg);
2395 
2396   MI.eraseFromParent();
2397 
2398   MRI->setRegClass(
2399     DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2400   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2401 }
2402 
2403 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2404   MachineBasicBlock *BB = I.getParent();
2405   MachineOperand &CondOp = I.getOperand(0);
2406   Register CondReg = CondOp.getReg();
2407   const DebugLoc &DL = I.getDebugLoc();
2408 
2409   unsigned BrOpcode;
2410   Register CondPhysReg;
2411   const TargetRegisterClass *ConstrainRC;
2412 
2413   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2414   // whether the branch is uniform when selecting the instruction. In
2415   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2416   // RegBankSelect knows what it's doing if the branch condition is scc, even
2417   // though it currently does not.
2418   if (!isVCC(CondReg, *MRI)) {
2419     if (MRI->getType(CondReg) != LLT::scalar(32))
2420       return false;
2421 
2422     CondPhysReg = AMDGPU::SCC;
2423     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2424     ConstrainRC = &AMDGPU::SReg_32RegClass;
2425   } else {
2426     // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
2427     // Based on the register bank, we sort of know that a VCC producer ands
2428     // inactive lanes with 0. What if there was a logical operation with vcc
2429     // producers in different blocks/with different exec masks?
2430     // FIXME: Should scc->vcc copies and with exec?
2431     CondPhysReg = TRI.getVCC();
2432     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2433     ConstrainRC = TRI.getBoolRC();
2434   }
2435 
2436   if (!MRI->getRegClassOrNull(CondReg))
2437     MRI->setRegClass(CondReg, ConstrainRC);
2438 
2439   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2440     .addReg(CondReg);
2441   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2442     .addMBB(I.getOperand(1).getMBB());
2443 
2444   I.eraseFromParent();
2445   return true;
2446 }
2447 
2448 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2449   MachineInstr &I) const {
2450   Register DstReg = I.getOperand(0).getReg();
2451   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2452   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2453   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2454   if (IsVGPR)
2455     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2456 
2457   return RBI.constrainGenericRegister(
2458     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2459 }
2460 
2461 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2462   Register DstReg = I.getOperand(0).getReg();
2463   Register SrcReg = I.getOperand(1).getReg();
2464   Register MaskReg = I.getOperand(2).getReg();
2465   LLT Ty = MRI->getType(DstReg);
2466   LLT MaskTy = MRI->getType(MaskReg);
2467 
2468   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2469   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2470   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2471   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2472   if (DstRB != SrcRB) // Should only happen for hand written MIR.
2473     return false;
2474 
2475   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2476   const TargetRegisterClass &RegRC
2477     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2478 
2479   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2480                                                                   *MRI);
2481   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2482                                                                   *MRI);
2483   const TargetRegisterClass *MaskRC =
2484       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2485 
2486   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2487       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2488       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2489     return false;
2490 
2491   MachineBasicBlock *BB = I.getParent();
2492   const DebugLoc &DL = I.getDebugLoc();
2493   if (Ty.getSizeInBits() == 32) {
2494     assert(MaskTy.getSizeInBits() == 32 &&
2495            "ptrmask should have been narrowed during legalize");
2496 
2497     BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2498       .addReg(SrcReg)
2499       .addReg(MaskReg);
2500     I.eraseFromParent();
2501     return true;
2502   }
2503 
2504   Register HiReg = MRI->createVirtualRegister(&RegRC);
2505   Register LoReg = MRI->createVirtualRegister(&RegRC);
2506 
2507   // Extract the subregisters from the source pointer.
2508   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2509     .addReg(SrcReg, 0, AMDGPU::sub0);
2510   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2511     .addReg(SrcReg, 0, AMDGPU::sub1);
2512 
2513   Register MaskedLo, MaskedHi;
2514 
2515   // Try to avoid emitting a bit operation when we only need to touch half of
2516   // the 64-bit pointer.
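       // Illustrative: with a constant mask such as 0xffffffffffff0000 the known ones
       // cover the entire high half, so the high word needs only a COPY and just the
       // low half gets an AND.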
2517   APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2518 
2519   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2520   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2521   if ((MaskOnes & MaskLo32) == MaskLo32) {
2522     // If all the bits in the low half are 1, we only need a copy for it.
2523     MaskedLo = LoReg;
2524   } else {
2525     // Extract the mask subregister and apply the and.
2526     Register MaskLo = MRI->createVirtualRegister(&RegRC);
2527     MaskedLo = MRI->createVirtualRegister(&RegRC);
2528 
2529     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2530       .addReg(MaskReg, 0, AMDGPU::sub0);
2531     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2532       .addReg(LoReg)
2533       .addReg(MaskLo);
2534   }
2535 
2536   if ((MaskOnes & MaskHi32) == MaskHi32) {
2537     // If all the bits in the high half are 1, we only need a copy for it.
2538     MaskedHi = HiReg;
2539   } else {
2540     Register MaskHi = MRI->createVirtualRegister(&RegRC);
2541     MaskedHi = MRI->createVirtualRegister(&RegRC);
2542 
2543     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2544       .addReg(MaskReg, 0, AMDGPU::sub1);
2545     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2546       .addReg(HiReg)
2547       .addReg(MaskHi);
2548   }
2549 
2550   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2551     .addReg(MaskedLo)
2552     .addImm(AMDGPU::sub0)
2553     .addReg(MaskedHi)
2554     .addImm(AMDGPU::sub1);
2555   I.eraseFromParent();
2556   return true;
2557 }
2558 
2559 /// Return the register to use for the index value, and the subregister to use
2560 /// for the indirectly accessed register.
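     /// Illustrative: for 32-bit elements (EltSize == 4) of a 128-bit super-register,
     /// an index defined as (base + 2) comes back as {base, sub2}, assuming the split
     /// parts for that class are {sub0, sub1, sub2, sub3}.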
2561 static std::pair<Register, unsigned>
2562 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2563                         const SIRegisterInfo &TRI,
2564                         const TargetRegisterClass *SuperRC,
2565                         Register IdxReg,
2566                         unsigned EltSize) {
2567   Register IdxBaseReg;
2568   int Offset;
2569 
2570   std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2571   if (IdxBaseReg == AMDGPU::NoRegister) {
2572     // This will happen if the index is a known constant. This should ordinarily
2573     // be legalized out, but handle it as a register just in case.
2574     assert(Offset == 0);
2575     IdxBaseReg = IdxReg;
2576   }
2577 
2578   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2579 
2580   // Skip out of bounds offsets, or else we would end up using an undefined
2581   // register.
2582   if (static_cast<unsigned>(Offset) >= SubRegs.size())
2583     return std::make_pair(IdxReg, SubRegs[0]);
2584   return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2585 }
2586 
2587 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2588   MachineInstr &MI) const {
2589   Register DstReg = MI.getOperand(0).getReg();
2590   Register SrcReg = MI.getOperand(1).getReg();
2591   Register IdxReg = MI.getOperand(2).getReg();
2592 
2593   LLT DstTy = MRI->getType(DstReg);
2594   LLT SrcTy = MRI->getType(SrcReg);
2595 
2596   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2597   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2598   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2599 
2600   // The index must be scalar. If it wasn't, RegBankSelect should have moved this
2601   // into a waterfall loop.
2602   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2603     return false;
2604 
2605   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2606                                                                   *MRI);
2607   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2608                                                                   *MRI);
2609   if (!SrcRC || !DstRC)
2610     return false;
2611   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2612       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2613       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2614     return false;
2615 
2616   MachineBasicBlock *BB = MI.getParent();
2617   const DebugLoc &DL = MI.getDebugLoc();
2618   const bool Is64 = DstTy.getSizeInBits() == 64;
2619 
2620   unsigned SubReg;
2621   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2622                                                      DstTy.getSizeInBits() / 8);
2623 
2624   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2625     if (DstTy.getSizeInBits() != 32 && !Is64)
2626       return false;
2627 
2628     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2629       .addReg(IdxReg);
2630 
2631     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2632     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2633       .addReg(SrcReg, 0, SubReg)
2634       .addReg(SrcReg, RegState::Implicit);
2635     MI.eraseFromParent();
2636     return true;
2637   }
2638 
2639   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2640     return false;
2641 
2642   if (!STI.useVGPRIndexMode()) {
2643     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2644       .addReg(IdxReg);
2645     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2646       .addReg(SrcReg, 0, SubReg)
2647       .addReg(SrcReg, RegState::Implicit);
2648     MI.eraseFromParent();
2649     return true;
2650   }
2651 
2652   const MCInstrDesc &GPRIDXDesc =
2653       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
2654   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2655       .addReg(SrcReg)
2656       .addReg(IdxReg)
2657       .addImm(SubReg);
2658 
2659   MI.eraseFromParent();
2660   return true;
2661 }
2662 
2663 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2664 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2665   MachineInstr &MI) const {
2666   Register DstReg = MI.getOperand(0).getReg();
2667   Register VecReg = MI.getOperand(1).getReg();
2668   Register ValReg = MI.getOperand(2).getReg();
2669   Register IdxReg = MI.getOperand(3).getReg();
2670 
2671   LLT VecTy = MRI->getType(DstReg);
2672   LLT ValTy = MRI->getType(ValReg);
2673   unsigned VecSize = VecTy.getSizeInBits();
2674   unsigned ValSize = ValTy.getSizeInBits();
2675 
2676   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2677   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2678   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2679 
2680   assert(VecTy.getElementType() == ValTy);
2681 
2682   // The index must be scalar. If it wasn't, RegBankSelect should have moved this
2683   // into a waterfall loop.
2684   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2685     return false;
2686 
2687   const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2688                                                                   *MRI);
2689   const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2690                                                                   *MRI);
2691 
2692   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2693       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2694       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2695       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2696     return false;
2697 
2698   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2699     return false;
2700 
2701   unsigned SubReg;
2702   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2703                                                      ValSize / 8);
2704 
2705   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2706                          STI.useVGPRIndexMode();
2707 
2708   MachineBasicBlock *BB = MI.getParent();
2709   const DebugLoc &DL = MI.getDebugLoc();
2710 
2711   if (!IndexMode) {
2712     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2713       .addReg(IdxReg);
2714 
2715     const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
2716         VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
2717     BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2718         .addReg(VecReg)
2719         .addReg(ValReg)
2720         .addImm(SubReg);
2721     MI.eraseFromParent();
2722     return true;
2723   }
2724 
2725   const MCInstrDesc &GPRIDXDesc =
2726       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
2727   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2728       .addReg(VecReg)
2729       .addReg(ValReg)
2730       .addReg(IdxReg)
2731       .addImm(SubReg);
2732 
2733   MI.eraseFromParent();
2734   return true;
2735 }
2736 
2737 static bool isZeroOrUndef(int X) {
2738   return X == 0 || X == -1;
2739 }
2740 
2741 static bool isOneOrUndef(int X) {
2742   return X == 1 || X == -1;
2743 }
2744 
2745 static bool isZeroOrOneOrUndef(int X) {
2746   return X == 0 || X == 1 || X == -1;
2747 }
2748 
2749 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2750 // 32-bit register.
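     // Illustrative: a mask of <3, 2> reads only Src1, so it is rewritten to <1, 0>
     // and Src1 is returned; <0, -1> already refers to Src0 and is returned unchanged.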
2751 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2752                                    ArrayRef<int> Mask) {
2753   NewMask[0] = Mask[0];
2754   NewMask[1] = Mask[1];
2755   if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2756     return Src0;
2757 
2758   assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2759   assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2760 
2761   // Shift the mask inputs to be 0/1.
2762   NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2763   NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2764   return Src1;
2765 }
2766 
2767 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2768 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2769   MachineInstr &MI) const {
2770   Register DstReg = MI.getOperand(0).getReg();
2771   Register Src0Reg = MI.getOperand(1).getReg();
2772   Register Src1Reg = MI.getOperand(2).getReg();
2773   ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2774 
2775   const LLT V2S16 = LLT::vector(2, 16);
2776   if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2777     return false;
2778 
2779   if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2780     return false;
2781 
2782   assert(ShufMask.size() == 2);
2783   assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2784 
2785   MachineBasicBlock *MBB = MI.getParent();
2786   const DebugLoc &DL = MI.getDebugLoc();
2787 
2788   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2789   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2790   const TargetRegisterClass &RC = IsVALU ?
2791     AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2792 
2793   // Handle the degenerate case which should have folded out.
2794   if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2795     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2796 
2797     MI.eraseFromParent();
2798     return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2799   }
2800 
2801   // A legal VOP3P mask only reads one of the sources.
2802   int Mask[2];
2803   Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2804 
2805   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2806       !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2807     return false;
2808 
2809   // TODO: This also should have been folded out
2810   if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2811     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2812       .addReg(SrcVec);
2813 
2814     MI.eraseFromParent();
2815     return true;
2816   }
2817 
2818   if (Mask[0] == 1 && Mask[1] == -1) {
2819     if (IsVALU) {
2820       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2821         .addImm(16)
2822         .addReg(SrcVec);
2823     } else {
2824       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2825         .addReg(SrcVec)
2826         .addImm(16);
2827     }
2828   } else if (Mask[0] == -1 && Mask[1] == 0) {
2829     if (IsVALU) {
2830       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2831         .addImm(16)
2832         .addReg(SrcVec);
2833     } else {
2834       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2835         .addReg(SrcVec)
2836         .addImm(16);
2837     }
2838   } else if (Mask[0] == 0 && Mask[1] == 0) {
2839     if (IsVALU) {
2840       // Write low half of the register into the high half.
2841       MachineInstr *MovSDWA =
2842         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2843         .addImm(0)                             // $src0_modifiers
2844         .addReg(SrcVec)                        // $src0
2845         .addImm(0)                             // $clamp
2846         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2847         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2848         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2849         .addReg(SrcVec, RegState::Implicit);
2850       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2851     } else {
2852       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2853         .addReg(SrcVec)
2854         .addReg(SrcVec);
2855     }
2856   } else if (Mask[0] == 1 && Mask[1] == 1) {
2857     if (IsVALU) {
2858       // Write high half of the register into the low half.
2859       MachineInstr *MovSDWA =
2860         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2861         .addImm(0)                             // $src0_modifiers
2862         .addReg(SrcVec)                        // $src0
2863         .addImm(0)                             // $clamp
2864         .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
2865         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2866         .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
2867         .addReg(SrcVec, RegState::Implicit);
2868       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2869     } else {
2870       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2871         .addReg(SrcVec)
2872         .addReg(SrcVec);
2873     }
2874   } else if (Mask[0] == 1 && Mask[1] == 0) {
2875     if (IsVALU) {
2876       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg)
2877         .addReg(SrcVec)
2878         .addReg(SrcVec)
2879         .addImm(16);
2880     } else {
2881       Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2882       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2883         .addReg(SrcVec)
2884         .addImm(16);
2885       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2886         .addReg(TmpReg)
2887         .addReg(SrcVec);
2888     }
2889   } else
2890     llvm_unreachable("all shuffle masks should be handled");
2891 
2892   MI.eraseFromParent();
2893   return true;
2894 }
2895 
2896 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
2897   MachineInstr &MI) const {
2898 
2899   MachineBasicBlock *MBB = MI.getParent();
2900   const DebugLoc &DL = MI.getDebugLoc();
2901 
2902   if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
2903     Function &F = MBB->getParent()->getFunction();
2904     DiagnosticInfoUnsupported
2905       NoFpRet(F, "return versions of fp atomics not supported",
2906               MI.getDebugLoc(), DS_Error);
2907     F.getContext().diagnose(NoFpRet);
2908     return false;
2909   }
2910 
2911   // FIXME: This is only needed because tablegen requires the number of dst
2912   // operands in the match and replace patterns to be the same. Otherwise these
2913   // patterns could be exported from the SDag path.
2914   MachineOperand &VDataIn = MI.getOperand(1);
2915   MachineOperand &VIndex = MI.getOperand(3);
2916   MachineOperand &VOffset = MI.getOperand(4);
2917   MachineOperand &SOffset = MI.getOperand(5);
2918   int16_t Offset = MI.getOperand(6).getImm();
2919 
2920   bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
2921   bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);
2922 
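  // Pick the MUBUF addressing variant: BOTHEN takes both vindex and voffset,
  // OFFEN only voffset, IDXEN only vindex, and OFFSET neither.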
2923   unsigned Opcode;
2924   if (HasVOffset) {
2925     Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
2926                        : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
2927   } else {
2928     Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
2929                        : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
2930   }
2931 
2932   if (MRI->getType(VDataIn.getReg()).isVector()) {
2933     switch (Opcode) {
2934     case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
2935       Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
2936       break;
2937     case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
2938       Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
2939       break;
2940     case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
2941       Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
2942       break;
2943     case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
2944       Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
2945       break;
2946     }
2947   }
2948 
2949   auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
2950   I.add(VDataIn);
2951 
2952   if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
2953       Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
2954     Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
2955     BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
2956       .addReg(VIndex.getReg())
2957       .addImm(AMDGPU::sub0)
2958       .addReg(VOffset.getReg())
2959       .addImm(AMDGPU::sub1);
2960 
2961     I.addReg(IdxReg);
2962   } else if (HasVIndex) {
2963     I.add(VIndex);
2964   } else if (HasVOffset) {
2965     I.add(VOffset);
2966   }
2967 
2968   I.add(MI.getOperand(2)); // rsrc
2969   I.add(SOffset);
2970   I.addImm(Offset);
2971   renderExtractSLC(I, MI, 7);
2972   I.cloneMemRefs(MI);
2973 
2974   MI.eraseFromParent();
2975 
2976   return true;
2977 }
2978 
2979 bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
2980   MachineInstr &MI) const {
2981 
2982   MachineBasicBlock *MBB = MI.getParent();
2983   const DebugLoc &DL = MI.getDebugLoc();
2984 
2985   if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
2986     Function &F = MBB->getParent()->getFunction();
2987     DiagnosticInfoUnsupported
2988       NoFpRet(F, "return versions of fp atomics not supported",
2989               MI.getDebugLoc(), DS_Error);
2990     F.getContext().diagnose(NoFpRet);
2991     return false;
2992   }
2993 
2994   // FIXME: This is only needed because tablegen requires the number of dst
2995   // operands in the match and replace patterns to be the same. Otherwise these
2996   // patterns could be exported from the SDag path.
2997   auto Addr = selectFlatOffsetImpl<true>(MI.getOperand(2));
2998 
2999   Register Data = MI.getOperand(3).getReg();
3000   const unsigned Opc = MRI->getType(Data).isVector() ?
3001     AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
3002   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3003     .addReg(Addr.first)
3004     .addReg(Data)
3005     .addImm(Addr.second)
3006     .addImm(0) // SLC
3007     .cloneMemRefs(MI);
3008 
3009   MI.eraseFromParent();
3010   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3011 }
3012 
3013 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
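// The concrete target opcode for the BVH intersect_ray instruction is carried
// as an immediate in operand 1; selection just installs it as the instruction
// descriptor and drops that operand.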
3014   MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3015   MI.RemoveOperand(1);
3016   MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3017   return true;
3018 }
3019 
3020 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3021   if (I.isPHI())
3022     return selectPHI(I);
3023 
3024   if (!I.isPreISelOpcode()) {
3025     if (I.isCopy())
3026       return selectCOPY(I);
3027     return true;
3028   }
3029 
3030   switch (I.getOpcode()) {
3031   case TargetOpcode::G_AND:
3032   case TargetOpcode::G_OR:
3033   case TargetOpcode::G_XOR:
3034     if (selectImpl(I, *CoverageInfo))
3035       return true;
3036     return selectG_AND_OR_XOR(I);
3037   case TargetOpcode::G_ADD:
3038   case TargetOpcode::G_SUB:
3039     if (selectImpl(I, *CoverageInfo))
3040       return true;
3041     return selectG_ADD_SUB(I);
3042   case TargetOpcode::G_UADDO:
3043   case TargetOpcode::G_USUBO:
3044   case TargetOpcode::G_UADDE:
3045   case TargetOpcode::G_USUBE:
3046     return selectG_UADDO_USUBO_UADDE_USUBE(I);
3047   case TargetOpcode::G_INTTOPTR:
3048   case TargetOpcode::G_BITCAST:
3049   case TargetOpcode::G_PTRTOINT:
3050     return selectCOPY(I);
3051   case TargetOpcode::G_CONSTANT:
3052   case TargetOpcode::G_FCONSTANT:
3053     return selectG_CONSTANT(I);
3054   case TargetOpcode::G_FNEG:
3055     if (selectImpl(I, *CoverageInfo))
3056       return true;
3057     return selectG_FNEG(I);
3058   case TargetOpcode::G_FABS:
3059     if (selectImpl(I, *CoverageInfo))
3060       return true;
3061     return selectG_FABS(I);
3062   case TargetOpcode::G_EXTRACT:
3063     return selectG_EXTRACT(I);
3064   case TargetOpcode::G_MERGE_VALUES:
3065   case TargetOpcode::G_BUILD_VECTOR:
3066   case TargetOpcode::G_CONCAT_VECTORS:
3067     return selectG_MERGE_VALUES(I);
3068   case TargetOpcode::G_UNMERGE_VALUES:
3069     return selectG_UNMERGE_VALUES(I);
3070   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3071     return selectG_BUILD_VECTOR_TRUNC(I);
3072   case TargetOpcode::G_PTR_ADD:
3073     return selectG_PTR_ADD(I);
3074   case TargetOpcode::G_IMPLICIT_DEF:
3075     return selectG_IMPLICIT_DEF(I);
3076   case TargetOpcode::G_FREEZE:
3077     return selectCOPY(I);
3078   case TargetOpcode::G_INSERT:
3079     return selectG_INSERT(I);
3080   case TargetOpcode::G_INTRINSIC:
3081     return selectG_INTRINSIC(I);
3082   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3083     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3084   case TargetOpcode::G_ICMP:
3085     if (selectG_ICMP(I))
3086       return true;
3087     return selectImpl(I, *CoverageInfo);
3088   case TargetOpcode::G_LOAD:
3089   case TargetOpcode::G_STORE:
3090   case TargetOpcode::G_ATOMIC_CMPXCHG:
3091   case TargetOpcode::G_ATOMICRMW_XCHG:
3092   case TargetOpcode::G_ATOMICRMW_ADD:
3093   case TargetOpcode::G_ATOMICRMW_SUB:
3094   case TargetOpcode::G_ATOMICRMW_AND:
3095   case TargetOpcode::G_ATOMICRMW_OR:
3096   case TargetOpcode::G_ATOMICRMW_XOR:
3097   case TargetOpcode::G_ATOMICRMW_MIN:
3098   case TargetOpcode::G_ATOMICRMW_MAX:
3099   case TargetOpcode::G_ATOMICRMW_UMIN:
3100   case TargetOpcode::G_ATOMICRMW_UMAX:
3101   case TargetOpcode::G_ATOMICRMW_FADD:
3102   case AMDGPU::G_AMDGPU_ATOMIC_INC:
3103   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
3104   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3105   case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3106     return selectG_LOAD_STORE_ATOMICRMW(I);
3107   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
3108     return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
3109   case TargetOpcode::G_SELECT:
3110     return selectG_SELECT(I);
3111   case TargetOpcode::G_TRUNC:
3112     return selectG_TRUNC(I);
3113   case TargetOpcode::G_SEXT:
3114   case TargetOpcode::G_ZEXT:
3115   case TargetOpcode::G_ANYEXT:
3116   case TargetOpcode::G_SEXT_INREG:
3117     if (selectImpl(I, *CoverageInfo))
3118       return true;
3119     return selectG_SZA_EXT(I);
3120   case TargetOpcode::G_BRCOND:
3121     return selectG_BRCOND(I);
3122   case TargetOpcode::G_GLOBAL_VALUE:
3123     return selectG_GLOBAL_VALUE(I);
3124   case TargetOpcode::G_PTRMASK:
3125     return selectG_PTRMASK(I);
3126   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3127     return selectG_EXTRACT_VECTOR_ELT(I);
3128   case TargetOpcode::G_INSERT_VECTOR_ELT:
3129     return selectG_INSERT_VECTOR_ELT(I);
3130   case TargetOpcode::G_SHUFFLE_VECTOR:
3131     return selectG_SHUFFLE_VECTOR(I);
3132   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3133   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3134     const AMDGPU::ImageDimIntrinsicInfo *Intr
3135       = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3136     assert(Intr && "not an image intrinsic with image pseudo");
3137     return selectImageIntrinsic(I, Intr);
3138   }
3139   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3140     return selectBVHIntrinsic(I);
3141   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3142     return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
3143   default:
3144     return selectImpl(I, *CoverageInfo);
3145   }
3146   return false;
3147 }
3148 
3149 InstructionSelector::ComplexRendererFns
3150 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3151   return {{
3152       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3153   }};
3154 
3155 }
3156 
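// Strip VOP3 source modifiers off the operand: e.g. for (G_FNEG (G_FABS %x))
// this returns %x with SISrcMods::NEG | SISrcMods::ABS set (the fabs is only
// folded when AllowAbs is true).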
3157 std::pair<Register, unsigned>
3158 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3159                                               bool AllowAbs) const {
3160   Register Src = Root.getReg();
3161   Register OrigSrc = Src;
3162   unsigned Mods = 0;
3163   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3164 
3165   if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
3166     Src = MI->getOperand(1).getReg();
3167     Mods |= SISrcMods::NEG;
3168     MI = getDefIgnoringCopies(Src, *MRI);
3169   }
3170 
3171   if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) {
3172     Src = MI->getOperand(1).getReg();
3173     Mods |= SISrcMods::ABS;
3174   }
3175 
3176   if (Mods != 0 &&
3177       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3178     MachineInstr *UseMI = Root.getParent();
3179 
3180     // If we looked through copies to find source modifiers on an SGPR operand,
3181     // we now have an SGPR register source. To avoid potentially violating the
3182     // constant bus restriction, we need to insert a copy to a VGPR.
3183     Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
3184     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
3185             TII.get(AMDGPU::COPY), VGPRSrc)
3186       .addReg(Src);
3187     Src = VGPRSrc;
3188   }
3189 
3190   return std::make_pair(Src, Mods);
3191 }
3192 
3193 ///
3194 /// This will select either an SGPR or VGPR operand and will save us from
3195 /// having to write an extra tablegen pattern.
3196 InstructionSelector::ComplexRendererFns
3197 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3198   return {{
3199       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3200   }};
3201 }
3202 
3203 InstructionSelector::ComplexRendererFns
3204 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3205   Register Src;
3206   unsigned Mods;
3207   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3208 
3209   return {{
3210       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3211       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3212       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3213       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3214   }};
3215 }
3216 
3217 InstructionSelector::ComplexRendererFns
3218 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3219   Register Src;
3220   unsigned Mods;
3221   std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3222 
3223   return {{
3224       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3225       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3226       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3227       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3228   }};
3229 }
3230 
3231 InstructionSelector::ComplexRendererFns
3232 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3233   return {{
3234       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3235       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3236       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
3237   }};
3238 }
3239 
3240 InstructionSelector::ComplexRendererFns
3241 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3242   Register Src;
3243   unsigned Mods;
3244   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3245 
3246   return {{
3247       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3248       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3249   }};
3250 }
3251 
3252 InstructionSelector::ComplexRendererFns
3253 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3254   Register Src;
3255   unsigned Mods;
3256   std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3257 
3258   return {{
3259       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3260       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3261   }};
3262 }
3263 
3264 InstructionSelector::ComplexRendererFns
3265 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3266   Register Reg = Root.getReg();
3267   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3268   if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3269               Def->getOpcode() == AMDGPU::G_FABS))
3270     return {};
3271   return {{
3272       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3273   }};
3274 }
3275 
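// For packed (VOP3P) operands, a whole-vector G_FNEG on v2s16 toggles both the
// low and high negate bits; abs modifiers are not available here.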
3276 std::pair<Register, unsigned>
3277 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3278   Register Src, const MachineRegisterInfo &MRI) const {
3279   unsigned Mods = 0;
3280   MachineInstr *MI = MRI.getVRegDef(Src);
3281 
3282   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3283       // It's possible to see an f32 fneg here, but unlikely.
3284       // TODO: Treat f32 fneg as only high bit.
3285       MRI.getType(Src) == LLT::vector(2, 16)) {
3286     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3287     Src = MI->getOperand(1).getReg();
3288     MI = MRI.getVRegDef(Src);
3289   }
3290 
3291   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3292 
3293   // Packed instructions do not have abs modifiers.
3294   Mods |= SISrcMods::OP_SEL_1;
3295 
3296   return std::make_pair(Src, Mods);
3297 }
3298 
3299 InstructionSelector::ComplexRendererFns
3300 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3301   MachineRegisterInfo &MRI
3302     = Root.getParent()->getParent()->getParent()->getRegInfo();
3303 
3304   Register Src;
3305   unsigned Mods;
3306   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3307 
3308   return {{
3309       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3310       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3311   }};
3312 }
3313 
3314 InstructionSelector::ComplexRendererFns
3315 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3316   Register Src;
3317   unsigned Mods;
3318   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3319   if (!isKnownNeverNaN(Src, *MRI))
3320     return None;
3321 
3322   return {{
3323       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3324       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3325   }};
3326 }
3327 
3328 InstructionSelector::ComplexRendererFns
3329 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3330   // FIXME: Handle op_sel
3331   return {{
3332       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3333       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3334   }};
3335 }
3336 
3337 InstructionSelector::ComplexRendererFns
3338 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3339   SmallVector<GEPInfo, 4> AddrInfo;
3340   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3341 
3342   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3343     return None;
3344 
3345   const GEPInfo &GEPInfo = AddrInfo[0];
3346   Optional<int64_t> EncodedImm =
3347       AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3348   if (!EncodedImm)
3349     return None;
3350 
3351   unsigned PtrReg = GEPInfo.SgprParts[0];
3352   return {{
3353     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3354     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3355   }};
3356 }
3357 
3358 InstructionSelector::ComplexRendererFns
3359 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3360   SmallVector<GEPInfo, 4> AddrInfo;
3361   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3362 
3363   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3364     return None;
3365 
3366   const GEPInfo &GEPInfo = AddrInfo[0];
3367   Register PtrReg = GEPInfo.SgprParts[0];
3368   Optional<int64_t> EncodedImm =
3369       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3370   if (!EncodedImm)
3371     return None;
3372 
3373   return {{
3374     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3375     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3376   }};
3377 }
3378 
3379 InstructionSelector::ComplexRendererFns
3380 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3381   MachineInstr *MI = Root.getParent();
3382   MachineBasicBlock *MBB = MI->getParent();
3383 
3384   SmallVector<GEPInfo, 4> AddrInfo;
3385   getAddrModeInfo(*MI, *MRI, AddrInfo);
3386 
3387   // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits;
3388   // then we could select all ptr + 32-bit offsets, not just immediate offsets.
3389   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3390     return None;
3391 
3392   const GEPInfo &GEPInfo = AddrInfo[0];
3393   // SGPR offset is unsigned.
3394   if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3395     return None;
3396 
3397   // If we make it this far, we have a load with a 32-bit immediate offset.
3398   // It is OK to select this using an SGPR offset, because we have already
3399   // failed trying to select this load into one of the _IMM variants since
3400   // the _IMM patterns are considered before the _SGPR patterns.
3401   Register PtrReg = GEPInfo.SgprParts[0];
3402   Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3403   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3404           .addImm(GEPInfo.Imm);
3405   return {{
3406     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3407     [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3408   }};
3409 }
3410 
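// Split a flat address into (base, immediate offset) when the constant offset
// is legal for the FLAT offset field on this subtarget; otherwise return the
// original address with a zero offset.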
3411 template <bool Signed>
3412 std::pair<Register, int>
3413 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3414   MachineInstr *MI = Root.getParent();
3415 
3416   auto Default = std::make_pair(Root.getReg(), 0);
3417 
3418   if (!STI.hasFlatInstOffsets())
3419     return Default;
3420 
3421   Register PtrBase;
3422   int64_t ConstOffset;
3423   std::tie(PtrBase, ConstOffset) =
3424       getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3425   if (ConstOffset == 0)
3426     return Default;
3427 
3428   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3429   if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, Signed))
3430     return Default;
3431 
3432   return std::make_pair(PtrBase, ConstOffset);
3433 }
3434 
3435 InstructionSelector::ComplexRendererFns
3436 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3437   auto PtrWithOffset = selectFlatOffsetImpl<false>(Root);
3438 
3439   return {{
3440       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3441       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3442     }};
3443 }
3444 
3445 InstructionSelector::ComplexRendererFns
3446 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3447   auto PtrWithOffset = selectFlatOffsetImpl<true>(Root);
3448 
3449   return {{
3450       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3451       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3452     }};
3453 }
3454 
3455 /// Match a zero extend from a 32-bit value to 64-bits.
3456 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3457   Register ZExtSrc;
3458   if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3459     return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3460 
3461   // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3462   const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3463   if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3464     return Register();
3465 
3466   if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3467     return Def->getOperand(1).getReg();
3468   }
3469 
3470   return Register();
3471 }
3472 
3473 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
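// Returns (saddr, voffset, offset) operands for the global_* saddr forms,
// materializing a zero voffset with V_MOV_B32 when only an SGPR base address
// is available.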
3474 InstructionSelector::ComplexRendererFns
3475 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3476   Register Addr = Root.getReg();
3477   Register PtrBase;
3478   int64_t ConstOffset;
3479   int64_t ImmOffset = 0;
3480 
3481   // Match the immediate offset first, which canonically is moved as low as
3482   // possible.
3483   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3484 
3485   if (ConstOffset != 0) {
3486     if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) {
3487       Addr = PtrBase;
3488       ImmOffset = ConstOffset;
3489     } else if (ConstOffset > 0) {
3490       auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
3491       if (!PtrBaseDef)
3492         return None;
3493 
3494       if (isSGPR(PtrBaseDef->Reg)) {
3495         // Offset is too large.
3496         //
3497         // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset)
3498         //                         + (large_offset & MaxOffset);
3499         int64_t SplitImmOffset, RemainderOffset;
3500         std::tie(SplitImmOffset, RemainderOffset)
3501           = TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true);
3502 
3503         if (isUInt<32>(RemainderOffset)) {
3504           MachineInstr *MI = Root.getParent();
3505           MachineBasicBlock *MBB = MI->getParent();
3506           Register HighBits
3507             = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3508 
3509           BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3510                   HighBits)
3511             .addImm(RemainderOffset);
3512 
3513           return {{
3514             [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); },  // saddr
3515             [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset
3516             [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
3517           }};
3518         }
3519       }
3520     }
3521   }
3522 
3523   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3524   if (!AddrDef)
3525     return None;
3526 
3527   // Match the variable offset.
3528   if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) {
3529     // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
3530     // drop this.
3531     if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
3532         AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT)
3533       return None;
3534 
3535     // It's cheaper to materialize a single 32-bit zero for vaddr than the two
3536     // moves required to copy a 64-bit SGPR to VGPR.
3537     const Register SAddr = AddrDef->Reg;
3538     if (!isSGPR(SAddr))
3539       return None;
3540 
3541     MachineInstr *MI = Root.getParent();
3542     MachineBasicBlock *MBB = MI->getParent();
3543     Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3544 
3545     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3546             VOffset)
3547       .addImm(0);
3548 
3549     return {{
3550         [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); },    // saddr
3551         [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },  // voffset
3552         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3553     }};
3554   }
3555 
3556   // Look through the SGPR->VGPR copy.
3557   Register SAddr =
3558     getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3559   if (!SAddr || !isSGPR(SAddr))
3560     return None;
3561 
3562   Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3563 
3564   // It's possible voffset is an SGPR here, but the copy to VGPR will be
3565   // inserted later.
3566   Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset);
3567   if (!VOffset)
3568     return None;
3569 
3570   return {{[=](MachineInstrBuilder &MIB) { // saddr
3571              MIB.addReg(SAddr);
3572            },
3573            [=](MachineInstrBuilder &MIB) { // voffset
3574              MIB.addReg(VOffset);
3575            },
3576            [=](MachineInstrBuilder &MIB) { // offset
3577              MIB.addImm(ImmOffset);
3578            }}};
3579 }
3580 
3581 InstructionSelector::ComplexRendererFns
3582 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
3583   Register Addr = Root.getReg();
3584   Register PtrBase;
3585   int64_t ConstOffset;
3586   int64_t ImmOffset = 0;
3587 
3588   // Match the immediate offset first, which canonically is moved as low as
3589   // possible.
3590   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3591 
3592   if (ConstOffset != 0 &&
3593       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
3594     Addr = PtrBase;
3595     ImmOffset = ConstOffset;
3596   }
3597 
3598   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3599   if (!AddrDef)
3600     return None;
3601 
3602   if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3603     int FI = AddrDef->MI->getOperand(1).getIndex();
3604     return {{
3605         [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
3606         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3607     }};
3608   }
3609 
3610   Register SAddr = AddrDef->Reg;
3611 
3612   if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3613     Register LHS = AddrDef->MI->getOperand(1).getReg();
3614     Register RHS = AddrDef->MI->getOperand(2).getReg();
3615     auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
3616     auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
3617 
3618     if (LHSDef && RHSDef &&
3619         LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
3620         isSGPR(RHSDef->Reg)) {
3621       int FI = LHSDef->MI->getOperand(1).getIndex();
3622       MachineInstr &I = *Root.getParent();
3623       MachineBasicBlock *BB = I.getParent();
3624       const DebugLoc &DL = I.getDebugLoc();
3625       SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3626 
3627       BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), SAddr)
3628         .addFrameIndex(FI)
3629         .addReg(RHSDef->Reg);
3630     }
3631   }
3632 
3633   if (!isSGPR(SAddr))
3634     return None;
3635 
3636   return {{
3637       [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
3638       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3639   }};
3640 }
3641 
3642 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3643   auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3644   return PSV && PSV->isStack();
3645 }
3646 
3647 InstructionSelector::ComplexRendererFns
3648 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3649   MachineInstr *MI = Root.getParent();
3650   MachineBasicBlock *MBB = MI->getParent();
3651   MachineFunction *MF = MBB->getParent();
3652   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3653 
3654   int64_t Offset = 0;
3655   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3656       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3657     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3658 
3659     // TODO: Should this be inside the render function? The iterator seems to
3660     // move.
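    // Split the constant address: the bits above 4095 are materialized into a
    // VGPR used as vaddr, and the low 12 bits become the immediate offset.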
3661     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3662             HighBits)
3663       .addImm(Offset & ~4095);
3664 
3665     return {{[=](MachineInstrBuilder &MIB) { // rsrc
3666                MIB.addReg(Info->getScratchRSrcReg());
3667              },
3668              [=](MachineInstrBuilder &MIB) { // vaddr
3669                MIB.addReg(HighBits);
3670              },
3671              [=](MachineInstrBuilder &MIB) { // soffset
3672                // Use constant zero for soffset and rely on eliminateFrameIndex
3673                // to choose the appropriate frame register if need be.
3674                MIB.addImm(0);
3675              },
3676              [=](MachineInstrBuilder &MIB) { // offset
3677                MIB.addImm(Offset & 4095);
3678              }}};
3679   }
3680 
3681   assert(Offset == 0 || Offset == -1);
3682 
3683   // Try to fold a frame index directly into the MUBUF vaddr field, and any
3684   // offsets.
3685   Optional<int> FI;
3686   Register VAddr = Root.getReg();
3687   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3688     if (isBaseWithConstantOffset(Root, *MRI)) {
3689       const MachineOperand &LHS = RootDef->getOperand(1);
3690       const MachineOperand &RHS = RootDef->getOperand(2);
3691       const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3692       const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3693       if (LHSDef && RHSDef) {
3694         int64_t PossibleOffset =
3695             RHSDef->getOperand(1).getCImm()->getSExtValue();
3696         if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3697             (!STI.privateMemoryResourceIsRangeChecked() ||
3698              KnownBits->signBitIsZero(LHS.getReg()))) {
3699           if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3700             FI = LHSDef->getOperand(1).getIndex();
3701           else
3702             VAddr = LHS.getReg();
3703           Offset = PossibleOffset;
3704         }
3705       }
3706     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3707       FI = RootDef->getOperand(1).getIndex();
3708     }
3709   }
3710 
3711   return {{[=](MachineInstrBuilder &MIB) { // rsrc
3712              MIB.addReg(Info->getScratchRSrcReg());
3713            },
3714            [=](MachineInstrBuilder &MIB) { // vaddr
3715              if (FI.hasValue())
3716                MIB.addFrameIndex(FI.getValue());
3717              else
3718                MIB.addReg(VAddr);
3719            },
3720            [=](MachineInstrBuilder &MIB) { // soffset
3721              // Use constant zero for soffset and rely on eliminateFrameIndex
3722              // to choose the appropriate frame register if need be.
3723              MIB.addImm(0);
3724            },
3725            [=](MachineInstrBuilder &MIB) { // offset
3726              MIB.addImm(Offset);
3727            }}};
3728 }
3729 
3730 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3731                                                 int64_t Offset) const {
3732   if (!isUInt<16>(Offset))
3733     return false;
3734 
3735   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3736     return true;
3737 
3738   // On Southern Islands, instructions with a negative base value and an offset
3739   // don't seem to work.
3740   return KnownBits->signBitIsZero(Base);
3741 }
3742 
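// For DS read2/write2 forms the two offsets are encoded in units of the
// element size, so each must be a multiple of Size and fit in 8 bits after
// division.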
3743 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
3744                                                  int64_t Offset1,
3745                                                  unsigned Size) const {
3746   if (Offset0 % Size != 0 || Offset1 % Size != 0)
3747     return false;
3748   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
3749     return false;
3750 
3751   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3752     return true;
3753 
3754   // On Southern Islands, instructions with a negative base value and an offset
3755   // don't seem to work.
3756   return KnownBits->signBitIsZero(Base);
3757 }
3758 
3759 InstructionSelector::ComplexRendererFns
3760 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3761     MachineOperand &Root) const {
3762   MachineInstr *MI = Root.getParent();
3763   MachineBasicBlock *MBB = MI->getParent();
3764 
3765   int64_t Offset = 0;
3766   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3767       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3768     return {};
3769 
3770   const MachineFunction *MF = MBB->getParent();
3771   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3772   const MachineMemOperand *MMO = *MI->memoperands_begin();
3773   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3774 
3775   return {{
3776       [=](MachineInstrBuilder &MIB) { // rsrc
3777         MIB.addReg(Info->getScratchRSrcReg());
3778       },
3779       [=](MachineInstrBuilder &MIB) { // soffset
3780         if (isStackPtrRelative(PtrInfo))
3781           MIB.addReg(Info->getStackPtrOffsetReg());
3782         else
3783           MIB.addImm(0);
3784       },
3785       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3786   }};
3787 }
3788 
3789 std::pair<Register, unsigned>
3790 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3791   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3792   if (!RootDef)
3793     return std::make_pair(Root.getReg(), 0);
3794 
3795   int64_t ConstAddr = 0;
3796 
3797   Register PtrBase;
3798   int64_t Offset;
3799   std::tie(PtrBase, Offset) =
3800     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3801 
3802   if (Offset) {
3803     if (isDSOffsetLegal(PtrBase, Offset)) {
3804       // (add n0, c0)
3805       return std::make_pair(PtrBase, Offset);
3806     }
3807   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3808     // TODO
3809 
3810 
3811   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3812     // TODO
3813 
3814   }
3815 
3816   return std::make_pair(Root.getReg(), 0);
3817 }
3818 
3819 InstructionSelector::ComplexRendererFns
3820 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3821   Register Reg;
3822   unsigned Offset;
3823   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3824   return {{
3825       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3826       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3827     }};
3828 }
3829 
3830 InstructionSelector::ComplexRendererFns
3831 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3832   return selectDSReadWrite2(Root, 4);
3833 }
3834 
3835 InstructionSelector::ComplexRendererFns
3836 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
3837   return selectDSReadWrite2(Root, 8);
3838 }
3839 
3840 InstructionSelector::ComplexRendererFns
3841 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
3842                                               unsigned Size) const {
3843   Register Reg;
3844   unsigned Offset;
3845   std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
3846   return {{
3847       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3848       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3849       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3850     }};
3851 }
3852 
3853 std::pair<Register, unsigned>
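// The base offset is returned in element units; the renderer above emits it as
// offset0 and offset0 + 1 to address two consecutive elements.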
3854 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
3855                                                   unsigned Size) const {
3856   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3857   if (!RootDef)
3858     return std::make_pair(Root.getReg(), 0);
3859 
3860   int64_t ConstAddr = 0;
3861 
3862   Register PtrBase;
3863   int64_t Offset;
3864   std::tie(PtrBase, Offset) =
3865     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3866 
3867   if (Offset) {
3868     int64_t OffsetValue0 = Offset;
3869     int64_t OffsetValue1 = Offset + Size;
3870     if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
3871       // (add n0, c0)
3872       return std::make_pair(PtrBase, OffsetValue0 / Size);
3873     }
3874   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3875     // TODO
3876 
3877   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3878     // TODO
3879 
3880   }
3881 
3882   return std::make_pair(Root.getReg(), 0);
3883 }
3884 
3885 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3886 /// the base value with the constant offset. There may be intervening copies
3887 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
3888 /// not match the pattern.
3889 std::pair<Register, int64_t>
3890 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3891   Register Root, const MachineRegisterInfo &MRI) const {
3892   MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
3893   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3894     return {Root, 0};
3895 
3896   MachineOperand &RHS = RootI->getOperand(2);
3897   Optional<ValueAndVReg> MaybeOffset
3898     = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3899   if (!MaybeOffset)
3900     return {Root, 0};
3901   return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
3902 }
3903 
3904 static void addZeroImm(MachineInstrBuilder &MIB) {
3905   MIB.addImm(0);
3906 }
3907 
3908 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3909 /// BasePtr is not valid, a null base pointer will be used.
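/// The 128-bit descriptor is assembled with sub0_sub1 holding the base pointer
/// (or an all-zero pointer) and sub2_sub3 holding \p FormatLo and \p FormatHi.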
3910 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3911                           uint32_t FormatLo, uint32_t FormatHi,
3912                           Register BasePtr) {
3913   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3914   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3915   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3916   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3917 
3918   B.buildInstr(AMDGPU::S_MOV_B32)
3919     .addDef(RSrc2)
3920     .addImm(FormatLo);
3921   B.buildInstr(AMDGPU::S_MOV_B32)
3922     .addDef(RSrc3)
3923     .addImm(FormatHi);
3924 
3925   // Build the register half holding the constants before building the full
3926   // 128-bit register. If we are building multiple resource descriptors, this
3927   // will allow CSEing of the 2-component register.
3928   B.buildInstr(AMDGPU::REG_SEQUENCE)
3929     .addDef(RSrcHi)
3930     .addReg(RSrc2)
3931     .addImm(AMDGPU::sub0)
3932     .addReg(RSrc3)
3933     .addImm(AMDGPU::sub1);
3934 
3935   Register RSrcLo = BasePtr;
3936   if (!BasePtr) {
3937     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3938     B.buildInstr(AMDGPU::S_MOV_B64)
3939       .addDef(RSrcLo)
3940       .addImm(0);
3941   }
3942 
3943   B.buildInstr(AMDGPU::REG_SEQUENCE)
3944     .addDef(RSrc)
3945     .addReg(RSrcLo)
3946     .addImm(AMDGPU::sub0_sub1)
3947     .addReg(RSrcHi)
3948     .addImm(AMDGPU::sub2_sub3);
3949 
3950   return RSrc;
3951 }
3952 
3953 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3954                                 const SIInstrInfo &TII, Register BasePtr) {
3955   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3956 
3957   // FIXME: Why are half the "default" bits ignored based on the addressing
3958   // mode?
3959   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3960 }
3961 
3962 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3963                                const SIInstrInfo &TII, Register BasePtr) {
3964   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3965 
3966   // FIXME: Why are half the "default" bits ignored based on the addressing
3967   // mode?
3968   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3969 }
3970 
3971 AMDGPUInstructionSelector::MUBUFAddressData
3972 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3973   MUBUFAddressData Data;
3974   Data.N0 = Src;
3975 
3976   Register PtrBase;
3977   int64_t Offset;
3978 
3979   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3980   if (isUInt<32>(Offset)) {
3981     Data.N0 = PtrBase;
3982     Data.Offset = Offset;
3983   }
3984 
3985   if (MachineInstr *InputAdd
3986       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3987     Data.N2 = InputAdd->getOperand(1).getReg();
3988     Data.N3 = InputAdd->getOperand(2).getReg();
3989 
3990     // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted.
3991     // FIXME: Don't know that this was defined by operand 0.
3992     //
3993     // TODO: Remove this when we have copy folding optimizations after
3994     // RegBankSelect.
3995     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
3996     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
3997   }
3998 
3999   return Data;
4000 }
4001 
4002 /// Return if the addr64 mubuf mode should be used for the given address.
4003 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
4004   // (ptr_add N2, N3) -> addr64, or
4005   // (ptr_add (ptr_add N2, N3), C1) -> addr64
4006   if (Addr.N2)
4007     return true;
4008 
4009   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
4010   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
4011 }
4012 
4013 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
4014 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
4015 /// component.
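/// For example, an offset of 8192 does not fit in the 12-bit MUBUF immediate
/// field, so it is moved into \p SOffset via S_MOV_B32 and \p ImmOffset is
/// reset to 0.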
4016 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
4017   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
4018   if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
4019     return;
4020 
4021   // Illegal offset, store it in soffset.
4022   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4023   B.buildInstr(AMDGPU::S_MOV_B32)
4024     .addDef(SOffset)
4025     .addImm(ImmOffset);
4026   ImmOffset = 0;
4027 }
4028 
4029 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
4030   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
4031   Register &SOffset, int64_t &Offset) const {
4032   // FIXME: Predicates should stop this from reaching here.
4033   // addr64 bit was removed for volcanic islands.
4034   if (!STI.hasAddr64() || STI.useFlatForGlobal())
4035     return false;
4036 
4037   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4038   if (!shouldUseAddr64(AddrData))
4039     return false;
4040 
4041   Register N0 = AddrData.N0;
4042   Register N2 = AddrData.N2;
4043   Register N3 = AddrData.N3;
4044   Offset = AddrData.Offset;
4045 
4046   // Base pointer for the SRD.
4047   Register SRDPtr;
4048 
4049   if (N2) {
4050     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4051       assert(N3);
4052       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4053         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
4054         // addr64, and construct the default resource from a 0 address.
4055         VAddr = N0;
4056       } else {
4057         SRDPtr = N3;
4058         VAddr = N2;
4059       }
4060     } else {
4061       // N2 is not divergent.
4062       SRDPtr = N2;
4063       VAddr = N3;
4064     }
4065   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4066     // Use the default null pointer in the resource
4067     VAddr = N0;
4068   } else {
4069     // N0 -> offset, or
4070     // (N0 + C1) -> offset
4071     SRDPtr = N0;
4072   }
4073 
4074   MachineIRBuilder B(*Root.getParent());
4075   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
4076   splitIllegalMUBUFOffset(B, SOffset, Offset);
4077   return true;
4078 }
4079 
4080 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
4081   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
4082   int64_t &Offset) const {
4083 
4084   // FIXME: Pattern should not reach here.
4085   if (STI.useFlatForGlobal())
4086     return false;
4087 
4088   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4089   if (shouldUseAddr64(AddrData))
4090     return false;
4091 
4092   // N0 -> offset, or
4093   // (N0 + C1) -> offset
4094   Register SRDPtr = AddrData.N0;
4095   Offset = AddrData.Offset;
4096 
4097   // TODO: Look through extensions for 32-bit soffset.
4098   MachineIRBuilder B(*Root.getParent());
4099 
4100   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
4101   splitIllegalMUBUFOffset(B, SOffset, Offset);
4102   return true;
4103 }
4104 
4105 InstructionSelector::ComplexRendererFns
4106 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
4107   Register VAddr;
4108   Register RSrcReg;
4109   Register SOffset;
4110   int64_t Offset = 0;
4111 
4112   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4113     return {};
4114 
4115   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4116   // pattern.
4117   return {{
4118       [=](MachineInstrBuilder &MIB) {  // rsrc
4119         MIB.addReg(RSrcReg);
4120       },
4121       [=](MachineInstrBuilder &MIB) { // vaddr
4122         MIB.addReg(VAddr);
4123       },
4124       [=](MachineInstrBuilder &MIB) { // soffset
4125         if (SOffset)
4126           MIB.addReg(SOffset);
4127         else
4128           MIB.addImm(0);
4129       },
4130       [=](MachineInstrBuilder &MIB) { // offset
4131         MIB.addImm(Offset);
4132       },
4133       addZeroImm, //  glc
4134       addZeroImm, //  slc
4135       addZeroImm, //  tfe
4136       addZeroImm, //  dlc
4137       addZeroImm  //  swz
4138     }};
4139 }
4140 
4141 InstructionSelector::ComplexRendererFns
4142 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
4143   Register RSrcReg;
4144   Register SOffset;
4145   int64_t Offset = 0;
4146 
4147   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4148     return {};
4149 
4150   return {{
4151       [=](MachineInstrBuilder &MIB) {  // rsrc
4152         MIB.addReg(RSrcReg);
4153       },
4154       [=](MachineInstrBuilder &MIB) { // soffset
4155         if (SOffset)
4156           MIB.addReg(SOffset);
4157         else
4158           MIB.addImm(0);
4159       },
4160       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4161       addZeroImm, //  glc
4162       addZeroImm, //  slc
4163       addZeroImm, //  tfe
4164       addZeroImm, //  dlc
4165       addZeroImm  //  swz
4166     }};
4167 }
4168 
4169 InstructionSelector::ComplexRendererFns
4170 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
4171   Register VAddr;
4172   Register RSrcReg;
4173   Register SOffset;
4174   int64_t Offset = 0;
4175 
4176   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4177     return {};
4178 
4179   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4180   // pattern.
4181   return {{
4182       [=](MachineInstrBuilder &MIB) {  // rsrc
4183         MIB.addReg(RSrcReg);
4184       },
4185       [=](MachineInstrBuilder &MIB) { // vaddr
4186         MIB.addReg(VAddr);
4187       },
4188       [=](MachineInstrBuilder &MIB) { // soffset
4189         if (SOffset)
4190           MIB.addReg(SOffset);
4191         else
4192           MIB.addImm(0);
4193       },
4194       [=](MachineInstrBuilder &MIB) { // offset
4195         MIB.addImm(Offset);
4196       },
4197       addZeroImm //  slc
4198     }};
4199 }
4200 
4201 InstructionSelector::ComplexRendererFns
4202 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
4203   Register RSrcReg;
4204   Register SOffset;
4205   int64_t Offset = 0;
4206 
4207   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4208     return {};
4209 
4210   return {{
4211       [=](MachineInstrBuilder &MIB) {  // rsrc
4212         MIB.addReg(RSrcReg);
4213       },
4214       [=](MachineInstrBuilder &MIB) { // soffset
4215         if (SOffset)
4216           MIB.addReg(SOffset);
4217         else
4218           MIB.addImm(0);
4219       },
4220       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4221       addZeroImm //  slc
4222     }};
4223 }
4224 
4225 /// Get an immediate that must be 32-bits, and treated as zero extended.
4226 static Optional<uint64_t> getConstantZext32Val(Register Reg,
4227                                                const MachineRegisterInfo &MRI) {
4228   // getConstantVRegVal sexts any values, so see if that matters.
4229   Optional<int64_t> OffsetVal = getConstantVRegSExtVal(Reg, MRI);
4230   if (!OffsetVal || !isInt<32>(*OffsetVal))
4231     return None;
4232   return Lo_32(*OffsetVal);
4233 }
4234 
4235 InstructionSelector::ComplexRendererFns
4236 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
4237   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4238   if (!OffsetVal)
4239     return {};
4240 
4241   Optional<int64_t> EncodedImm =
4242       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
4243   if (!EncodedImm)
4244     return {};
4245 
4246   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
4247 }
4248 
4249 InstructionSelector::ComplexRendererFns
4250 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
4251   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
4252 
4253   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4254   if (!OffsetVal)
4255     return {};
4256 
4257   Optional<int64_t> EncodedImm
4258     = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
4259   if (!EncodedImm)
4260     return {};
4261 
4262   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
4263 }
4264 
4265 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
4266                                                  const MachineInstr &MI,
4267                                                  int OpIdx) const {
4268   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4269          "Expected G_CONSTANT");
4270   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
4271 }
4272 
4273 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
4274                                                 const MachineInstr &MI,
4275                                                 int OpIdx) const {
4276   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4277          "Expected G_CONSTANT");
4278   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
4279 }
4280 
4281 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
4282                                                  const MachineInstr &MI,
4283                                                  int OpIdx) const {
4284   assert(OpIdx == -1);
4285 
4286   const MachineOperand &Op = MI.getOperand(1);
4287   if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
4288     MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
4289   else {
4290     assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
4291     MIB.addImm(Op.getCImm()->getSExtValue());
4292   }
4293 }
4294 
4295 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
4296                                                 const MachineInstr &MI,
4297                                                 int OpIdx) const {
4298   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4299          "Expected G_CONSTANT");
4300   MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
4301 }
4302 
4303 /// This only really exists to satisfy DAG type checking machinery, so is a
4304 /// no-op here.
4305 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
4306                                                 const MachineInstr &MI,
4307                                                 int OpIdx) const {
4308   MIB.addImm(MI.getOperand(OpIdx).getImm());
4309 }
4310 
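// The buffer intrinsic cache policy operand packs several bits into a single
// immediate; the renderers below unpack bit 0 (glc), bit 1 (slc), bit 2 (dlc)
// and bit 3 (swz), matching the shifts in each function.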
4311 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
4312                                                  const MachineInstr &MI,
4313                                                  int OpIdx) const {
4314   assert(OpIdx >= 0 && "expected to match an immediate operand");
4315   MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
4316 }
4317 
4318 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
4319                                                  const MachineInstr &MI,
4320                                                  int OpIdx) const {
4321   assert(OpIdx >= 0 && "expected to match an immediate operand");
4322   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
4323 }
4324 
4325 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
4326                                                  const MachineInstr &MI,
4327                                                  int OpIdx) const {
4328   assert(OpIdx >= 0 && "expected to match an immediate operand");
4329   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
4330 }
4331 
4332 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
4333                                                  const MachineInstr &MI,
4334                                                  int OpIdx) const {
4335   assert(OpIdx >= 0 && "expected to match an immediate operand");
4336   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
4337 }
4338 
4339 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
4340                                                  const MachineInstr &MI,
4341                                                  int OpIdx) const {
4342   MIB.addFrameIndex(MI.getOperand(1).getIndex());
4343 }
4344 
4345 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
4346   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
4347 }
4348 
4349 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
4350   return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
4351 }
4352 
4353 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
4354   return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
4355 }
4356 
4357 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
4358   return TII.isInlineConstant(Imm);
4359 }
4360