//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {

static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16 bits of a dword.
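// For example, both of the following forms (after stripping bitcasts) are
// recognized, with Out set to x in each case:
//   (extract_vector_elt v2i16:x, 1)
//   (trunc (srl i32:x, 16))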
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

// Look through operations that obscure just looking at the low 16 bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (Idx->isZero() && In.getValueSizeInBits() <= 32)
        return In.getOperand(0);
    }
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

}  // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
                                        CodeGenOpt::Level OptLevel) {
  return new AMDGPUDAGToDAGISel(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(
    TargetMachine *TM /*= nullptr*/,
    CodeGenOpt::Level OptLevel /*= CodeGenOpt::Default*/)
    : SelectionDAGISel(*TM, OptLevel) {
  EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction());
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LDEXP:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}

void AMDGPUDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<LegacyDivergenceAnalysis>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISel::getAnalysisUsage(AU);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
  if (TM.Options.NoNaNsFPMath)
    return true;

  // TODO: Move into isKnownNeverNaN
  if (N->getFlags().hasNoNaNs())
    return true;

  return CurDAG->isKnownNeverNaN(N);
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
                                           bool Negated) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (Negated) {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(-C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());

  } else {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
  }

  return false;
}

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                          unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.OpInfo[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                              SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
    *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

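// For LDS (local) accesses on subtargets that still require it, M0 is
// initialized to -1 (all ones), which as far as this editor understands
// effectively disables the M0-based limit on LDS addressing; for region (GDS)
// accesses M0 must instead hold the GDS size.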
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

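// Materialize a 64-bit immediate as a pair of S_MOV_B32s combined with a
// REG_SEQUENCE. For example, Imm = 0x100000001 would become (a sketch):
//   %lo = S_MOV_B32 1
//   %hi = S_MOV_B32 1
//   %r  = REG_SEQUENCE SReg_64, %lo, sub0, %hi, sub1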
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}

void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                  "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq) {
    SelectCode(N);
    return;
  }
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return;   // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
       Opc == ISD::ATOMIC_LOAD_FADD ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom-lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
    else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.
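    // For example (a sketch): for offset = 8 and width = 5, the vector BFE
    // takes 8 and 5 as two separate operands, whereas getBFE32 packs them into
    // the single scalar operand (5 << 16) | 8 = 0x50008 for S_BFE_I32 /
    // S_BFE_U32.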

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FMAD:
  case ISD::FMA:
    SelectFMAD_FMA(N);
    return;
  case AMDGPUISD::ATOMIC_CMP_SWAP:
    SelectATOMIC_CMP_SWAP(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

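// A shift only reads log2(bitwidth) bits of its amount operand, so a mask that
// preserves at least that many low bits is a no-op. For example, with
// ShAmtBits == 5 (a 32-bit shift), the `and` in
//   %m = and i32 %amt, 31
//   %r = shl i32 %x, %m
// is unneeded, since 31 has five trailing ones.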
bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
  if (RHS.countTrailingOnes() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countTrailingOnes() >= ShAmtBits;
}

static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // Because we split 64-bit `or` earlier, the pattern to match is
    // complicated, i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that the split base halves (Lo and Hi) are extracted from the
      // same vector.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISel::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
            (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
    AMDGPU::S_MOV_B32, DL, MVT::i32,
    CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle addcarry/subcarry
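// For example (a sketch of the uniform i64 case), (addc i64:x, i64:y) expands
// into a 32-bit add / add-with-carry pair recombined with a REG_SEQUENCE:
//   %lo = S_ADD_U32  x.sub0, y.sub0   // sets SCC
//   %hi = S_ADDC_U32 x.sub1, y.sub1   // consumes SCC
//   %r  = REG_SEQUENCE SReg_64, %lo, sub0, %hi, sub1
// The divergent case uses the VALU forms V_ADD_CO_U32 / V_ADDC_U32 instead.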
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
                                                   : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                   : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have an
  // unsigned carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

  for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  //  src0_modifiers, src0,  src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  //    src0_modifiers, src0,  src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
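// For example (a sketch): (umul_lohi x, y) is selected as a multiply-add with
// a zero addend, and the two halves are split out of the 64-bit result:
//   %mad = V_MAD_U64_U32 x, y, 0
//   %lo  = EXTRACT_SUBREG %mad, sub0
//   %hi  = EXTRACT_SUBREG %mad, sub1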
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}

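// DS instructions encode their offset as a 16-bit unsigned immediate, so for
// example a byte offset of 65535 can be folded but 65536 cannot. On targets
// without a usable DS offset, the base must also have a known-zero sign bit.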
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                 DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

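// read2/write2 variants encode two 8-bit offsets in units of the element
// size. For example, with Size == 4, byte offsets 40 and 44 are encoded as
// offset0 = 10 and offset1 = 11, and the largest reachable byte offset is
// 255 * 4 = 1020.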
bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

// TODO: If the offset is too big, put the low 16 bits into the offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}

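// For example (a sketch): a 64-bit, 4-byte-aligned LDS load from base + 40
// could select to
//   ds_read2_b32 v[0:1], vbase offset0:10 offset1:11
// loading base + 40 and base + 44 in a single instruction.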
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffset2Legal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instructions
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
    return true;
  }

  if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // The addr64 bit was removed for Volcanic Islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::make_pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
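      // Split the constant into a 4096-byte-aligned base, materialized into
      // vaddr, plus a 12-bit immediate offset; e.g. (a sketch) Imm = 0x12345
      // yields HighBits = 0x12000 and ImmOffset = 0x345.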
      SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
        AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow.  If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  return true;
}

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto RC =
      TRI.getPhysRegClass(cast<RegisterSDNode>(Val.getOperand(1))->getReg());
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    APInt::getAllOnes(32).getZExtValue(); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}

1508 // Find a load or store from the corresponding pattern root.
1509 // Roots may be build_vector, bitconvert or their combinations.
1510 static MemSDNode* findMemSDNode(SDNode *N) {
1511   N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
1512   if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1513     return MN;
1514   assert(isa<BuildVectorSDNode>(N));
1515   for (SDValue V : N->op_values())
1516     if (MemSDNode *MN =
1517           dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1518       return MN;
1519   llvm_unreachable("cannot find MemSDNode in the pattern!");
1520 }
1521 
1522 bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1523                                               SDValue &VAddr, SDValue &Offset,
1524                                               uint64_t FlatVariant) const {
1525   int64_t OffsetVal = 0;
1526 
1527   unsigned AS = findMemSDNode(N)->getAddressSpace();
1528 
1529   bool CanHaveFlatSegmentOffsetBug =
1530       Subtarget->hasFlatSegmentOffsetBug() &&
1531       FlatVariant == SIInstrFlags::FLAT &&
1532       (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
1533 
1534   if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1535     SDValue N0, N1;
1536     if (isBaseWithConstantOffset64(Addr, N0, N1)) {
1537       int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1538 
1539       const SIInstrInfo *TII = Subtarget->getInstrInfo();
1540       if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1541         Addr = N0;
1542         OffsetVal = COffsetVal;
1543       } else {
1544         // If the offset doesn't fit, put the low bits into the offset field and
1545         // add the rest.
1546         //
1547         // For a FLAT instruction the hardware decides whether to access
1548         // global/scratch/shared memory based on the high bits of vaddr,
1549         // ignoring the offset field, so we have to ensure that when we add
1550         // remainder to vaddr it still points into the same underlying object.
1551         // The easiest way to do that is to make sure that we split the offset
1552         // into two pieces that are both >= 0 or both <= 0.
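             // E.g. with an unsigned 12-bit immediate field, an offset of
             // 0x1234 would split into OffsetVal = 0x234 and RemainderOffset =
             // 0x1000 (illustrative; the real field width and signedness
             // depend on the subtarget and FlatVariant).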
1553 
1554         SDLoc DL(N);
1555         uint64_t RemainderOffset;
1556 
1557         std::tie(OffsetVal, RemainderOffset) =
1558             TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1559 
1560         SDValue AddOffsetLo =
1561             getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1562         SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1563 
1564         if (Addr.getValueType().getSizeInBits() == 32) {
1565           SmallVector<SDValue, 3> Opnds;
1566           Opnds.push_back(N0);
1567           Opnds.push_back(AddOffsetLo);
1568           unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1569           if (Subtarget->hasAddNoCarry()) {
1570             AddOp = AMDGPU::V_ADD_U32_e64;
1571             Opnds.push_back(Clamp);
1572           }
1573           Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1574         } else {
1575           // TODO: Should this try to use a scalar add pseudo if the base address
1576           // is uniform and saddr is usable?
1577           SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1578           SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1579 
1580           SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1581                                                 DL, MVT::i32, N0, Sub0);
1582           SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1583                                                 DL, MVT::i32, N0, Sub1);
1584 
1585           SDValue AddOffsetHi =
1586               getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1587 
1588           SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1589 
1590           SDNode *Add =
1591               CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1592                                      {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1593 
1594           SDNode *Addc = CurDAG->getMachineNode(
1595               AMDGPU::V_ADDC_U32_e64, DL, VTs,
1596               {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1597 
1598           SDValue RegSequenceArgs[] = {
1599               CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1600               SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1601 
1602           Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1603                                                 MVT::i64, RegSequenceArgs),
1604                          0);
1605         }
1606       }
1607     }
1608   }
1609 
1610   VAddr = Addr;
1611   Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
1612   return true;
1613 }
1614 
1615 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1616                                           SDValue &VAddr,
1617                                           SDValue &Offset) const {
1618   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1619 }
1620 
1621 bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1622                                             SDValue &VAddr,
1623                                             SDValue &Offset) const {
1624   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1625 }
1626 
1627 bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1628                                              SDValue &VAddr,
1629                                              SDValue &Offset) const {
1630   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1631                               SIInstrFlags::FlatScratch);
1632 }
1633 
1634 // If this matches zero_extend i32:x, return x
1635 static SDValue matchZExtFromI32(SDValue Op) {
1636   if (Op.getOpcode() != ISD::ZERO_EXTEND)
1637     return SDValue();
1638 
1639   SDValue ExtSrc = Op.getOperand(0);
1640   return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1641 }
1642 
1643 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1644 bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1645                                            SDValue Addr,
1646                                            SDValue &SAddr,
1647                                            SDValue &VOffset,
1648                                            SDValue &Offset) const {
1649   int64_t ImmOffset = 0;
1650 
1651   // Match the immediate offset first, which canonically is moved as low as
1652   // possible.
1653 
1654   SDValue LHS, RHS;
1655   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1656     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1657     const SIInstrInfo *TII = Subtarget->getInstrInfo();
1658 
1659     if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1660                                SIInstrFlags::FlatGlobal)) {
1661       Addr = LHS;
1662       ImmOffset = COffsetVal;
1663     } else if (!LHS->isDivergent()) {
1664       if (COffsetVal > 0) {
1665         SDLoc SL(N);
1666         // saddr + large_offset -> saddr +
1667         //                         (voffset = large_offset & ~MaxOffset) +
1668         //                         (large_offset & MaxOffset);
1669         int64_t SplitImmOffset, RemainderOffset;
1670         std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1671             COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1672 
1673         if (isUInt<32>(RemainderOffset)) {
1674           SDNode *VMov = CurDAG->getMachineNode(
1675               AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1676               CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1677           VOffset = SDValue(VMov, 0);
1678           SAddr = LHS;
1679           Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1680           return true;
1681         }
1682       }
1683 
1684       // We are adding a 64-bit SGPR and a constant. If the constant bus
1685       // limit is 1, we would need 1 or 2 extra moves for each half of the
1686       // constant, so it is better to do a scalar add and then issue a single
1687       // VALU instruction to materialize zero. Otherwise it takes fewer
1688       // instructions to perform VALU adds with immediates or inline literals.
1689       unsigned NumLiterals =
1690           !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
1691           !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
1692       if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1693         return false;
1694     }
1695   }
1696 
1697   // Match the variable offset.
1698   if (Addr.getOpcode() == ISD::ADD) {
1699     LHS = Addr.getOperand(0);
1700     RHS = Addr.getOperand(1);
1701 
1702     if (!LHS->isDivergent()) {
1703       // add (i64 sgpr), (zero_extend (i32 vgpr))
1704       if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1705         SAddr = LHS;
1706         VOffset = ZextRHS;
1707       }
1708     }
1709 
1710     if (!SAddr && !RHS->isDivergent()) {
1711       // add (zero_extend (i32 vgpr)), (i64 sgpr)
1712       if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1713         SAddr = RHS;
1714         VOffset = ZextLHS;
1715       }
1716     }
1717 
1718     if (SAddr) {
1719       Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1720       return true;
1721     }
1722   }
1723 
1724   if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1725       isa<ConstantSDNode>(Addr))
1726     return false;
1727 
1728   // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1729   // moves required to copy a 64-bit SGPR to VGPR.
1730   SAddr = Addr;
1731   SDNode *VMov =
1732       CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1733                              CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1734   VOffset = SDValue(VMov, 0);
1735   Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1736   return true;
1737 }
1738 
1739 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1740   if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1741     SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1742   } else if (SAddr.getOpcode() == ISD::ADD &&
1743              isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1744     // Materialize this into a scalar move for scalar address to avoid
1745     // readfirstlane.
1746     auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1747     SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1748                                               FI->getValueType(0));
1749     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1750                                            MVT::i32, TFI, SAddr.getOperand(1)),
1751                     0);
1752   }
1753 
1754   return SAddr;
1755 }
1756 
1757 // Match (32-bit SGPR base) + sext(imm offset)
1758 bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1759                                             SDValue &SAddr,
1760                                             SDValue &Offset) const {
1761   if (Addr->isDivergent())
1762     return false;
1763 
1764   SDLoc DL(Addr);
1765 
1766   int64_t COffsetVal = 0;
1767 
1768   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1769     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1770     SAddr = Addr.getOperand(0);
1771   } else {
1772     SAddr = Addr;
1773   }
1774 
1775   SAddr = SelectSAddrFI(CurDAG, SAddr);
1776 
1777   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1778 
1779   if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1780                               SIInstrFlags::FlatScratch)) {
1781     int64_t SplitImmOffset, RemainderOffset;
1782     std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1783         COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1784 
1785     COffsetVal = SplitImmOffset;
1786 
1787     SDValue AddOffset =
1788         SAddr.getOpcode() == ISD::TargetFrameIndex
1789             ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1790             : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1791     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1792                                            SAddr, AddOffset),
1793                     0);
1794   }
1795 
1796   Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
1797 
1798   return true;
1799 }
1800 
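     // Match an SMRD byte offset. Imm is set to true only when the offset is
     // encoded as an instruction immediate; SGPR and materialized 32-bit
     // literal offsets leave Imm false.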
1801 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
1802                                           SDValue &Offset, bool &Imm) const {
1803   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
1804   if (!C) {
1805     if (ByteOffsetNode.getValueType().isScalarInteger() &&
1806         ByteOffsetNode.getValueType().getSizeInBits() == 32) {
1807       Offset = ByteOffsetNode;
1808       Imm = false;
1809       return true;
1810     }
1811     if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
1812       if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
1813         Offset = ByteOffsetNode.getOperand(0);
1814         Imm = false;
1815         return true;
1816       }
1817     }
1818     return false;
1819   }
1820 
1821   SDLoc SL(ByteOffsetNode);
1822   // GFX9 and GFX10 have signed byte immediate offsets.
1823   int64_t ByteOffset = C->getSExtValue();
1824   Optional<int64_t> EncodedOffset =
1825       AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
1826   if (EncodedOffset) {
1827     Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1828     Imm = true;
1829     return true;
1830   }
1831 
1832   // SGPR and literal offsets are unsigned.
1833   if (ByteOffset < 0)
1834     return false;
1835 
1836   EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
1837   if (EncodedOffset) {
1838     Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1839     return true;
1840   }
1841 
1842   if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
1843     return false;
1844 
1845   SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
1846   Offset = SDValue(
1847       CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
1848 
1849   return true;
1850 }
1851 
1852 SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
1853   if (Addr.getValueType() != MVT::i32)
1854     return Addr;
1855 
1856   // Extend the 32-bit address to 64 bits using the function's known high bits.
1857   SDLoc SL(Addr);
1858 
1859   const MachineFunction &MF = CurDAG->getMachineFunction();
1860   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1861   unsigned AddrHiVal = Info->get32BitAddressHighBits();
1862   SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
1863 
1864   const SDValue Ops[] = {
1865     CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
1866     Addr,
1867     CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
1868     SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
1869             0),
1870     CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
1871   };
1872 
1873   return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
1874                                         Ops), 0);
1875 }
1876 
1877 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
1878                                      SDValue &Offset, bool &Imm) const {
1879   SDLoc SL(Addr);
1880 
1881   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
1882   // wraparound, because s_load instructions perform the addition in 64 bits.
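       // E.g. a 32-bit base of 0xFFFFFFF0 plus an offset of 0x20 wraps in 32
       // bits but not in the hardware's 64-bit add, so the base and offset
       // are only split apart when the add is known not to wrap.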
1883   if ((Addr.getValueType() != MVT::i32 ||
1884        Addr->getFlags().hasNoUnsignedWrap())) {
1885     SDValue N0, N1;
1886     // Extract the base and offset if possible.
1887     if (CurDAG->isBaseWithConstantOffset(Addr) ||
1888         Addr.getOpcode() == ISD::ADD) {
1889       N0 = Addr.getOperand(0);
1890       N1 = Addr.getOperand(1);
1891     } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
1892       assert(N0 && N1 && isa<ConstantSDNode>(N1));
1893     }
1894     if (N0 && N1) {
1895       if (SelectSMRDOffset(N1, Offset, Imm)) {
1896         SBase = Expand32BitAddress(N0);
1897         return true;
1898       }
1899     }
1900   }
1901   SBase = Expand32BitAddress(Addr);
1902   Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
1903   Imm = true;
1904   return true;
1905 }
1906 
1907 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
1908                                        SDValue &Offset) const {
1909   bool Imm = false;
1910   return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
1911 }
1912 
1913 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
1914                                          SDValue &Offset) const {
1915 
1916   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
1917 
1918   bool Imm = false;
1919   if (!SelectSMRD(Addr, SBase, Offset, Imm))
1920     return false;
1921 
1922   return !Imm && isa<ConstantSDNode>(Offset);
1923 }
1924 
1925 bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
1926                                         SDValue &Offset) const {
1927   bool Imm = false;
1928   return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
1929          !isa<ConstantSDNode>(Offset);
1930 }
1931 
1932 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
1933                                              SDValue &Offset) const {
1934   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
1935     // The immediate offset for S_BUFFER instructions is unsigned.
1936     if (auto Imm =
1937             AMDGPU::getSMRDEncodedOffset(*Subtarget, C->getZExtValue(), true)) {
1938       Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
1939       return true;
1940     }
1941   }
1942 
1943   return false;
1944 }
1945 
1946 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
1947                                                SDValue &Offset) const {
1948   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
1949 
1950   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
1951     if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget,
1952                                                          C->getZExtValue())) {
1953       Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
1954       return true;
1955     }
1956   }
1957 
1958   return false;
1959 }
1960 
1961 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
1962                                             SDValue &Base,
1963                                             SDValue &Offset) const {
1964   SDLoc DL(Index);
1965 
1966   if (CurDAG->isBaseWithConstantOffset(Index)) {
1967     SDValue N0 = Index.getOperand(0);
1968     SDValue N1 = Index.getOperand(1);
1969     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1970 
1971     // (add n0, c0)
1972     // Don't peel off the offset (c0) if doing so could possibly make
1973     // the base (n0) negative.
1974     // (or n0, |c0|) can never change the sign given isBaseWithConstantOffset.
1975     if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
1976         (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
1977       Base = N0;
1978       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1979       return true;
1980     }
1981   }
1982 
1983   if (isa<ConstantSDNode>(Index))
1984     return false;
1985 
1986   Base = Index;
1987   Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1988   return true;
1989 }
1990 
1991 SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
1992                                      SDValue Val, uint32_t Offset,
1993                                      uint32_t Width) {
1994   if (Val->isDivergent()) {
1995     unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
1996     SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
1997     SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
1998 
1999     return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2000   }
2001   unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2002   // Transformation function, pack the offset and width of a BFE into
2003   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2004   // source, bits [5:0] contain the offset and bits [22:16] the width.
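       // E.g. Offset = 8 and Width = 4 pack to (8 | (4 << 16)) == 0x00040008.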
2005   uint32_t PackedVal = Offset | (Width << 16);
2006   SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2007 
2008   return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2009 }
2010 
2011 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2012   // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2013   // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2014   // Predicate: 0 < b <= c < 32
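       // E.g. "((a << 8) srl 24)" extracts bits [23:16] of a and becomes
       // "BFE_U32 a, 16, 8" (offset = 24 - 8, width = 32 - 24).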
2015 
2016   const SDValue &Shl = N->getOperand(0);
2017   ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2018   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2019 
2020   if (B && C) {
2021     uint32_t BVal = B->getZExtValue();
2022     uint32_t CVal = C->getZExtValue();
2023 
2024     if (0 < BVal && BVal <= CVal && CVal < 32) {
2025       bool Signed = N->getOpcode() == ISD::SRA;
2026       ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2027                   32 - CVal));
2028       return;
2029     }
2030   }
2031   SelectCode(N);
2032 }
2033 
2034 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2035   switch (N->getOpcode()) {
2036   case ISD::AND:
2037     if (N->getOperand(0).getOpcode() == ISD::SRL) {
2038       // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2039       // Predicate: isMask(mask)
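           // E.g. "(a srl 8) & 0xff" becomes "BFE_U32 a, 8, 8", since
           // popcount(0xff) == 8.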
2040       const SDValue &Srl = N->getOperand(0);
2041       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2042       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2043 
2044       if (Shift && Mask) {
2045         uint32_t ShiftVal = Shift->getZExtValue();
2046         uint32_t MaskVal = Mask->getZExtValue();
2047 
2048         if (isMask_32(MaskVal)) {
2049           uint32_t WidthVal = countPopulation(MaskVal);
2050           ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2051                                   WidthVal));
2052           return;
2053         }
2054       }
2055     }
2056     break;
2057   case ISD::SRL:
2058     if (N->getOperand(0).getOpcode() == ISD::AND) {
2059       // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2060       // Predicate: isMask(mask >> b)
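           // E.g. "((a & 0xff0) srl 4)" becomes "BFE_U32 a, 4, 8", since
           // popcount(0xff0 >> 4) == 8.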
2061       const SDValue &And = N->getOperand(0);
2062       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2063       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2064 
2065       if (Shift && Mask) {
2066         uint32_t ShiftVal = Shift->getZExtValue();
2067         uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2068 
2069         if (isMask_32(MaskVal)) {
2070           uint32_t WidthVal = countPopulation(MaskVal);
2071           ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2072                       WidthVal));
2073           return;
2074         }
2075       }
2076     } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2077       SelectS_BFEFromShifts(N);
2078       return;
2079     }
2080     break;
2081   case ISD::SRA:
2082     if (N->getOperand(0).getOpcode() == ISD::SHL) {
2083       SelectS_BFEFromShifts(N);
2084       return;
2085     }
2086     break;
2087 
2088   case ISD::SIGN_EXTEND_INREG: {
2089     // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2090     SDValue Src = N->getOperand(0);
2091     if (Src.getOpcode() != ISD::SRL)
2092       break;
2093 
2094     const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2095     if (!Amt)
2096       break;
2097 
2098     unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2099     ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2100                             Amt->getZExtValue(), Width));
2101     return;
2102   }
2103   }
2104 
2105   SelectCode(N);
2106 }
2107 
2108 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2109   assert(N->getOpcode() == ISD::BRCOND);
2110   if (!N->hasOneUse())
2111     return false;
2112 
2113   SDValue Cond = N->getOperand(1);
2114   if (Cond.getOpcode() == ISD::CopyToReg)
2115     Cond = Cond.getOperand(2);
2116 
2117   if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2118     return false;
2119 
2120   MVT VT = Cond.getOperand(0).getSimpleValueType();
2121   if (VT == MVT::i32)
2122     return true;
2123 
2124   if (VT == MVT::i64) {
2125     auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2126 
2127     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2128     return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2129   }
2130 
2131   return false;
2132 }
2133 
2134 void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2135   SDValue Cond = N->getOperand(1);
2136 
2137   if (Cond.isUndef()) {
2138     CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2139                          N->getOperand(2), N->getOperand(0));
2140     return;
2141   }
2142 
2143   const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2144   const SIRegisterInfo *TRI = ST->getRegisterInfo();
2145 
2146   bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2147   unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
2148   Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2149   SDLoc SL(N);
2150 
2151   if (!UseSCCBr) {
2152     // This is the case that we are selecting to S_CBRANCH_VCCNZ.  We have not
2153     // analyzed what generates the vcc value, so we do not know whether vcc
2154     // bits for disabled lanes are 0.  Thus we need to mask out bits for
2155     // disabled lanes.
2156     //
2157     // For the case that we select S_CBRANCH_SCC1 and it gets
2158     // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2159     // SIInstrInfo::moveToVALU, which inserts the S_AND.
2160     //
2161     // We could add an analysis of what generates the vcc value here and omit
2162     // the S_AND when it is unnecessary. But it would be better to add a separate
2163     // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2164     // catches both cases.
2165     Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2166                                                          : AMDGPU::S_AND_B64,
2167                      SL, MVT::i1,
2168                      CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2169                                                         : AMDGPU::EXEC,
2170                                          MVT::i1),
2171                     Cond),
2172                    0);
2173   }
2174 
2175   SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2176   CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2177                        N->getOperand(2), // Basic Block
2178                        VCC.getValue(0));
2179 }
2180 
2181 void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
2182   MVT VT = N->getSimpleValueType(0);
2183   bool IsFMA = N->getOpcode() == ISD::FMA;
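       // v_mad_mix_f32 pairs with fmad and v_fma_mix_f32 with fma; fall back
       // to normal selection when f32 mixing is unsupported or the available
       // mix flavor does not match this opcode.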
2184   if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
2185                          !Subtarget->hasFmaMixInsts()) ||
2186       ((IsFMA && Subtarget->hasMadMixInsts()) ||
2187        (!IsFMA && Subtarget->hasFmaMixInsts()))) {
2188     SelectCode(N);
2189     return;
2190   }
2191 
2192   SDValue Src0 = N->getOperand(0);
2193   SDValue Src1 = N->getOperand(1);
2194   SDValue Src2 = N->getOperand(2);
2195   unsigned Src0Mods, Src1Mods, Src2Mods;
2196 
2197   // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
2198   // using the conversion from f16.
2199   bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
2200   bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
2201   bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
2202 
2203   assert((IsFMA || !Mode.allFP32Denormals()) &&
2204          "fmad selected with denormals enabled");
2205   // TODO: We can select this with f32 denormals enabled if all the sources are
2206   // converted from f16 (in which case fmad isn't legal).
2207 
2208   if (Sel0 || Sel1 || Sel2) {
2209     // For dummy operands.
2210     SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2211     SDValue Ops[] = {
2212       CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
2213       CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
2214       CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
2215       CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
2216       Zero, Zero
2217     };
2218 
2219     CurDAG->SelectNodeTo(N,
2220                          IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
2221                          MVT::f32, Ops);
2222   } else {
2223     SelectCode(N);
2224   }
2225 }
2226 
2227 // This is here because there isn't a way to use the generated sub0_sub1 as the
2228 // subreg index to EXTRACT_SUBREG in tablegen.
2229 void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
2230   MemSDNode *Mem = cast<MemSDNode>(N);
2231   unsigned AS = Mem->getAddressSpace();
2232   if (AS == AMDGPUAS::FLAT_ADDRESS) {
2233     SelectCode(N);
2234     return;
2235   }
2236 
2237   MVT VT = N->getSimpleValueType(0);
2238   bool Is32 = (VT == MVT::i32);
2239   SDLoc SL(N);
2240 
2241   MachineSDNode *CmpSwap = nullptr;
2242   if (Subtarget->hasAddr64()) {
2243     SDValue SRsrc, VAddr, SOffset, Offset;
2244 
2245     if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset)) {
2246       unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
2247         AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
2248       SDValue CmpVal = Mem->getOperand(2);
2249       SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
2250 
2251       // XXX - Do we care about glue operands?
2252 
2253       SDValue Ops[] = {CmpVal, VAddr, SRsrc, SOffset, Offset, CPol,
2254                        Mem->getChain()};
2255 
2256       CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
2257     }
2258   }
2259 
2260   if (!CmpSwap) {
2261     SDValue SRsrc, SOffset, Offset;
2262     if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset)) {
2263       unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
2264         AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;
2265 
2266       SDValue CmpVal = Mem->getOperand(2);
2267       SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
2268       SDValue Ops[] = {CmpVal, SRsrc, SOffset, Offset, CPol, Mem->getChain()};
2269 
2270       CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
2271     }
2272   }
2273 
2274   if (!CmpSwap) {
2275     SelectCode(N);
2276     return;
2277   }
2278 
2279   MachineMemOperand *MMO = Mem->getMemOperand();
2280   CurDAG->setNodeMemRefs(CmpSwap, {MMO});
2281 
2282   unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
2283   SDValue Extract
2284     = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));
2285 
2286   ReplaceUses(SDValue(N, 0), Extract);
2287   ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
2288   CurDAG->RemoveDeadNode(N);
2289 }
2290 
2291 void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2292   // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2293   // be copied to an SGPR with readfirstlane.
2294   unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2295     AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2296 
2297   SDValue Chain = N->getOperand(0);
2298   SDValue Ptr = N->getOperand(2);
2299   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2300   MachineMemOperand *MMO = M->getMemOperand();
2301   bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2302 
2303   SDValue Offset;
2304   if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2305     SDValue PtrBase = Ptr.getOperand(0);
2306     SDValue PtrOffset = Ptr.getOperand(1);
2307 
2308     const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
2309     if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2310       N = glueCopyToM0(N, PtrBase);
2311       Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2312     }
2313   }
2314 
2315   if (!Offset) {
2316     N = glueCopyToM0(N, Ptr);
2317     Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2318   }
2319 
2320   SDValue Ops[] = {
2321     Offset,
2322     CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2323     Chain,
2324     N->getOperand(N->getNumOperands() - 1) // New glue
2325   };
2326 
2327   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2328   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2329 }
2330 
2331 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2332   switch (IntrID) {
2333   case Intrinsic::amdgcn_ds_gws_init:
2334     return AMDGPU::DS_GWS_INIT;
2335   case Intrinsic::amdgcn_ds_gws_barrier:
2336     return AMDGPU::DS_GWS_BARRIER;
2337   case Intrinsic::amdgcn_ds_gws_sema_v:
2338     return AMDGPU::DS_GWS_SEMA_V;
2339   case Intrinsic::amdgcn_ds_gws_sema_br:
2340     return AMDGPU::DS_GWS_SEMA_BR;
2341   case Intrinsic::amdgcn_ds_gws_sema_p:
2342     return AMDGPU::DS_GWS_SEMA_P;
2343   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2344     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2345   default:
2346     llvm_unreachable("not a gws intrinsic");
2347   }
2348 }
2349 
2350 void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2351   if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2352       !Subtarget->hasGWSSemaReleaseAll()) {
2353     // Let this error.
2354     SelectCode(N);
2355     return;
2356   }
2357 
2358   // Chain, intrinsic ID, vsrc, offset
2359   const bool HasVSrc = N->getNumOperands() == 4;
2360   assert(HasVSrc || N->getNumOperands() == 3);
2361 
2362   SDLoc SL(N);
2363   SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2364   int ImmOffset = 0;
2365   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2366   MachineMemOperand *MMO = M->getMemOperand();
2367 
2368   // Don't worry if the offset ends up in a VGPR. Only one lane's value
2369   // takes effect, so SIFixSGPRCopies will validly insert readfirstlane.
2370 
2371   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2372   // offset field) % 64. Some versions of the programming guide omit the m0
2373   // part, or claim it's from offset 0.
2374   if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2375     // If we have a constant offset, try to use the 0 in m0 as the base.
2376     // TODO: Look into changing the default m0 initialization value. If the
2377     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2378     // the immediate offset.
2379     glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2380     ImmOffset = ConstOffset->getZExtValue();
2381   } else {
2382     if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2383       ImmOffset = BaseOffset.getConstantOperandVal(1);
2384       BaseOffset = BaseOffset.getOperand(0);
2385     }
2386 
2387     // Prefer to do the shift in an SGPR since it should be possible to use m0
2388     // as the result directly. If it's already an SGPR, it will be eliminated
2389     // later.
2390     SDNode *SGPROffset
2391       = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2392                                BaseOffset);
2393     // Shift to offset in m0
2394     SDNode *M0Base
2395       = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2396                                SDValue(SGPROffset, 0),
2397                                CurDAG->getTargetConstant(16, SL, MVT::i32));
2398     glueCopyToM0(N, SDValue(M0Base, 0));
2399   }
2400 
2401   SDValue Chain = N->getOperand(0);
2402   SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2403 
2404   const unsigned Opc = gwsIntrinToOpcode(IntrID);
2405   SmallVector<SDValue, 5> Ops;
2406   if (HasVSrc)
2407     Ops.push_back(N->getOperand(2));
2408   Ops.push_back(OffsetField);
2409   Ops.push_back(Chain);
2410 
2411   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2412   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2413 }
2414 
2415 void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2416   if (Subtarget->getLDSBankCount() != 16) {
2417     // This is a single instruction with a pattern.
2418     SelectCode(N);
2419     return;
2420   }
2421 
2422   SDLoc DL(N);
2423 
2424   // This requires 2 instructions. It is possible to write a pattern to support
2425   // this, but the generated isel emitter doesn't correctly deal with multiple
2426   // output instructions using the same physical register input. The copy to m0
2427   // is incorrectly placed before the second instruction.
2428   //
2429   // TODO: Match source modifiers.
2430   //
2431   // def : Pat <
2432   //   (int_amdgcn_interp_p1_f16
2433   //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
2434   //                             (i32 timm:$attrchan), (i32 timm:$attr),
2435   //                             (i1 timm:$high), M0),
2436   //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2437   //       timm:$attrchan, 0,
2438   //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2439   //   let Predicates = [has16BankLDS];
2440   // }
2441 
2442   // 16 bank LDS
2443   SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2444                                       N->getOperand(5), SDValue());
2445 
2446   SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2447 
2448   SDNode *InterpMov =
2449     CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2450         CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2451         N->getOperand(3),  // Attr
2452         N->getOperand(2),  // Attrchan
2453         ToM0.getValue(1) // In glue
2454   });
2455 
2456   SDNode *InterpP1LV =
2457     CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2458         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2459         N->getOperand(1), // Src0
2460         N->getOperand(3), // Attr
2461         N->getOperand(2), // Attrchan
2462         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2463         SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2464         N->getOperand(4), // high
2465         CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2466         CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2467         SDValue(InterpMov, 1)
2468   });
2469 
2470   CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2471 }
2472 
2473 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2474   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2475   switch (IntrID) {
2476   case Intrinsic::amdgcn_ds_append:
2477   case Intrinsic::amdgcn_ds_consume: {
2478     if (N->getValueType(0) != MVT::i32)
2479       break;
2480     SelectDSAppendConsume(N, IntrID);
2481     return;
2482   }
2483   }
2484 
2485   SelectCode(N);
2486 }
2487 
2488 void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2489   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2490   unsigned Opcode;
2491   switch (IntrID) {
2492   case Intrinsic::amdgcn_wqm:
2493     Opcode = AMDGPU::WQM;
2494     break;
2495   case Intrinsic::amdgcn_softwqm:
2496     Opcode = AMDGPU::SOFT_WQM;
2497     break;
2498   case Intrinsic::amdgcn_wwm:
2499   case Intrinsic::amdgcn_strict_wwm:
2500     Opcode = AMDGPU::STRICT_WWM;
2501     break;
2502   case Intrinsic::amdgcn_strict_wqm:
2503     Opcode = AMDGPU::STRICT_WQM;
2504     break;
2505   case Intrinsic::amdgcn_interp_p1_f16:
2506     SelectInterpP1F16(N);
2507     return;
2508   default:
2509     SelectCode(N);
2510     return;
2511   }
2512 
2513   SDValue Src = N->getOperand(1);
2514   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2515 }
2516 
2517 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2518   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2519   switch (IntrID) {
2520   case Intrinsic::amdgcn_ds_gws_init:
2521   case Intrinsic::amdgcn_ds_gws_barrier:
2522   case Intrinsic::amdgcn_ds_gws_sema_v:
2523   case Intrinsic::amdgcn_ds_gws_sema_br:
2524   case Intrinsic::amdgcn_ds_gws_sema_p:
2525   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2526     SelectDS_GWS(N, IntrID);
2527     return;
2528   default:
2529     break;
2530   }
2531 
2532   SelectCode(N);
2533 }
2534 
2535 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2536                                             unsigned &Mods,
2537                                             bool AllowAbs) const {
2538   Mods = 0;
2539   Src = In;
2540 
2541   if (Src.getOpcode() == ISD::FNEG) {
2542     Mods |= SISrcMods::NEG;
2543     Src = Src.getOperand(0);
2544   }
2545 
2546   if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2547     Mods |= SISrcMods::ABS;
2548     Src = Src.getOperand(0);
2549   }
2550 
2551   return true;
2552 }
2553 
2554 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2555                                         SDValue &SrcMods) const {
2556   unsigned Mods;
2557   if (SelectVOP3ModsImpl(In, Src, Mods)) {
2558     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2559     return true;
2560   }
2561 
2562   return false;
2563 }
2564 
2565 bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2566                                          SDValue &SrcMods) const {
2567   unsigned Mods;
2568   if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
2569     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2570     return true;
2571   }
2572 
2573   return false;
2574 }
2575 
2576 bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
2577                                              SDValue &SrcMods) const {
2578   SelectVOP3Mods(In, Src, SrcMods);
2579   return isNoNanSrc(Src);
2580 }
2581 
2582 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2583   if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2584     return false;
2585 
2586   Src = In;
2587   return true;
2588 }
2589 
2590 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2591                                          SDValue &SrcMods, SDValue &Clamp,
2592                                          SDValue &Omod) const {
2593   SDLoc DL(In);
2594   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2595   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2596 
2597   return SelectVOP3Mods(In, Src, SrcMods);
2598 }
2599 
2600 bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2601                                           SDValue &SrcMods, SDValue &Clamp,
2602                                           SDValue &Omod) const {
2603   SDLoc DL(In);
2604   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2605   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2606 
2607   return SelectVOP3BMods(In, Src, SrcMods);
2608 }
2609 
2610 bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2611                                          SDValue &Clamp, SDValue &Omod) const {
2612   Src = In;
2613 
2614   SDLoc DL(In);
2615   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2616   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2617 
2618   return true;
2619 }
2620 
2621 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2622                                          SDValue &SrcMods) const {
2623   unsigned Mods = 0;
2624   Src = In;
2625 
2626   if (Src.getOpcode() == ISD::FNEG) {
2627     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2628     Src = Src.getOperand(0);
2629   }
2630 
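       // A build_vector of two 16-bit halves can often be folded into the
       // packed neg / op_sel modifiers instead of materializing the vector.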
2631   if (Src.getOpcode() == ISD::BUILD_VECTOR) {
2632     unsigned VecMods = Mods;
2633 
2634     SDValue Lo = stripBitcast(Src.getOperand(0));
2635     SDValue Hi = stripBitcast(Src.getOperand(1));
2636 
2637     if (Lo.getOpcode() == ISD::FNEG) {
2638       Lo = stripBitcast(Lo.getOperand(0));
2639       Mods ^= SISrcMods::NEG;
2640     }
2641 
2642     if (Hi.getOpcode() == ISD::FNEG) {
2643       Hi = stripBitcast(Hi.getOperand(0));
2644       Mods ^= SISrcMods::NEG_HI;
2645     }
2646 
2647     if (isExtractHiElt(Lo, Lo))
2648       Mods |= SISrcMods::OP_SEL_0;
2649 
2650     if (isExtractHiElt(Hi, Hi))
2651       Mods |= SISrcMods::OP_SEL_1;
2652 
2653     unsigned VecSize = Src.getValueSizeInBits();
2654     Lo = stripExtractLoElt(Lo);
2655     Hi = stripExtractLoElt(Hi);
2656 
2657     if (Lo.getValueSizeInBits() > VecSize) {
2658       Lo = CurDAG->getTargetExtractSubreg(
2659         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2660         MVT::getIntegerVT(VecSize), Lo);
2661     }
2662 
2663     if (Hi.getValueSizeInBits() > VecSize) {
2664       Hi = CurDAG->getTargetExtractSubreg(
2665         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2666         MVT::getIntegerVT(VecSize), Hi);
2667     }
2668 
2669     assert(Lo.getValueSizeInBits() <= VecSize &&
2670            Hi.getValueSizeInBits() <= VecSize);
2671 
2672     if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
2673       // Really a scalar input. Just select from the low half of the register to
2674       // avoid packing.
2675 
2676       if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
2677         Src = Lo;
2678       } else {
2679         assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
2680 
2681         SDLoc SL(In);
2682         SDValue Undef = SDValue(
2683           CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
2684                                  Lo.getValueType()), 0);
2685         auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
2686                                     : AMDGPU::SReg_64RegClassID;
2687         const SDValue Ops[] = {
2688           CurDAG->getTargetConstant(RC, SL, MVT::i32),
2689           Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2690           Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
2691 
2692         Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
2693                                              Src.getValueType(), Ops), 0);
2694       }
2695       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2696       return true;
2697     }
2698 
2699     if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
2700       uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
2701                       .bitcastToAPInt().getZExtValue();
2702       if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
2703         Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
2704         SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2705         return true;
2706       }
2707     }
2708 
2709     Mods = VecMods;
2710   }
2711 
2712   // Packed instructions do not have abs modifiers.
2713   Mods |= SISrcMods::OP_SEL_1;
2714 
2715   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2716   return true;
2717 }
2718 
2719 bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
2720                                          SDValue &SrcMods) const {
2721   Src = In;
2722   // FIXME: Handle op_sel
2723   SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
2724   return true;
2725 }
2726 
2727 bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
2728                                              SDValue &SrcMods) const {
2729   // FIXME: Handle op_sel
2730   return SelectVOP3Mods(In, Src, SrcMods);
2731 }
2732 
2733 // The return value is not whether the match is possible (which it always is),
2734 // but whether or not a conversion is really used.
2735 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
2736                                                    unsigned &Mods) const {
2737   Mods = 0;
2738   SelectVOP3ModsImpl(In, Src, Mods);
2739 
2740   if (Src.getOpcode() == ISD::FP_EXTEND) {
2741     Src = Src.getOperand(0);
2742     assert(Src.getValueType() == MVT::f16);
2743     Src = stripBitcast(Src);
2744 
2745     // Be careful about folding modifiers if we already have an abs. fneg is
2746     // applied last, so we don't want to apply an earlier fneg.
2747     if ((Mods & SISrcMods::ABS) == 0) {
2748       unsigned ModsTmp;
2749       SelectVOP3ModsImpl(Src, Src, ModsTmp);
2750 
2751       if ((ModsTmp & SISrcMods::NEG) != 0)
2752         Mods ^= SISrcMods::NEG;
2753 
2754       if ((ModsTmp & SISrcMods::ABS) != 0)
2755         Mods |= SISrcMods::ABS;
2756     }
2757 
2758     // op_sel/op_sel_hi decide the source type and source.
2759     // If the source's op_sel_hi is set, it indicates a conversion from f16.
2760     // If the source's op_sel is set, it picks the high half of the source
2761     // register.
2762 
2763     Mods |= SISrcMods::OP_SEL_1;
2764     if (isExtractHiElt(Src, Src)) {
2765       Mods |= SISrcMods::OP_SEL_0;
2766 
2767       // TODO: Should we try to look for neg/abs here?
2768     }
2769 
2770     return true;
2771   }
2772 
2773   return false;
2774 }
2775 
2776 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
2777                                                SDValue &SrcMods) const {
2778   unsigned Mods = 0;
2779   SelectVOP3PMadMixModsImpl(In, Src, Mods);
2780   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2781   return true;
2782 }
2783 
2784 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
2785   if (In.isUndef())
2786     return CurDAG->getUNDEF(MVT::i32);
2787 
2788   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
2789     SDLoc SL(In);
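         // Place the constant in the high 16 bits, e.g. 0x1234 -> 0x12340000.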
2790     return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
2791   }
2792 
2793   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
2794     SDLoc SL(In);
2795     return CurDAG->getConstant(
2796       C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
2797   }
2798 
2799   SDValue Src;
2800   if (isExtractHiElt(In, Src))
2801     return Src;
2802 
2803   return SDValue();
2804 }
2805 
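     // Heuristic: keep an immediate in a VGPR only if, among up to the first
     // 10 uses, at least one use cannot accept an SGPR operand even after
     // commuting the user instruction.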
2806 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
2807   assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
2808 
2809   const SIRegisterInfo *SIRI =
2810     static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
2811   const SIInstrInfo * SII =
2812     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
2813 
2814   unsigned Limit = 0;
2815   bool AllUsesAcceptSReg = true;
2816   for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
2817     Limit < 10 && U != E; ++U, ++Limit) {
2818     const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
2819 
2820     // If the register class is unknown, it could be a register class that
2821     // needs to be an SGPR, e.g. due to an inline asm
2822     // constraint.
2823     if (!RC || SIRI->isSGPRClass(RC))
2824       return false;
2825 
2826     if (RC != &AMDGPU::VS_32RegClass) {
2827       AllUsesAcceptSReg = false;
2828       SDNode * User = *U;
2829       if (User->isMachineOpcode()) {
2830         unsigned Opc = User->getMachineOpcode();
2831         MCInstrDesc Desc = SII->get(Opc);
2832         if (Desc.isCommutable()) {
2833           unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
2834           unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
2835           if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
2836             unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
2837             const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
2838             if (CommutedRC == &AMDGPU::VS_32RegClass)
2839               AllUsesAcceptSReg = true;
2840           }
2841         }
2842       }
2843       // If "AllUsesAcceptSReg == false" so far we haven't suceeded
2844       // commuting current user. This means have at least one use
2845       // that strictly require VGPR. Thus, we will not attempt to commute
2846       // other user instructions.
2847       if (!AllUsesAcceptSReg)
2848         break;
2849     }
2850   }
2851   return !AllUsesAcceptSReg && (Limit < 10);
2852 }
2853 
2854 bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
2855   auto Ld = cast<LoadSDNode>(N);
2856 
2857   // A load can be selected as a scalar (SMEM) load only if it is known to
2858   // be aligned to at least 4 bytes and its address is uniform across the
2859   // wave.
2860   if (Ld->getAlignment() < 4 || N->isDivergent())
2861     return false;
2862 
2863   // Loads from the constant address spaces are always eligible.
2864   unsigned AS = Ld->getAddressSpace();
2865   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
2866       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2867     return true;
2868 
2869   // A simple global load is also eligible when the subtarget scalarizes
2870   // global loads and no store in the function can clobber the loaded
2871   // memory.
2872   return Subtarget->getScalarizeGlobalBehavior() &&
2873          AS == AMDGPUAS::GLOBAL_ADDRESS &&
2874          Ld->isSimple() &&
2875          static_cast<const SITargetLowering *>(getTargetLowering())
2876              ->isMemOpHasNoClobberedMemOperand(N);
2877 }
2878 
2879 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
2880   const AMDGPUTargetLowering& Lowering =
2881     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
2882   bool IsModified = false;
2883   do {
2884     IsModified = false;
2885 
2886     // Go over all selected nodes and try to fold them a bit more
2887     SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
2888     while (Position != CurDAG->allnodes_end()) {
2889       SDNode *Node = &*Position++;
2890       MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
2891       if (!MachineNode)
2892         continue;
2893 
2894       SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
2895       if (ResNode != Node) {
2896         if (ResNode)
2897           ReplaceUses(Node, ResNode);
2898         IsModified = true;
2899       }
2900     }
2901     CurDAG->RemoveDeadNodes();
2902   } while (IsModified);
2903 }
2904