xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Defines an instruction selector for the AMDGPU target.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUISelDAGToDAG.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "AMDGPUSubtarget.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "MCTargetDesc/R600MCTargetDesc.h"
21 #include "R600RegisterInfo.h"
22 #include "SIISelLowering.h"
23 #include "SIMachineFunctionInfo.h"
24 #include "llvm/Analysis/UniformityAnalysis.h"
25 #include "llvm/CodeGen/FunctionLoweringInfo.h"
26 #include "llvm/CodeGen/SelectionDAG.h"
27 #include "llvm/CodeGen/SelectionDAGISel.h"
28 #include "llvm/CodeGen/SelectionDAGNodes.h"
29 #include "llvm/IR/IntrinsicsAMDGPU.h"
30 #include "llvm/Support/ErrorHandling.h"
31 
32 #ifdef EXPENSIVE_CHECKS
33 #include "llvm/Analysis/LoopInfo.h"
34 #include "llvm/IR/Dominators.h"
35 #endif
36 
37 #define DEBUG_TYPE "amdgpu-isel"
38 
39 using namespace llvm;
40 
41 //===----------------------------------------------------------------------===//
42 // Instruction Selector Implementation
43 //===----------------------------------------------------------------------===//
44 
45 namespace {
stripBitcast(SDValue Val)46 static SDValue stripBitcast(SDValue Val) {
47   return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
48 }
49 
50 // Figure out if this is really an extract of the high 16-bits of a dword.
isExtractHiElt(SDValue In,SDValue & Out)51 static bool isExtractHiElt(SDValue In, SDValue &Out) {
52   In = stripBitcast(In);
53 
54   if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
55     if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
56       if (!Idx->isOne())
57         return false;
58       Out = In.getOperand(0);
59       return true;
60     }
61   }
62 
63   if (In.getOpcode() != ISD::TRUNCATE)
64     return false;
65 
66   SDValue Srl = In.getOperand(0);
67   if (Srl.getOpcode() == ISD::SRL) {
68     if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
69       if (ShiftAmt->getZExtValue() == 16) {
70         Out = stripBitcast(Srl.getOperand(0));
71         return true;
72       }
73     }
74   }
75 
76   return false;
77 }
78 
79 // Look through operations that obscure just looking at the low 16-bits of the
80 // same register.
stripExtractLoElt(SDValue In)81 static SDValue stripExtractLoElt(SDValue In) {
82   if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
83     SDValue Idx = In.getOperand(1);
84     if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
85       return In.getOperand(0);
86   }
87 
88   if (In.getOpcode() == ISD::TRUNCATE) {
89     SDValue Src = In.getOperand(0);
90     if (Src.getValueType().getSizeInBits() == 32)
91       return stripBitcast(Src);
92   }
93 
94   return In;
95 }
96 
97 } // end anonymous namespace
98 
// Pass registration for AMDGPUDAGToDAGISelLegacy under the "amdgpu-isel"
// argument name, together with the passes it declares as dependencies. The
// dominator-tree and loop-info dependencies are only declared when
// EXPENSIVE_CHECKS is defined.
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                    false)
112 
/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
///
/// \returns a newly allocated AMDGPUDAGToDAGISelLegacy constructed from
/// \p TM and \p OptLevel.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}
119 
/// Construct the selector. \p TM and \p OptLevel are forwarded unchanged to
/// the SelectionDAGISel base class; no other state is initialized here.
AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(TM, OptLevel) {}
123 
/// Cache the per-function state used during selection (the GCN subtarget and
/// the mode register defaults) and then run the generic SelectionDAG
/// instruction selector on \p MF.
///
/// \returns the result of SelectionDAGISel::runOnMachineFunction.
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  // Mode depends on the subtarget cached above, so this must come second.
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}
130 
fp16SrcZerosHighBits(unsigned Opc) const131 bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
132   // XXX - only need to list legal operations.
133   switch (Opc) {
134   case ISD::FADD:
135   case ISD::FSUB:
136   case ISD::FMUL:
137   case ISD::FDIV:
138   case ISD::FREM:
139   case ISD::FCANONICALIZE:
140   case ISD::UINT_TO_FP:
141   case ISD::SINT_TO_FP:
142   case ISD::FABS:
143     // Fabs is lowered to a bit operation, but it's an and which will clear the
144     // high bits anyway.
145   case ISD::FSQRT:
146   case ISD::FSIN:
147   case ISD::FCOS:
148   case ISD::FPOWI:
149   case ISD::FPOW:
150   case ISD::FLOG:
151   case ISD::FLOG2:
152   case ISD::FLOG10:
153   case ISD::FEXP:
154   case ISD::FEXP2:
155   case ISD::FCEIL:
156   case ISD::FTRUNC:
157   case ISD::FRINT:
158   case ISD::FNEARBYINT:
159   case ISD::FROUNDEVEN:
160   case ISD::FROUND:
161   case ISD::FFLOOR:
162   case ISD::FMINNUM:
163   case ISD::FMAXNUM:
164   case ISD::FLDEXP:
165   case AMDGPUISD::FRACT:
166   case AMDGPUISD::CLAMP:
167   case AMDGPUISD::COS_HW:
168   case AMDGPUISD::SIN_HW:
169   case AMDGPUISD::FMIN3:
170   case AMDGPUISD::FMAX3:
171   case AMDGPUISD::FMED3:
172   case AMDGPUISD::FMAD_FTZ:
173   case AMDGPUISD::RCP:
174   case AMDGPUISD::RSQ:
175   case AMDGPUISD::RCP_IFLAG:
176     // On gfx10, all 16-bit instructions preserve the high bits.
177     return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
178   case ISD::FP_ROUND:
179     // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
180     // high bits on gfx9.
181     // TODO: If we had the source node we could see if the source was fma/mad
182     return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
183   case ISD::FMA:
184   case ISD::FMAD:
185   case AMDGPUISD::DIV_FIXUP:
186     return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
187   default:
188     // fcopysign, select and others may be lowered to 32-bit bit operations
189     // which don't zero the high bits.
190     return false;
191   }
192 }
193 
runOnMachineFunction(MachineFunction & MF)194 bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
195 #ifdef EXPENSIVE_CHECKS
196   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
197   LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
198   for (auto &L : LI->getLoopsInPreorder()) {
199     assert(L->isLCSSAForm(DT));
200   }
201 #endif
202   return SelectionDAGISelLegacy::runOnMachineFunction(MF);
203 }
204 
/// Declare the analyses this pass requires: argument usage info and
/// uniformity info always, plus the dominator tree and loop info when
/// EXPENSIVE_CHECKS is defined. The base class then adds its own
/// requirements to \p AU.
void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISelLegacy::getAnalysisUsage(AU);
}
214 
/// Try to fold a load feeding one half of a two-element 16-bit build_vector
/// \p N into a D16 load node that takes the other half as a tied input.
///
/// The high half is tried first, then the low half. On success all uses of
/// \p N and of the original load's chain result are redirected to the new
/// node; the replaced nodes are left in the DAG for the caller to clean up.
///
/// \returns true if the DAG was changed.
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    // The low half becomes the tied input of the new load.
    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    // Pick the opcode from the memory type and, for i8, the extension kind.
    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    // Redirect both the vector value and the old load's chain result.
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    // Give up if no tied input is available for the high half, or if using
    // it would make the new load depend on the load being replaced.
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    // The tied input has to carry the vector result type.
    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}
294 
PreprocessISelDAG()295 void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
296   if (!Subtarget->d16PreservesUnusedBits())
297     return;
298 
299   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
300 
301   bool MadeChange = false;
302   while (Position != CurDAG->allnodes_begin()) {
303     SDNode *N = &*--Position;
304     if (N->use_empty())
305       continue;
306 
307     switch (N->getOpcode()) {
308     case ISD::BUILD_VECTOR:
309       // TODO: Match load d16 from shl (extload:i16), 16
310       MadeChange |= matchLoadD16FromBuildVector(N);
311       break;
312     default:
313       break;
314     }
315   }
316 
317   if (MadeChange) {
318     CurDAG->RemoveDeadNodes();
319     LLVM_DEBUG(dbgs() << "After PreProcess:\n";
320                CurDAG->dump(););
321   }
322 }
323 
isInlineImmediate(const SDNode * N) const324 bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
325   if (N->isUndef())
326     return true;
327 
328   const SIInstrInfo *TII = Subtarget->getInstrInfo();
329   if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
330     return TII->isInlineConstant(C->getAPIntValue());
331 
332   if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
333     return TII->isInlineConstant(C->getValueAPF());
334 
335   return false;
336 }
337 
338 /// Determine the register class for \p OpNo
339 /// \returns The register class of the virtual register that will be used for
340 /// the given operand number \OpNo or NULL if the register class cannot be
341 /// determined.
getOperandRegClass(SDNode * N,unsigned OpNo) const342 const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
343                                                           unsigned OpNo) const {
344   if (!N->isMachineOpcode()) {
345     if (N->getOpcode() == ISD::CopyToReg) {
346       Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
347       if (Reg.isVirtual()) {
348         MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
349         return MRI.getRegClass(Reg);
350       }
351 
352       const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
353       return TRI->getPhysRegBaseClass(Reg);
354     }
355 
356     return nullptr;
357   }
358 
359   switch (N->getMachineOpcode()) {
360   default: {
361     const MCInstrDesc &Desc =
362         Subtarget->getInstrInfo()->get(N->getMachineOpcode());
363     unsigned OpIdx = Desc.getNumDefs() + OpNo;
364     if (OpIdx >= Desc.getNumOperands())
365       return nullptr;
366     int RegClass = Desc.operands()[OpIdx].RegClass;
367     if (RegClass == -1)
368       return nullptr;
369 
370     return Subtarget->getRegisterInfo()->getRegClass(RegClass);
371   }
372   case AMDGPU::REG_SEQUENCE: {
373     unsigned RCID = N->getConstantOperandVal(0);
374     const TargetRegisterClass *SuperRC =
375         Subtarget->getRegisterInfo()->getRegClass(RCID);
376 
377     SDValue SubRegOp = N->getOperand(OpNo + 1);
378     unsigned SubRegIdx = SubRegOp->getAsZExtVal();
379     return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
380                                                               SubRegIdx);
381   }
382   }
383 }
384 
glueCopyToOp(SDNode * N,SDValue NewChain,SDValue Glue) const385 SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
386                                          SDValue Glue) const {
387   SmallVector <SDValue, 8> Ops;
388   Ops.push_back(NewChain); // Replace the chain.
389   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
390     Ops.push_back(N->getOperand(i));
391 
392   Ops.push_back(Glue);
393   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
394 }
395 
glueCopyToM0(SDNode * N,SDValue Val) const396 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
397   const SITargetLowering& Lowering =
398     *static_cast<const SITargetLowering*>(getTargetLowering());
399 
400   assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
401 
402   SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
403   return glueCopyToOp(N, M0, M0.getValue(1));
404 }
405 
glueCopyToM0LDSInit(SDNode * N) const406 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
407   unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
408   if (AS == AMDGPUAS::LOCAL_ADDRESS) {
409     if (Subtarget->ldsRequiresM0Init())
410       return glueCopyToM0(
411           N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
412   } else if (AS == AMDGPUAS::REGION_ADDRESS) {
413     MachineFunction &MF = CurDAG->getMachineFunction();
414     unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
415     return
416         glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
417   }
418   return N;
419 }
420 
buildSMovImm64(SDLoc & DL,uint64_t Imm,EVT VT) const421 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
422                                                   EVT VT) const {
423   SDNode *Lo = CurDAG->getMachineNode(
424       AMDGPU::S_MOV_B32, DL, MVT::i32,
425       CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
426   SDNode *Hi = CurDAG->getMachineNode(
427       AMDGPU::S_MOV_B32, DL, MVT::i32,
428       CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
429   const SDValue Ops[] = {
430       CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
431       SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
432       SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
433 
434   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
435 }
436 
SelectBuildVector(SDNode * N,unsigned RegClassID)437 void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
438   EVT VT = N->getValueType(0);
439   unsigned NumVectorElts = VT.getVectorNumElements();
440   EVT EltVT = VT.getVectorElementType();
441   SDLoc DL(N);
442   SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
443 
444   if (NumVectorElts == 1) {
445     CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
446                          RegClass);
447     return;
448   }
449 
450   bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
451   if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
452       CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
453     uint64_t C = 0;
454     bool AllConst = true;
455     unsigned EltSize = EltVT.getSizeInBits();
456     for (unsigned I = 0; I < NumVectorElts; ++I) {
457       SDValue Op = N->getOperand(I);
458       if (Op.isUndef()) {
459         AllConst = false;
460         break;
461       }
462       uint64_t Val;
463       if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) {
464         Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
465       } else
466         Val = cast<ConstantSDNode>(Op)->getZExtValue();
467       C |= Val << (EltSize * I);
468     }
469     if (AllConst) {
470       SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
471       MachineSDNode *Copy =
472           CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
473       CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
474                            RegClass);
475       return;
476     }
477   }
478 
479   assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
480                                   "supported yet");
481   // 32 = Max Num Vector Elements
482   // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
483   // 1 = Vector Register Class
484   SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
485 
486   RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
487   bool IsRegSeq = true;
488   unsigned NOps = N->getNumOperands();
489   for (unsigned i = 0; i < NOps; i++) {
490     // XXX: Why is this here?
491     if (isa<RegisterSDNode>(N->getOperand(i))) {
492       IsRegSeq = false;
493       break;
494     }
495     unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
496                          : R600RegisterInfo::getSubRegFromChannel(i);
497     RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
498     RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
499   }
500   if (NOps != NumVectorElts) {
501     // Fill in the missing undef elements if this was a scalar_to_vector.
502     assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
503     MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
504                                                    DL, EltVT);
505     for (unsigned i = NOps; i < NumVectorElts; ++i) {
506       unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
507                            : R600RegisterInfo::getSubRegFromChannel(i);
508       RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
509       RegSeqArgs[1 + (2 * i) + 1] =
510           CurDAG->getTargetConstant(Sub, DL, MVT::i32);
511     }
512   }
513 
514   if (!IsRegSeq)
515     SelectCode(N);
516   CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
517 }
518 
/// Select a VECTOR_SHUFFLE node \p N.
///
/// Only two-element shuffles with 32-bit elements on subtargets with
/// V_PK_MOV_B32 are handled here; everything else is passed to SelectCode().
/// A divergent shuffle that takes the high element of its first source and
/// the low element of its second becomes V_PK_MOV_B32; every other handled
/// shuffle becomes a REG_SEQUENCE of two subregister extracts.
void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // TODO: Handle 16-bit element vectors with even aligned masks.
  if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
      VT.getVectorNumElements() != 2) {
    SelectCode(N);
    return;
  }

  auto *SVN = cast<ShuffleVectorSDNode>(N);

  SDValue Src0 = SVN->getOperand(0);
  SDValue Src1 = SVN->getOperand(1);
  ArrayRef<int> Mask = SVN->getMask();
  SDLoc DL(N);

  assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
         Mask[0] < 4 && Mask[1] < 4);

  // Mask values 0-1 pick from Src0 and 2-3 from Src1; the low mask bit picks
  // the subregister within the chosen source.
  SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
  SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
  unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
  unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;

  // A negative mask entry marks an undefined lane: feed it from an
  // IMPLICIT_DEF and reuse the other lane's subregister index.
  if (Mask[0] < 0) {
    Src0SubReg = Src1SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc0 = SDValue(ImpDef, 0);
  }

  if (Mask[1] < 0) {
    Src1SubReg = Src0SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc1 = SDValue(ImpDef, 0);
  }

  // SGPR case needs to lower to copies.
  //
  // Also use subregister extract when we can directly blend the registers with
  // a simple subregister copy.
  //
  // TODO: Maybe we should fold this out earlier
  if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
      Src1SubReg == AMDGPU::sub0) {
    // The low element of the result always comes from src0.
    // The high element of the result always comes from src1.
    // op_sel selects the high half of src0.
    // op_sel_hi selects the high half of src1.

    unsigned Src0OpSel =
        Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
    unsigned Src1OpSel =
        Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;

    // Enable op_sel_hi to avoid printing it. This should have no effect on the
    // result.
    Src0OpSel |= SISrcMods::OP_SEL_1;
    Src1OpSel |= SISrcMods::OP_SEL_1;

    SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
    SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
    SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);

    CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
                         {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
                          ZeroMods,   // clamp
                          ZeroMods,   // op_sel
                          ZeroMods,   // op_sel_hi
                          ZeroMods,   // neg_lo
                          ZeroMods}); // neg_hi
    return;
  }

  // Otherwise extract the two selected subregisters and recombine them.
  SDValue ResultElt0 =
      CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
  SDValue ResultElt1 =
      CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);

  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
  CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
}
607 
/// Main selection entry point for a single node \p N.
///
/// Nodes that are already machine nodes are skipped. Loads, stores and
/// atomics first get an M0 initialization glued on where needed. Opcodes
/// listed in the switch are handled by dedicated Select* helpers or inline
/// code; every case that `break`s, and every unlisted opcode, ends up in
/// SelectCode().
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return;   // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lower it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    // 16-bit element vectors: only a two-element BUILD_VECTOR that
    // packConstantV2I16 can pack is handled here; the rest goes to
    // SelectCode().
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::VECTOR_SHUFFLE:
    SelectVectorShuffle(N);
    return;
  case ISD::BUILD_PAIR: {
    // Combine the two halves with a REG_SEQUENCE; only i64 and i128 results
    // are supported.
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    // Only 64-bit constants that are neither inline immediates nor valid
    // 32-bit literals are split into two 32-bit moves here, and only on
    // subtargets without 64-bit literals.
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
        Subtarget->has64BitLiterals())
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    // N may be replaced by the legalized node; it still goes through
    // SelectCode() below.
    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  // Fall back to the generated pattern matcher.
  SelectCode(N);
}
827 
isUniformBr(const SDNode * N) const828 bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
829   const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
830   const Instruction *Term = BB->getTerminator();
831   return Term->getMetadata("amdgpu.uniform") ||
832          Term->getMetadata("structurizecfg.uniform");
833 }
834 
isUnneededShiftMask(const SDNode * N,unsigned ShAmtBits) const835 bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
836                                              unsigned ShAmtBits) const {
837   assert(N->getOpcode() == ISD::AND);
838 
839   const APInt &RHS = N->getConstantOperandAPInt(1);
840   if (RHS.countr_one() >= ShAmtBits)
841     return true;
842 
843   const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
844   return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
845 }
846 
getBaseWithOffsetUsingSplitOR(SelectionDAG & DAG,SDValue Addr,SDValue & N0,SDValue & N1)847 static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
848                                           SDValue &N0, SDValue &N1) {
849   if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
850       Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
851     // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
852     // (i64 (bitcast (v2i32 (build_vector
853     //                        (or (extract_vector_elt V, 0), OFFSET),
854     //                        (extract_vector_elt V, 1)))))
855     SDValue Lo = Addr.getOperand(0).getOperand(0);
856     if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
857       SDValue BaseLo = Lo.getOperand(0);
858       SDValue BaseHi = Addr.getOperand(0).getOperand(1);
859       // Check that split base (Lo and Hi) are extracted from the same one.
860       if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
861           BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
862           BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
863           // Lo is statically extracted from index 0.
864           isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
865           BaseLo.getConstantOperandVal(1) == 0 &&
866           // Hi is statically extracted from index 0.
867           isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
868           BaseHi.getConstantOperandVal(1) == 1) {
869         N0 = BaseLo.getOperand(0).getOperand(0);
870         N1 = Lo.getOperand(1);
871         return true;
872       }
873     }
874   }
875   return false;
876 }
877 
isBaseWithConstantOffset64(SDValue Addr,SDValue & LHS,SDValue & RHS) const878 bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
879                                                     SDValue &RHS) const {
880   if (CurDAG->isBaseWithConstantOffset(Addr)) {
881     LHS = Addr.getOperand(0);
882     RHS = Addr.getOperand(1);
883     return true;
884   }
885 
886   if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
887     assert(LHS && RHS && isa<ConstantSDNode>(RHS));
888     return true;
889   }
890 
891   return false;
892 }
893 
getPassName() const894 StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
895   return "AMDGPU DAG->DAG Pattern Instruction Selection";
896 }
897 
/// Constructs the instruction selection pass by handing the base class a newly
/// created AMDGPUDAGToDAGISel built from \p TM and \p TM's optimization level.
AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}
901 
902 PreservedAnalyses
run(MachineFunction & MF,MachineFunctionAnalysisManager & MFAM)903 AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
904                             MachineFunctionAnalysisManager &MFAM) {
905 #ifdef EXPENSIVE_CHECKS
906   auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
907                   .getManager();
908   auto &F = MF.getFunction();
909   DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
910   LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
911   for (auto &L : LI.getLoopsInPreorder())
912     assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
913 #endif
914   return SelectionDAGISelPass::run(MF, MFAM);
915 }
916 
917 //===----------------------------------------------------------------------===//
918 // Complex Patterns
919 //===----------------------------------------------------------------------===//
920 
SelectADDRVTX_READ(SDValue Addr,SDValue & Base,SDValue & Offset)921 bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
922                                             SDValue &Offset) {
923   return false;
924 }
925 
SelectADDRIndirect(SDValue Addr,SDValue & Base,SDValue & Offset)926 bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
927                                             SDValue &Offset) {
928   ConstantSDNode *C;
929   SDLoc DL(Addr);
930 
931   if ((C = dyn_cast<ConstantSDNode>(Addr))) {
932     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
933     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
934   } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
935              (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
936     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
937     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
938   } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
939             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
940     Base = Addr.getOperand(0);
941     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
942   } else {
943     Base = Addr;
944     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
945   }
946 
947   return true;
948 }
949 
getMaterializedScalarImm32(int64_t Val,const SDLoc & DL) const950 SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
951                                                        const SDLoc &DL) const {
952   SDNode *Mov = CurDAG->getMachineNode(
953     AMDGPU::S_MOV_B32, DL, MVT::i32,
954     CurDAG->getTargetConstant(Val, DL, MVT::i32));
955   return SDValue(Mov, 0);
956 }
957 
958 // FIXME: Should only handle uaddo_carry/usubo_carry
SelectADD_SUB_I64(SDNode * N)959 void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
960   SDLoc DL(N);
961   SDValue LHS = N->getOperand(0);
962   SDValue RHS = N->getOperand(1);
963 
964   unsigned Opcode = N->getOpcode();
965   bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
966   bool ProduceCarry =
967       ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
968   bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
969 
970   SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
971   SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
972 
973   SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
974                                        DL, MVT::i32, LHS, Sub0);
975   SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
976                                        DL, MVT::i32, LHS, Sub1);
977 
978   SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
979                                        DL, MVT::i32, RHS, Sub0);
980   SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
981                                        DL, MVT::i32, RHS, Sub1);
982 
983   SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
984 
985   static const unsigned OpcMap[2][2][2] = {
986       {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
987        {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
988       {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
989        {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
990 
991   unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
992   unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
993 
994   SDNode *AddLo;
995   if (!ConsumeCarry) {
996     SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
997     AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
998   } else {
999     SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
1000     AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1001   }
1002   SDValue AddHiArgs[] = {
1003     SDValue(Hi0, 0),
1004     SDValue(Hi1, 0),
1005     SDValue(AddLo, 1)
1006   };
1007   SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
1008 
1009   SDValue RegSequenceArgs[] = {
1010     CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
1011     SDValue(AddLo,0),
1012     Sub0,
1013     SDValue(AddHi,0),
1014     Sub1,
1015   };
1016   SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1017                                                MVT::i64, RegSequenceArgs);
1018 
1019   if (ProduceCarry) {
1020     // Replace the carry-use
1021     ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
1022   }
1023 
1024   // Replace the remaining uses.
1025   ReplaceNode(N, RegSequence);
1026 }
1027 
SelectAddcSubb(SDNode * N)1028 void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1029   SDValue LHS = N->getOperand(0);
1030   SDValue RHS = N->getOperand(1);
1031   SDValue CI = N->getOperand(2);
1032 
1033   if (N->isDivergent()) {
1034     unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1035                                                       : AMDGPU::V_SUBB_U32_e64;
1036     CurDAG->SelectNodeTo(
1037         N, Opc, N->getVTList(),
1038         {LHS, RHS, CI,
1039          CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1040   } else {
1041     unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1042                                                       : AMDGPU::S_SUB_CO_PSEUDO;
1043     CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1044   }
1045 }
1046 
SelectUADDO_USUBO(SDNode * N)1047 void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1048   // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1049   // carry out despite the _i32 name. These were renamed in VI to _U32.
1050   // FIXME: We should probably rename the opcodes here.
1051   bool IsAdd = N->getOpcode() == ISD::UADDO;
1052   bool IsVALU = N->isDivergent();
1053 
1054   for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1055        ++UI)
1056     if (UI.getUse().getResNo() == 1) {
1057       if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
1058           (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
1059         IsVALU = true;
1060         break;
1061       }
1062     }
1063 
1064   if (IsVALU) {
1065     unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1066 
1067     CurDAG->SelectNodeTo(
1068         N, Opc, N->getVTList(),
1069         {N->getOperand(0), N->getOperand(1),
1070          CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1071   } else {
1072     unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
1073                                                 : AMDGPU::S_USUBO_PSEUDO;
1074 
1075     CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1076                          {N->getOperand(0), N->getOperand(1)});
1077   }
1078 }
1079 
SelectFMA_W_CHAIN(SDNode * N)1080 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1081   //  src0_modifiers, src0,  src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1082   SDValue Ops[10];
1083 
1084   SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1085   SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1086   SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1087   Ops[8] = N->getOperand(0);
1088   Ops[9] = N->getOperand(4);
1089 
1090   // If there are no source modifiers, prefer fmac over fma because it can use
1091   // the smaller VOP2 encoding.
1092   bool UseFMAC = Subtarget->hasDLInsts() &&
1093                  cast<ConstantSDNode>(Ops[0])->isZero() &&
1094                  cast<ConstantSDNode>(Ops[2])->isZero() &&
1095                  cast<ConstantSDNode>(Ops[4])->isZero();
1096   unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1097   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
1098 }
1099 
SelectFMUL_W_CHAIN(SDNode * N)1100 void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1101   //    src0_modifiers, src0,  src1_modifiers, src1, clamp, omod
1102   SDValue Ops[8];
1103 
1104   SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1105   SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1106   Ops[6] = N->getOperand(0);
1107   Ops[7] = N->getOperand(3);
1108 
1109   CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1110 }
1111 
1112 // We need to handle this here because tablegen doesn't support matching
1113 // instructions with multiple outputs.
SelectDIV_SCALE(SDNode * N)1114 void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1115   EVT VT = N->getValueType(0);
1116 
1117   assert(VT == MVT::f32 || VT == MVT::f64);
1118 
1119   unsigned Opc
1120     = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1121 
1122   // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1123   // omod
1124   SDValue Ops[8];
1125   SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1126   SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1127   SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1128   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1129 }
1130 
1131 // We need to handle this here because tablegen doesn't support matching
1132 // instructions with multiple outputs.
SelectMAD_64_32(SDNode * N)1133 void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1134   SDLoc SL(N);
1135   bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1136   unsigned Opc;
1137   if (Subtarget->hasMADIntraFwdBug())
1138     Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1139                  : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1140   else
1141     Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1142 
1143   SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1144   SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1145                     Clamp };
1146   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1147 }
1148 
1149 // We need to handle this here because tablegen doesn't support matching
1150 // instructions with multiple outputs.
SelectMUL_LOHI(SDNode * N)1151 void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1152   SDLoc SL(N);
1153   bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1154   unsigned Opc;
1155   if (Subtarget->hasMADIntraFwdBug())
1156     Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1157                  : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1158   else
1159     Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1160 
1161   SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1162   SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1163   SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1164   SDNode *Mad = CurDAG->getMachineNode(
1165       Opc, SL, CurDAG->getVTList(MVT::i64, MVT::i1), Ops);
1166   if (!SDValue(N, 0).use_empty()) {
1167     SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1168     SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1169                                         MVT::i32, SDValue(Mad, 0), Sub0);
1170     ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1171   }
1172   if (!SDValue(N, 1).use_empty()) {
1173     SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1174     SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1175                                         MVT::i32, SDValue(Mad, 0), Sub1);
1176     ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1177   }
1178   CurDAG->RemoveDeadNode(N);
1179 }
1180 
isDSOffsetLegal(SDValue Base,unsigned Offset) const1181 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1182   if (!isUInt<16>(Offset))
1183     return false;
1184 
1185   if (!Base || Subtarget->hasUsableDSOffset() ||
1186       Subtarget->unsafeDSOffsetFoldingEnabled())
1187     return true;
1188 
1189   // On Southern Islands instruction with a negative base value and an offset
1190   // don't seem to work.
1191   return CurDAG->SignBitIsZero(Base);
1192 }
1193 
SelectDS1Addr1Offset(SDValue Addr,SDValue & Base,SDValue & Offset) const1194 bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1195                                               SDValue &Offset) const {
1196   SDLoc DL(Addr);
1197   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1198     SDValue N0 = Addr.getOperand(0);
1199     SDValue N1 = Addr.getOperand(1);
1200     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1201     if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1202       // (add n0, c0)
1203       Base = N0;
1204       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1205       return true;
1206     }
1207   } else if (Addr.getOpcode() == ISD::SUB) {
1208     // sub C, x -> add (sub 0, x), C
1209     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1210       int64_t ByteOffset = C->getSExtValue();
1211       if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1212         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1213 
1214         // XXX - This is kind of hacky. Create a dummy sub node so we can check
1215         // the known bits in isDSOffsetLegal. We need to emit the selected node
1216         // here, so this is thrown away.
1217         SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1218                                       Zero, Addr.getOperand(1));
1219 
1220         if (isDSOffsetLegal(Sub, ByteOffset)) {
1221           SmallVector<SDValue, 3> Opnds;
1222           Opnds.push_back(Zero);
1223           Opnds.push_back(Addr.getOperand(1));
1224 
1225           // FIXME: Select to VOP3 version for with-carry.
1226           unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1227           if (Subtarget->hasAddNoCarry()) {
1228             SubOp = AMDGPU::V_SUB_U32_e64;
1229             Opnds.push_back(
1230                 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1231           }
1232 
1233           MachineSDNode *MachineSub =
1234               CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1235 
1236           Base = SDValue(MachineSub, 0);
1237           Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1238           return true;
1239         }
1240       }
1241     }
1242   } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1243     // If we have a constant address, prefer to put the constant into the
1244     // offset. This can save moves to load the constant address since multiple
1245     // operations can share the zero base address register, and enables merging
1246     // into read2 / write2 instructions.
1247 
1248     SDLoc DL(Addr);
1249 
1250     if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1251       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1252       MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1253                                  DL, MVT::i32, Zero);
1254       Base = SDValue(MovZero, 0);
1255       Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1256       return true;
1257     }
1258   }
1259 
1260   // default case
1261   Base = Addr;
1262   Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1263   return true;
1264 }
1265 
isDSOffset2Legal(SDValue Base,unsigned Offset0,unsigned Offset1,unsigned Size) const1266 bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1267                                           unsigned Offset1,
1268                                           unsigned Size) const {
1269   if (Offset0 % Size != 0 || Offset1 % Size != 0)
1270     return false;
1271   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1272     return false;
1273 
1274   if (!Base || Subtarget->hasUsableDSOffset() ||
1275       Subtarget->unsafeDSOffsetFoldingEnabled())
1276     return true;
1277 
1278   // On Southern Islands instruction with a negative base value and an offset
1279   // don't seem to work.
1280   return CurDAG->SignBitIsZero(Base);
1281 }
1282 
1283 // Return whether the operation has NoUnsignedWrap property.
isNoUnsignedWrap(SDValue Addr)1284 static bool isNoUnsignedWrap(SDValue Addr) {
1285   return (Addr.getOpcode() == ISD::ADD &&
1286           Addr->getFlags().hasNoUnsignedWrap()) ||
1287          Addr->getOpcode() == ISD::OR;
1288 }
1289 
1290 // Check that the base address of flat scratch load/store in the form of `base +
1291 // offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1292 // requirement). We always treat the first operand as the base address here.
isFlatScratchBaseLegal(SDValue Addr) const1293 bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1294   if (isNoUnsignedWrap(Addr))
1295     return true;
1296 
1297   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1298   // values.
1299   if (Subtarget->hasSignedScratchOffsets())
1300     return true;
1301 
1302   auto LHS = Addr.getOperand(0);
1303   auto RHS = Addr.getOperand(1);
1304 
1305   // If the immediate offset is negative and within certain range, the base
1306   // address cannot also be negative. If the base is also negative, the sum
1307   // would be either negative or much larger than the valid range of scratch
1308   // memory a thread can access.
1309   ConstantSDNode *ImmOp = nullptr;
1310   if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1311     if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1312       return true;
1313   }
1314 
1315   return CurDAG->SignBitIsZero(LHS);
1316 }
1317 
1318 // Check address value in SGPR/VGPR are legal for flat scratch in the form
1319 // of: SGPR + VGPR.
isFlatScratchBaseLegalSV(SDValue Addr) const1320 bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1321   if (isNoUnsignedWrap(Addr))
1322     return true;
1323 
1324   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1325   // values.
1326   if (Subtarget->hasSignedScratchOffsets())
1327     return true;
1328 
1329   auto LHS = Addr.getOperand(0);
1330   auto RHS = Addr.getOperand(1);
1331   return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1332 }
1333 
1334 // Check address value in SGPR/VGPR are legal for flat scratch in the form
1335 // of: SGPR + VGPR + Imm.
isFlatScratchBaseLegalSVImm(SDValue Addr) const1336 bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1337   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1338   // values.
1339   if (AMDGPU::isGFX12Plus(*Subtarget))
1340     return true;
1341 
1342   auto Base = Addr.getOperand(0);
1343   auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1344   // If the immediate offset is negative and within certain range, the base
1345   // address cannot also be negative. If the base is also negative, the sum
1346   // would be either negative or much larger than the valid range of scratch
1347   // memory a thread can access.
1348   if (isNoUnsignedWrap(Base) &&
1349       (isNoUnsignedWrap(Addr) ||
1350        (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1351     return true;
1352 
1353   auto LHS = Base.getOperand(0);
1354   auto RHS = Base.getOperand(1);
1355   return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1356 }
1357 
1358 // TODO: If offset is too big, put low 16-bit into offset.
SelectDS64Bit4ByteAligned(SDValue Addr,SDValue & Base,SDValue & Offset0,SDValue & Offset1) const1359 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1360                                                    SDValue &Offset0,
1361                                                    SDValue &Offset1) const {
1362   return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1363 }
1364 
SelectDS128Bit8ByteAligned(SDValue Addr,SDValue & Base,SDValue & Offset0,SDValue & Offset1) const1365 bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1366                                                     SDValue &Offset0,
1367                                                     SDValue &Offset1) const {
1368   return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1369 }
1370 
SelectDSReadWrite2(SDValue Addr,SDValue & Base,SDValue & Offset0,SDValue & Offset1,unsigned Size) const1371 bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1372                                             SDValue &Offset0, SDValue &Offset1,
1373                                             unsigned Size) const {
1374   SDLoc DL(Addr);
1375 
1376   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1377     SDValue N0 = Addr.getOperand(0);
1378     SDValue N1 = Addr.getOperand(1);
1379     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1380     unsigned OffsetValue0 = C1->getZExtValue();
1381     unsigned OffsetValue1 = OffsetValue0 + Size;
1382 
1383     // (add n0, c0)
1384     if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1385       Base = N0;
1386       Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1387       Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1388       return true;
1389     }
1390   } else if (Addr.getOpcode() == ISD::SUB) {
1391     // sub C, x -> add (sub 0, x), C
1392     if (const ConstantSDNode *C =
1393             dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1394       unsigned OffsetValue0 = C->getZExtValue();
1395       unsigned OffsetValue1 = OffsetValue0 + Size;
1396 
1397       if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1398         SDLoc DL(Addr);
1399         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1400 
1401         // XXX - This is kind of hacky. Create a dummy sub node so we can check
1402         // the known bits in isDSOffsetLegal. We need to emit the selected node
1403         // here, so this is thrown away.
1404         SDValue Sub =
1405             CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1406 
1407         if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1408           SmallVector<SDValue, 3> Opnds;
1409           Opnds.push_back(Zero);
1410           Opnds.push_back(Addr.getOperand(1));
1411           unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1412           if (Subtarget->hasAddNoCarry()) {
1413             SubOp = AMDGPU::V_SUB_U32_e64;
1414             Opnds.push_back(
1415                 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1416           }
1417 
1418           MachineSDNode *MachineSub = CurDAG->getMachineNode(
1419               SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1420 
1421           Base = SDValue(MachineSub, 0);
1422           Offset0 =
1423               CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1424           Offset1 =
1425               CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1426           return true;
1427         }
1428       }
1429     }
1430   } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1431     unsigned OffsetValue0 = CAddr->getZExtValue();
1432     unsigned OffsetValue1 = OffsetValue0 + Size;
1433 
1434     if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1435       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1436       MachineSDNode *MovZero =
1437           CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1438       Base = SDValue(MovZero, 0);
1439       Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1440       Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1441       return true;
1442     }
1443   }
1444 
1445   // default case
1446 
1447   Base = Addr;
1448   Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1449   Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1450   return true;
1451 }
1452 
SelectMUBUF(SDValue Addr,SDValue & Ptr,SDValue & VAddr,SDValue & SOffset,SDValue & Offset,SDValue & Offen,SDValue & Idxen,SDValue & Addr64) const1453 bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1454                                      SDValue &SOffset, SDValue &Offset,
1455                                      SDValue &Offen, SDValue &Idxen,
1456                                      SDValue &Addr64) const {
1457   // Subtarget prefers to use flat instruction
1458   // FIXME: This should be a pattern predicate and not reach here
1459   if (Subtarget->useFlatForGlobal())
1460     return false;
1461 
1462   SDLoc DL(Addr);
1463 
1464   Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1465   Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1466   Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1467   SOffset = Subtarget->hasRestrictedSOffset()
1468                 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1469                 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1470 
1471   ConstantSDNode *C1 = nullptr;
1472   SDValue N0 = Addr;
1473   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1474     C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1475     if (isUInt<32>(C1->getZExtValue()))
1476       N0 = Addr.getOperand(0);
1477     else
1478       C1 = nullptr;
1479   }
1480 
1481   if (N0.getOpcode() == ISD::ADD) {
1482     // (add N2, N3) -> addr64, or
1483     // (add (add N2, N3), C1) -> addr64
1484     SDValue N2 = N0.getOperand(0);
1485     SDValue N3 = N0.getOperand(1);
1486     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1487 
1488     if (N2->isDivergent()) {
1489       if (N3->isDivergent()) {
1490         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1491         // addr64, and construct the resource from a 0 address.
1492         Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1493         VAddr = N0;
1494       } else {
1495         // N2 is divergent, N3 is not.
1496         Ptr = N3;
1497         VAddr = N2;
1498       }
1499     } else {
1500       // N2 is not divergent.
1501       Ptr = N2;
1502       VAddr = N3;
1503     }
1504     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1505   } else if (N0->isDivergent()) {
1506     // N0 is divergent. Use it as the addr64, and construct the resource from a
1507     // 0 address.
1508     Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1509     VAddr = N0;
1510     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1511   } else {
1512     // N0 -> offset, or
1513     // (N0 + C1) -> offset
1514     VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1515     Ptr = N0;
1516   }
1517 
1518   if (!C1) {
1519     // No offset.
1520     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1521     return true;
1522   }
1523 
1524   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1525   if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1526     // Legal offset for instruction.
1527     Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1528     return true;
1529   }
1530 
1531   // Illegal offset, store it in soffset.
1532   Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1533   SOffset =
1534       SDValue(CurDAG->getMachineNode(
1535                   AMDGPU::S_MOV_B32, DL, MVT::i32,
1536                   CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1537               0);
1538   return true;
1539 }
1540 
SelectMUBUFAddr64(SDValue Addr,SDValue & SRsrc,SDValue & VAddr,SDValue & SOffset,SDValue & Offset) const1541 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1542                                            SDValue &VAddr, SDValue &SOffset,
1543                                            SDValue &Offset) const {
1544   SDValue Ptr, Offen, Idxen, Addr64;
1545 
1546   // addr64 bit was removed for volcanic islands.
1547   // FIXME: This should be a pattern predicate and not reach here
1548   if (!Subtarget->hasAddr64())
1549     return false;
1550 
1551   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1552     return false;
1553 
1554   ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1555   if (C->getSExtValue()) {
1556     SDLoc DL(Addr);
1557 
1558     const SITargetLowering& Lowering =
1559       *static_cast<const SITargetLowering*>(getTargetLowering());
1560 
1561     SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1562     return true;
1563   }
1564 
1565   return false;
1566 }
1567 
foldFrameIndex(SDValue N) const1568 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1569   SDLoc DL(N);
1570 
1571   auto *FI = dyn_cast<FrameIndexSDNode>(N);
1572   SDValue TFI =
1573       FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1574 
1575   // We rebase the base address into an absolute stack address and hence
1576   // use constant 0 for soffset. This value must be retained until
1577   // frame elimination and eliminateFrameIndex will choose the appropriate
1578   // frame register if need be.
1579   return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1580 }
1581 
SelectMUBUFScratchOffen(SDNode * Parent,SDValue Addr,SDValue & Rsrc,SDValue & VAddr,SDValue & SOffset,SDValue & ImmOffset) const1582 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1583                                                  SDValue Addr, SDValue &Rsrc,
1584                                                  SDValue &VAddr, SDValue &SOffset,
1585                                                  SDValue &ImmOffset) const {
1586 
1587   SDLoc DL(Addr);
1588   MachineFunction &MF = CurDAG->getMachineFunction();
1589   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1590 
1591   Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1592 
1593   if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1594     int64_t Imm = CAddr->getSExtValue();
1595     const int64_t NullPtr =
1596         AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
1597     // Don't fold null pointer.
1598     if (Imm != NullPtr) {
1599       const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1600       SDValue HighBits =
1601           CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1602       MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1603         AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1604       VAddr = SDValue(MovHighBits, 0);
1605 
1606       SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1607       ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1608       return true;
1609     }
1610   }
1611 
1612   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1613     // (add n0, c1)
1614 
1615     SDValue N0 = Addr.getOperand(0);
1616     uint64_t C1 = Addr.getConstantOperandVal(1);
1617 
1618     // Offsets in vaddr must be positive if range checking is enabled.
1619     //
1620     // The total computation of vaddr + soffset + offset must not overflow.  If
1621     // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1622     // overflowing.
1623     //
1624     // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1625     // always perform a range check. If a negative vaddr base index was used,
1626     // this would fail the range check. The overall address computation would
1627     // compute a valid address, but this doesn't happen due to the range
1628     // check. For out-of-bounds MUBUF loads, a 0 is returned.
1629     //
1630     // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1631     // MUBUF vaddr, but not on older subtargets which can only do this if the
1632     // sign bit is known 0.
1633     const SIInstrInfo *TII = Subtarget->getInstrInfo();
1634     if (TII->isLegalMUBUFImmOffset(C1) &&
1635         (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1636          CurDAG->SignBitIsZero(N0))) {
1637       std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1638       ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1639       return true;
1640     }
1641   }
1642 
1643   // (node)
1644   std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1645   ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1646   return true;
1647 }
1648 
IsCopyFromSGPR(const SIRegisterInfo & TRI,SDValue Val)1649 static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1650   if (Val.getOpcode() != ISD::CopyFromReg)
1651     return false;
1652   auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1653   if (!Reg.isPhysical())
1654     return false;
1655   const auto *RC = TRI.getPhysRegBaseClass(Reg);
1656   return RC && TRI.isSGPRClass(RC);
1657 }
1658 
SelectMUBUFScratchOffset(SDNode * Parent,SDValue Addr,SDValue & SRsrc,SDValue & SOffset,SDValue & Offset) const1659 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1660                                                   SDValue Addr,
1661                                                   SDValue &SRsrc,
1662                                                   SDValue &SOffset,
1663                                                   SDValue &Offset) const {
1664   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1665   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1666   MachineFunction &MF = CurDAG->getMachineFunction();
1667   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1668   SDLoc DL(Addr);
1669 
1670   // CopyFromReg <sgpr>
1671   if (IsCopyFromSGPR(*TRI, Addr)) {
1672     SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1673     SOffset = Addr;
1674     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1675     return true;
1676   }
1677 
1678   ConstantSDNode *CAddr;
1679   if (Addr.getOpcode() == ISD::ADD) {
1680     // Add (CopyFromReg <sgpr>) <constant>
1681     CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1682     if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1683       return false;
1684     if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1685       return false;
1686 
1687     SOffset = Addr.getOperand(0);
1688   } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1689              TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1690     // <constant>
1691     SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1692   } else {
1693     return false;
1694   }
1695 
1696   SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1697 
1698   Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1699   return true;
1700 }
1701 
SelectMUBUFOffset(SDValue Addr,SDValue & SRsrc,SDValue & SOffset,SDValue & Offset) const1702 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1703                                            SDValue &SOffset, SDValue &Offset
1704                                            ) const {
1705   SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1706   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1707 
1708   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1709     return false;
1710 
1711   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1712       !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1713       !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1714     uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1715                     maskTrailingOnes<uint64_t>(32); // Size
1716     SDLoc DL(Addr);
1717 
1718     const SITargetLowering& Lowering =
1719       *static_cast<const SITargetLowering*>(getTargetLowering());
1720 
1721     SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1722     return true;
1723   }
1724   return false;
1725 }
1726 
SelectBUFSOffset(SDValue ByteOffsetNode,SDValue & SOffset) const1727 bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1728                                           SDValue &SOffset) const {
1729   if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1730     SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1731     return true;
1732   }
1733 
1734   SOffset = ByteOffsetNode;
1735   return true;
1736 }
1737 
1738 // Find a load or store from corresponding pattern root.
1739 // Roots may be build_vector, bitconvert or their combinations.
findMemSDNode(SDNode * N)1740 static MemSDNode* findMemSDNode(SDNode *N) {
1741   N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
1742   if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1743     return MN;
1744   assert(isa<BuildVectorSDNode>(N));
1745   for (SDValue V : N->op_values())
1746     if (MemSDNode *MN =
1747           dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1748       return MN;
1749   llvm_unreachable("cannot find MemSDNode in the pattern!");
1750 }
1751 
SelectFlatOffsetImpl(SDNode * N,SDValue Addr,SDValue & VAddr,SDValue & Offset,uint64_t FlatVariant) const1752 bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1753                                               SDValue &VAddr, SDValue &Offset,
1754                                               uint64_t FlatVariant) const {
1755   int64_t OffsetVal = 0;
1756 
1757   unsigned AS = findMemSDNode(N)->getAddressSpace();
1758 
1759   bool CanHaveFlatSegmentOffsetBug =
1760       Subtarget->hasFlatSegmentOffsetBug() &&
1761       FlatVariant == SIInstrFlags::FLAT &&
1762       (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
1763 
1764   if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1765     SDValue N0, N1;
1766     if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1767         (FlatVariant != SIInstrFlags::FlatScratch ||
1768          isFlatScratchBaseLegal(Addr))) {
1769       int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1770 
1771       const SIInstrInfo *TII = Subtarget->getInstrInfo();
1772       if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1773         Addr = N0;
1774         OffsetVal = COffsetVal;
1775       } else {
1776         // If the offset doesn't fit, put the low bits into the offset field and
1777         // add the rest.
1778         //
1779         // For a FLAT instruction the hardware decides whether to access
1780         // global/scratch/shared memory based on the high bits of vaddr,
1781         // ignoring the offset field, so we have to ensure that when we add
1782         // remainder to vaddr it still points into the same underlying object.
1783         // The easiest way to do that is to make sure that we split the offset
1784         // into two pieces that are both >= 0 or both <= 0.
1785 
1786         SDLoc DL(N);
1787         uint64_t RemainderOffset;
1788 
1789         std::tie(OffsetVal, RemainderOffset) =
1790             TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1791 
1792         SDValue AddOffsetLo =
1793             getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1794         SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1795 
1796         if (Addr.getValueType().getSizeInBits() == 32) {
1797           SmallVector<SDValue, 3> Opnds;
1798           Opnds.push_back(N0);
1799           Opnds.push_back(AddOffsetLo);
1800           unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1801           if (Subtarget->hasAddNoCarry()) {
1802             AddOp = AMDGPU::V_ADD_U32_e64;
1803             Opnds.push_back(Clamp);
1804           }
1805           Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1806         } else {
1807           // TODO: Should this try to use a scalar add pseudo if the base address
1808           // is uniform and saddr is usable?
1809           SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1810           SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1811 
1812           SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1813                                                 DL, MVT::i32, N0, Sub0);
1814           SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1815                                                 DL, MVT::i32, N0, Sub1);
1816 
1817           SDValue AddOffsetHi =
1818               getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1819 
1820           SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1821 
1822           SDNode *Add =
1823               CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1824                                      {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1825 
1826           SDNode *Addc = CurDAG->getMachineNode(
1827               AMDGPU::V_ADDC_U32_e64, DL, VTs,
1828               {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1829 
1830           SDValue RegSequenceArgs[] = {
1831               CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1832               SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1833 
1834           Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1835                                                 MVT::i64, RegSequenceArgs),
1836                          0);
1837         }
1838       }
1839     }
1840   }
1841 
1842   VAddr = Addr;
1843   Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1844   return true;
1845 }
1846 
SelectFlatOffset(SDNode * N,SDValue Addr,SDValue & VAddr,SDValue & Offset) const1847 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1848                                           SDValue &VAddr,
1849                                           SDValue &Offset) const {
1850   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1851 }
1852 
SelectGlobalOffset(SDNode * N,SDValue Addr,SDValue & VAddr,SDValue & Offset) const1853 bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1854                                             SDValue &VAddr,
1855                                             SDValue &Offset) const {
1856   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1857 }
1858 
SelectScratchOffset(SDNode * N,SDValue Addr,SDValue & VAddr,SDValue & Offset) const1859 bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1860                                              SDValue &VAddr,
1861                                              SDValue &Offset) const {
1862   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1863                               SIInstrFlags::FlatScratch);
1864 }
1865 
1866 // If this matches zero_extend i32:x, return x
matchZExtFromI32(SDValue Op)1867 static SDValue matchZExtFromI32(SDValue Op) {
1868   if (Op.getOpcode() != ISD::ZERO_EXTEND)
1869     return SDValue();
1870 
1871   SDValue ExtSrc = Op.getOperand(0);
1872   return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1873 }
1874 
1875 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
SelectGlobalSAddr(SDNode * N,SDValue Addr,SDValue & SAddr,SDValue & VOffset,SDValue & Offset) const1876 bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1877                                            SDValue Addr,
1878                                            SDValue &SAddr,
1879                                            SDValue &VOffset,
1880                                            SDValue &Offset) const {
1881   int64_t ImmOffset = 0;
1882 
1883   // Match the immediate offset first, which canonically is moved as low as
1884   // possible.
1885 
1886   SDValue LHS, RHS;
1887   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1888     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1889     const SIInstrInfo *TII = Subtarget->getInstrInfo();
1890 
1891     if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1892                                SIInstrFlags::FlatGlobal)) {
1893       Addr = LHS;
1894       ImmOffset = COffsetVal;
1895     } else if (!LHS->isDivergent()) {
1896       if (COffsetVal > 0) {
1897         SDLoc SL(N);
1898         // saddr + large_offset -> saddr +
1899         //                         (voffset = large_offset & ~MaxOffset) +
1900         //                         (large_offset & MaxOffset);
1901         int64_t SplitImmOffset, RemainderOffset;
1902         std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1903             COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1904 
1905         if (isUInt<32>(RemainderOffset)) {
1906           SDNode *VMov = CurDAG->getMachineNode(
1907               AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1908               CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1909           VOffset = SDValue(VMov, 0);
1910           SAddr = LHS;
1911           Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1912           return true;
1913         }
1914       }
1915 
1916       // We are adding a 64 bit SGPR and a constant. If constant bus limit
1917       // is 1 we would need to perform 1 or 2 extra moves for each half of
1918       // the constant and it is better to do a scalar add and then issue a
1919       // single VALU instruction to materialize zero. Otherwise it is less
1920       // instructions to perform VALU adds with immediates or inline literals.
1921       unsigned NumLiterals =
1922           !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
1923           !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
1924       if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1925         return false;
1926     }
1927   }
1928 
1929   // Match the variable offset.
1930   if (Addr.getOpcode() == ISD::ADD) {
1931     LHS = Addr.getOperand(0);
1932     RHS = Addr.getOperand(1);
1933 
1934     if (!LHS->isDivergent()) {
1935       // add (i64 sgpr), (zero_extend (i32 vgpr))
1936       if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1937         SAddr = LHS;
1938         VOffset = ZextRHS;
1939       }
1940     }
1941 
1942     if (!SAddr && !RHS->isDivergent()) {
1943       // add (zero_extend (i32 vgpr)), (i64 sgpr)
1944       if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1945         SAddr = RHS;
1946         VOffset = ZextLHS;
1947       }
1948     }
1949 
1950     if (SAddr) {
1951       Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1952       return true;
1953     }
1954   }
1955 
1956   if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1957       isa<ConstantSDNode>(Addr))
1958     return false;
1959 
1960   // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1961   // moves required to copy a 64-bit SGPR to VGPR.
1962   SAddr = Addr;
1963   SDNode *VMov =
1964       CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1965                              CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1966   VOffset = SDValue(VMov, 0);
1967   Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1968   return true;
1969 }
1970 
SelectSAddrFI(SelectionDAG * CurDAG,SDValue SAddr)1971 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1972   if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1973     SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1974   } else if (SAddr.getOpcode() == ISD::ADD &&
1975              isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1976     // Materialize this into a scalar move for scalar address to avoid
1977     // readfirstlane.
1978     auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1979     SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1980                                               FI->getValueType(0));
1981     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1982                                            MVT::i32, TFI, SAddr.getOperand(1)),
1983                     0);
1984   }
1985 
1986   return SAddr;
1987 }
1988 
1989 // Match (32-bit SGPR base) + sext(imm offset)
SelectScratchSAddr(SDNode * Parent,SDValue Addr,SDValue & SAddr,SDValue & Offset) const1990 bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1991                                             SDValue &SAddr,
1992                                             SDValue &Offset) const {
1993   if (Addr->isDivergent())
1994     return false;
1995 
1996   SDLoc DL(Addr);
1997 
1998   int64_t COffsetVal = 0;
1999 
2000   if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
2001     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
2002     SAddr = Addr.getOperand(0);
2003   } else {
2004     SAddr = Addr;
2005   }
2006 
2007   SAddr = SelectSAddrFI(CurDAG, SAddr);
2008 
2009   const SIInstrInfo *TII = Subtarget->getInstrInfo();
2010 
2011   if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2012                               SIInstrFlags::FlatScratch)) {
2013     int64_t SplitImmOffset, RemainderOffset;
2014     std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2015         COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
2016 
2017     COffsetVal = SplitImmOffset;
2018 
2019     SDValue AddOffset =
2020         SAddr.getOpcode() == ISD::TargetFrameIndex
2021             ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
2022             : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
2023     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
2024                                            SAddr, AddOffset),
2025                     0);
2026   }
2027 
2028   Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
2029 
2030   return true;
2031 }
2032 
2033 // Check whether the flat scratch SVS swizzle bug affects this access.
checkFlatScratchSVSSwizzleBug(SDValue VAddr,SDValue SAddr,uint64_t ImmOffset) const2034 bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2035     SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2036   if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2037     return false;
2038 
2039   // The bug affects the swizzling of SVS accesses if there is any carry out
2040   // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2041   // voffset to (soffset + inst_offset).
2042   KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2043   KnownBits SKnown =
2044       KnownBits::add(CurDAG->computeKnownBits(SAddr),
2045                      KnownBits::makeConstant(APInt(32, ImmOffset,
2046                                                    /*isSigned=*/true)));
2047   uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2048   uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2049   return (VMax & 3) + (SMax & 3) >= 4;
2050 }
2051 
SelectScratchSVAddr(SDNode * N,SDValue Addr,SDValue & VAddr,SDValue & SAddr,SDValue & Offset) const2052 bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2053                                              SDValue &VAddr, SDValue &SAddr,
2054                                              SDValue &Offset) const  {
2055   int64_t ImmOffset = 0;
2056 
2057   SDValue LHS, RHS;
2058   SDValue OrigAddr = Addr;
2059   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2060     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2061     const SIInstrInfo *TII = Subtarget->getInstrInfo();
2062 
2063     if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2064                                SIInstrFlags::FlatScratch)) {
2065       Addr = LHS;
2066       ImmOffset = COffsetVal;
2067     } else if (!LHS->isDivergent() && COffsetVal > 0) {
2068       SDLoc SL(N);
2069       // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2070       //                         (large_offset & MaxOffset);
2071       int64_t SplitImmOffset, RemainderOffset;
2072       std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2073           COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
2074 
2075       if (isUInt<32>(RemainderOffset)) {
2076         SDNode *VMov = CurDAG->getMachineNode(
2077           AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2078           CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2079         VAddr = SDValue(VMov, 0);
2080         SAddr = LHS;
2081         if (!isFlatScratchBaseLegal(Addr))
2082           return false;
2083         if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
2084           return false;
2085         Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2086         return true;
2087       }
2088     }
2089   }
2090 
2091   if (Addr.getOpcode() != ISD::ADD)
2092     return false;
2093 
2094   LHS = Addr.getOperand(0);
2095   RHS = Addr.getOperand(1);
2096 
2097   if (!LHS->isDivergent() && RHS->isDivergent()) {
2098     SAddr = LHS;
2099     VAddr = RHS;
2100   } else if (!RHS->isDivergent() && LHS->isDivergent()) {
2101     SAddr = RHS;
2102     VAddr = LHS;
2103   } else {
2104     return false;
2105   }
2106 
2107   if (OrigAddr != Addr) {
2108     if (!isFlatScratchBaseLegalSVImm(OrigAddr))
2109       return false;
2110   } else {
2111     if (!isFlatScratchBaseLegalSV(OrigAddr))
2112       return false;
2113   }
2114 
2115   if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2116     return false;
2117   SAddr = SelectSAddrFI(CurDAG, SAddr);
2118   Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2119   return true;
2120 }
2121 
2122 // For unbuffered smem loads, it is illegal for the Immediate Offset to be
2123 // negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2124 // Handle the case where the Immediate Offset + SOffset is negative.
isSOffsetLegalWithImmOffset(SDValue * SOffset,bool Imm32Only,bool IsBuffer,int64_t ImmOffset) const2125 bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2126                                                      bool Imm32Only,
2127                                                      bool IsBuffer,
2128                                                      int64_t ImmOffset) const {
2129   if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2130       AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2131     KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2132     if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2133       return false;
2134   }
2135 
2136   return true;
2137 }
2138 
2139 // Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2140 // not null) offset. If Imm32Only is true, match only 32-bit immediate
2141 // offsets available on CI.
SelectSMRDOffset(SDValue ByteOffsetNode,SDValue * SOffset,SDValue * Offset,bool Imm32Only,bool IsBuffer,bool HasSOffset,int64_t ImmOffset) const2142 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
2143                                           SDValue *SOffset, SDValue *Offset,
2144                                           bool Imm32Only, bool IsBuffer,
2145                                           bool HasSOffset,
2146                                           int64_t ImmOffset) const {
2147   assert((!SOffset || !Offset) &&
2148          "Cannot match both soffset and offset at the same time!");
2149 
2150   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2151   if (!C) {
2152     if (!SOffset)
2153       return false;
2154 
2155     if (ByteOffsetNode.getValueType().isScalarInteger() &&
2156         ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2157       *SOffset = ByteOffsetNode;
2158       return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2159                                          ImmOffset);
2160     }
2161     if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2162       if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2163         *SOffset = ByteOffsetNode.getOperand(0);
2164         return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2165                                            ImmOffset);
2166       }
2167     }
2168     return false;
2169   }
2170 
2171   SDLoc SL(ByteOffsetNode);
2172 
2173   // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2174   // offset for S_BUFFER instructions is unsigned.
2175   int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2176   std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2177       *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2178   if (EncodedOffset && Offset && !Imm32Only) {
2179     *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2180     return true;
2181   }
2182 
2183   // SGPR and literal offsets are unsigned.
2184   if (ByteOffset < 0)
2185     return false;
2186 
2187   EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2188   if (EncodedOffset && Offset && Imm32Only) {
2189     *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2190     return true;
2191   }
2192 
2193   if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2194     return false;
2195 
2196   if (SOffset) {
2197     SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2198     *SOffset = SDValue(
2199         CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2200     return true;
2201   }
2202 
2203   return false;
2204 }
2205 
Expand32BitAddress(SDValue Addr) const2206 SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2207   if (Addr.getValueType() != MVT::i32)
2208     return Addr;
2209 
2210   // Zero-extend a 32-bit address.
2211   SDLoc SL(Addr);
2212 
2213   const MachineFunction &MF = CurDAG->getMachineFunction();
2214   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2215   unsigned AddrHiVal = Info->get32BitAddressHighBits();
2216   SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2217 
2218   const SDValue Ops[] = {
2219     CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2220     Addr,
2221     CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2222     SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2223             0),
2224     CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2225   };
2226 
2227   return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2228                                         Ops), 0);
2229 }
2230 
2231 // Match a base and an immediate (if Offset is not null) or an SGPR (if
2232 // SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2233 // true, match only 32-bit immediate offsets available on CI.
SelectSMRDBaseOffset(SDValue Addr,SDValue & SBase,SDValue * SOffset,SDValue * Offset,bool Imm32Only,bool IsBuffer,bool HasSOffset,int64_t ImmOffset) const2234 bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2235                                               SDValue *SOffset, SDValue *Offset,
2236                                               bool Imm32Only, bool IsBuffer,
2237                                               bool HasSOffset,
2238                                               int64_t ImmOffset) const {
2239   if (SOffset && Offset) {
2240     assert(!Imm32Only && !IsBuffer);
2241     SDValue B;
2242 
2243     if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
2244       return false;
2245 
2246     int64_t ImmOff = 0;
2247     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2248       ImmOff = C->getSExtValue();
2249 
2250     return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
2251                                 ImmOff);
2252   }
2253 
2254   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2255   // wraparound, because s_load instructions perform the addition in 64 bits.
2256   if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2257       !Addr->getFlags().hasNoUnsignedWrap())
2258     return false;
2259 
2260   SDValue N0, N1;
2261   // Extract the base and offset if possible.
2262   if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2263     N0 = Addr.getOperand(0);
2264     N1 = Addr.getOperand(1);
2265   } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2266     assert(N0 && N1 && isa<ConstantSDNode>(N1));
2267   }
2268   if (!N0 || !N1)
2269     return false;
2270 
2271   if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2272                        ImmOffset)) {
2273     SBase = N0;
2274     return true;
2275   }
2276   if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2277                        ImmOffset)) {
2278     SBase = N1;
2279     return true;
2280   }
2281   return false;
2282 }
2283 
SelectSMRD(SDValue Addr,SDValue & SBase,SDValue * SOffset,SDValue * Offset,bool Imm32Only) const2284 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2285                                     SDValue *SOffset, SDValue *Offset,
2286                                     bool Imm32Only) const {
2287   if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2288     SBase = Expand32BitAddress(SBase);
2289     return true;
2290   }
2291 
2292   if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2293     SBase = Expand32BitAddress(Addr);
2294     *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2295     return true;
2296   }
2297 
2298   return false;
2299 }
2300 
SelectSMRDImm(SDValue Addr,SDValue & SBase,SDValue & Offset) const2301 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2302                                        SDValue &Offset) const {
2303   return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2304 }
2305 
SelectSMRDImm32(SDValue Addr,SDValue & SBase,SDValue & Offset) const2306 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2307                                          SDValue &Offset) const {
2308   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2309   return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2310                     /* Imm32Only */ true);
2311 }
2312 
SelectSMRDSgpr(SDValue Addr,SDValue & SBase,SDValue & SOffset) const2313 bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2314                                         SDValue &SOffset) const {
2315   return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2316 }
2317 
SelectSMRDSgprImm(SDValue Addr,SDValue & SBase,SDValue & SOffset,SDValue & Offset) const2318 bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2319                                            SDValue &SOffset,
2320                                            SDValue &Offset) const {
2321   return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2322 }
2323 
SelectSMRDBufferImm(SDValue N,SDValue & Offset) const2324 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2325   return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2326                           /* Imm32Only */ false, /* IsBuffer */ true);
2327 }
2328 
SelectSMRDBufferImm32(SDValue N,SDValue & Offset) const2329 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2330                                                SDValue &Offset) const {
2331   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2332   return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2333                           /* Imm32Only */ true, /* IsBuffer */ true);
2334 }
2335 
SelectSMRDBufferSgprImm(SDValue N,SDValue & SOffset,SDValue & Offset) const2336 bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2337                                                  SDValue &Offset) const {
2338   // Match the (soffset + offset) pair as a 32-bit register base and
2339   // an immediate offset.
2340   return N.getValueType() == MVT::i32 &&
2341          SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2342                               &Offset, /* Imm32Only */ false,
2343                               /* IsBuffer */ true);
2344 }
2345 
SelectMOVRELOffset(SDValue Index,SDValue & Base,SDValue & Offset) const2346 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2347                                             SDValue &Base,
2348                                             SDValue &Offset) const {
2349   SDLoc DL(Index);
2350 
2351   if (CurDAG->isBaseWithConstantOffset(Index)) {
2352     SDValue N0 = Index.getOperand(0);
2353     SDValue N1 = Index.getOperand(1);
2354     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2355 
2356     // (add n0, c0)
2357     // Don't peel off the offset (c0) if doing so could possibly lead
2358     // the base (n0) to be negative.
2359     // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2360     if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2361         (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2362       Base = N0;
2363       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2364       return true;
2365     }
2366   }
2367 
2368   if (isa<ConstantSDNode>(Index))
2369     return false;
2370 
2371   Base = Index;
2372   Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2373   return true;
2374 }
2375 
getBFE32(bool IsSigned,const SDLoc & DL,SDValue Val,uint32_t Offset,uint32_t Width)2376 SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2377                                      SDValue Val, uint32_t Offset,
2378                                      uint32_t Width) {
2379   if (Val->isDivergent()) {
2380     unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2381     SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2382     SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2383 
2384     return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2385   }
2386   unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2387   // Transformation function, pack the offset and width of a BFE into
2388   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2389   // source, bits [5:0] contain the offset and bits [22:16] the width.
2390   uint32_t PackedVal = Offset | (Width << 16);
2391   SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2392 
2393   return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2394 }
2395 
SelectS_BFEFromShifts(SDNode * N)2396 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2397   // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2398   // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2399   // Predicate: 0 < b <= c < 32
2400 
2401   const SDValue &Shl = N->getOperand(0);
2402   ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2403   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2404 
2405   if (B && C) {
2406     uint32_t BVal = B->getZExtValue();
2407     uint32_t CVal = C->getZExtValue();
2408 
2409     if (0 < BVal && BVal <= CVal && CVal < 32) {
2410       bool Signed = N->getOpcode() == ISD::SRA;
2411       ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2412                   32 - CVal));
2413       return;
2414     }
2415   }
2416   SelectCode(N);
2417 }
2418 
SelectS_BFE(SDNode * N)2419 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2420   switch (N->getOpcode()) {
2421   case ISD::AND:
2422     if (N->getOperand(0).getOpcode() == ISD::SRL) {
2423       // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2424       // Predicate: isMask(mask)
2425       const SDValue &Srl = N->getOperand(0);
2426       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2427       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2428 
2429       if (Shift && Mask) {
2430         uint32_t ShiftVal = Shift->getZExtValue();
2431         uint32_t MaskVal = Mask->getZExtValue();
2432 
2433         if (isMask_32(MaskVal)) {
2434           uint32_t WidthVal = llvm::popcount(MaskVal);
2435           ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2436                                   WidthVal));
2437           return;
2438         }
2439       }
2440     }
2441     break;
2442   case ISD::SRL:
2443     if (N->getOperand(0).getOpcode() == ISD::AND) {
2444       // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2445       // Predicate: isMask(mask >> b)
2446       const SDValue &And = N->getOperand(0);
2447       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2448       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2449 
2450       if (Shift && Mask) {
2451         uint32_t ShiftVal = Shift->getZExtValue();
2452         uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2453 
2454         if (isMask_32(MaskVal)) {
2455           uint32_t WidthVal = llvm::popcount(MaskVal);
2456           ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2457                       WidthVal));
2458           return;
2459         }
2460       }
2461     } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2462       SelectS_BFEFromShifts(N);
2463       return;
2464     }
2465     break;
2466   case ISD::SRA:
2467     if (N->getOperand(0).getOpcode() == ISD::SHL) {
2468       SelectS_BFEFromShifts(N);
2469       return;
2470     }
2471     break;
2472 
2473   case ISD::SIGN_EXTEND_INREG: {
2474     // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2475     SDValue Src = N->getOperand(0);
2476     if (Src.getOpcode() != ISD::SRL)
2477       break;
2478 
2479     const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2480     if (!Amt)
2481       break;
2482 
2483     unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2484     ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2485                             Amt->getZExtValue(), Width));
2486     return;
2487   }
2488   }
2489 
2490   SelectCode(N);
2491 }
2492 
isCBranchSCC(const SDNode * N) const2493 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2494   assert(N->getOpcode() == ISD::BRCOND);
2495   if (!N->hasOneUse())
2496     return false;
2497 
2498   SDValue Cond = N->getOperand(1);
2499   if (Cond.getOpcode() == ISD::CopyToReg)
2500     Cond = Cond.getOperand(2);
2501 
2502   if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2503     return false;
2504 
2505   MVT VT = Cond.getOperand(0).getSimpleValueType();
2506   if (VT == MVT::i32)
2507     return true;
2508 
2509   if (VT == MVT::i64) {
2510     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2511     return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2512            Subtarget->hasScalarCompareEq64();
2513   }
2514 
2515   if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2516     return true;
2517 
2518   return false;
2519 }
2520 
combineBallotPattern(SDValue VCMP,bool & Negate)2521 static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2522   assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2523   // Special case for amdgcn.ballot:
2524   // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2525   // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2526   // =>
2527   // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2528   // This is possible because divergent ISD::SETCC is selected as V_CMP and
2529   // Cond becomes a i(WaveSize) full mask value.
2530   // Note that ballot doesn't use SETEQ condition but its easy to support it
2531   // here for completeness, so in this case Negate is set true on return.
2532   auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2533   if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2534       isNullConstant(VCMP.getOperand(1))) {
2535 
2536     auto Cond = VCMP.getOperand(0);
2537     if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2538       Cond = Cond.getOperand(0);
2539 
2540     if (isBoolSGPR(Cond)) {
2541       Negate = VCMP_CC == ISD::SETEQ;
2542       return Cond;
2543     }
2544   }
2545   return SDValue();
2546 }
2547 
SelectBRCOND(SDNode * N)2548 void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2549   SDValue Cond = N->getOperand(1);
2550 
2551   if (Cond.isUndef()) {
2552     CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2553                          N->getOperand(2), N->getOperand(0));
2554     return;
2555   }
2556 
2557   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2558 
2559   bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2560   bool AndExec = !UseSCCBr;
2561   bool Negate = false;
2562 
2563   if (Cond.getOpcode() == ISD::SETCC &&
2564       Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2565     SDValue VCMP = Cond->getOperand(0);
2566     auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2567     if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2568         isNullConstant(Cond->getOperand(1)) &&
2569         // We may encounter ballot.i64 in wave32 mode on -O0.
2570         VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2571       // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2572       // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2573       // BRCOND i1 %C, %BB
2574       // =>
2575       // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2576       // VCC = COPY i(WaveSize) %VCMP
2577       // S_CBRANCH_VCCNZ/VCCZ %BB
2578       Negate = CC == ISD::SETEQ;
2579       bool NegatedBallot = false;
2580       if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2581         Cond = BallotCond;
2582         UseSCCBr = !BallotCond->isDivergent();
2583         Negate = Negate ^ NegatedBallot;
2584       } else {
2585         // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2586         // selected as V_CMP, but this may change for uniform condition.
2587         Cond = VCMP;
2588         UseSCCBr = false;
2589       }
2590     }
2591     // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
2592     // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
2593     // used.
2594     AndExec = false;
2595   }
2596 
2597   unsigned BrOp =
2598       UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2599                : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2600   Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2601   SDLoc SL(N);
2602 
2603   if (AndExec) {
2604     // This is the case that we are selecting to S_CBRANCH_VCCNZ.  We have not
2605     // analyzed what generates the vcc value, so we do not know whether vcc
2606     // bits for disabled lanes are 0.  Thus we need to mask out bits for
2607     // disabled lanes.
2608     //
2609     // For the case that we select S_CBRANCH_SCC1 and it gets
2610     // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2611     // SIInstrInfo::moveToVALU which inserts the S_AND).
2612     //
2613     // We could add an analysis of what generates the vcc value here and omit
2614     // the S_AND when is unnecessary. But it would be better to add a separate
2615     // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2616     // catches both cases.
2617     Cond = SDValue(
2618         CurDAG->getMachineNode(
2619             Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
2620             MVT::i1,
2621             CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
2622                                                       : AMDGPU::EXEC,
2623                                 MVT::i1),
2624             Cond),
2625         0);
2626   }
2627 
2628   SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2629   CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2630                        N->getOperand(2), // Basic Block
2631                        VCC.getValue(0));
2632 }
2633 
SelectFP_EXTEND(SDNode * N)2634 void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2635   if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2636       !N->isDivergent()) {
2637     SDValue Src = N->getOperand(0);
2638     if (Src.getValueType() == MVT::f16) {
2639       if (isExtractHiElt(Src, Src)) {
2640         CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2641                              {Src});
2642         return;
2643       }
2644     }
2645   }
2646 
2647   SelectCode(N);
2648 }
2649 
SelectDSAppendConsume(SDNode * N,unsigned IntrID)2650 void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2651   // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2652   // be copied to an SGPR with readfirstlane.
2653   unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2654     AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2655 
2656   SDValue Chain = N->getOperand(0);
2657   SDValue Ptr = N->getOperand(2);
2658   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2659   MachineMemOperand *MMO = M->getMemOperand();
2660   bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2661 
2662   SDValue Offset;
2663   if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2664     SDValue PtrBase = Ptr.getOperand(0);
2665     SDValue PtrOffset = Ptr.getOperand(1);
2666 
2667     const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2668     if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2669       N = glueCopyToM0(N, PtrBase);
2670       Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2671     }
2672   }
2673 
2674   if (!Offset) {
2675     N = glueCopyToM0(N, Ptr);
2676     Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2677   }
2678 
2679   SDValue Ops[] = {
2680     Offset,
2681     CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2682     Chain,
2683     N->getOperand(N->getNumOperands() - 1) // New glue
2684   };
2685 
2686   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2687   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2688 }
2689 
2690 // We need to handle this here because tablegen doesn't support matching
2691 // instructions with multiple outputs.
SelectDSBvhStackIntrinsic(SDNode * N,unsigned IntrID)2692 void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
2693   unsigned Opc;
2694   switch (IntrID) {
2695   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2696   case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2697     Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2698     break;
2699   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2700     Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2701     break;
2702   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2703     Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2704     break;
2705   }
2706   SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2707                    N->getOperand(5), N->getOperand(0)};
2708 
2709   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2710   MachineMemOperand *MMO = M->getMemOperand();
2711   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2712   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2713 }
2714 
gwsIntrinToOpcode(unsigned IntrID)2715 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2716   switch (IntrID) {
2717   case Intrinsic::amdgcn_ds_gws_init:
2718     return AMDGPU::DS_GWS_INIT;
2719   case Intrinsic::amdgcn_ds_gws_barrier:
2720     return AMDGPU::DS_GWS_BARRIER;
2721   case Intrinsic::amdgcn_ds_gws_sema_v:
2722     return AMDGPU::DS_GWS_SEMA_V;
2723   case Intrinsic::amdgcn_ds_gws_sema_br:
2724     return AMDGPU::DS_GWS_SEMA_BR;
2725   case Intrinsic::amdgcn_ds_gws_sema_p:
2726     return AMDGPU::DS_GWS_SEMA_P;
2727   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2728     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2729   default:
2730     llvm_unreachable("not a gws intrinsic");
2731   }
2732 }
2733 
SelectDS_GWS(SDNode * N,unsigned IntrID)2734 void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2735   if (!Subtarget->hasGWS() ||
2736       (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2737        !Subtarget->hasGWSSemaReleaseAll())) {
2738     // Let this error.
2739     SelectCode(N);
2740     return;
2741   }
2742 
2743   // Chain, intrinsic ID, vsrc, offset
2744   const bool HasVSrc = N->getNumOperands() == 4;
2745   assert(HasVSrc || N->getNumOperands() == 3);
2746 
2747   SDLoc SL(N);
2748   SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2749   int ImmOffset = 0;
2750   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2751   MachineMemOperand *MMO = M->getMemOperand();
2752 
2753   // Don't worry if the offset ends up in a VGPR. Only one lane will have
2754   // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2755 
2756   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2757   // offset field) % 64. Some versions of the programming guide omit the m0
2758   // part, or claim it's from offset 0.
2759   if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2760     // If we have a constant offset, try to use the 0 in m0 as the base.
2761     // TODO: Look into changing the default m0 initialization value. If the
2762     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2763     // the immediate offset.
2764     glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2765     ImmOffset = ConstOffset->getZExtValue();
2766   } else {
2767     if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2768       ImmOffset = BaseOffset.getConstantOperandVal(1);
2769       BaseOffset = BaseOffset.getOperand(0);
2770     }
2771 
2772     // Prefer to do the shift in an SGPR since it should be possible to use m0
2773     // as the result directly. If it's already an SGPR, it will be eliminated
2774     // later.
2775     SDNode *SGPROffset
2776       = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2777                                BaseOffset);
2778     // Shift to offset in m0
2779     SDNode *M0Base
2780       = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2781                                SDValue(SGPROffset, 0),
2782                                CurDAG->getTargetConstant(16, SL, MVT::i32));
2783     glueCopyToM0(N, SDValue(M0Base, 0));
2784   }
2785 
2786   SDValue Chain = N->getOperand(0);
2787   SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2788 
2789   const unsigned Opc = gwsIntrinToOpcode(IntrID);
2790   SmallVector<SDValue, 5> Ops;
2791   if (HasVSrc)
2792     Ops.push_back(N->getOperand(2));
2793   Ops.push_back(OffsetField);
2794   Ops.push_back(Chain);
2795 
2796   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2797   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2798 }
2799 
SelectInterpP1F16(SDNode * N)2800 void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2801   if (Subtarget->getLDSBankCount() != 16) {
2802     // This is a single instruction with a pattern.
2803     SelectCode(N);
2804     return;
2805   }
2806 
2807   SDLoc DL(N);
2808 
2809   // This requires 2 instructions. It is possible to write a pattern to support
2810   // this, but the generated isel emitter doesn't correctly deal with multiple
2811   // output instructions using the same physical register input. The copy to m0
2812   // is incorrectly placed before the second instruction.
2813   //
2814   // TODO: Match source modifiers.
2815   //
2816   // def : Pat <
2817   //   (int_amdgcn_interp_p1_f16
2818   //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
2819   //                             (i32 timm:$attrchan), (i32 timm:$attr),
2820   //                             (i1 timm:$high), M0),
2821   //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2822   //       timm:$attrchan, 0,
2823   //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2824   //   let Predicates = [has16BankLDS];
2825   // }
2826 
2827   // 16 bank LDS
2828   SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2829                                       N->getOperand(5), SDValue());
2830 
2831   SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2832 
2833   SDNode *InterpMov =
2834     CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2835         CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2836         N->getOperand(3),  // Attr
2837         N->getOperand(2),  // Attrchan
2838         ToM0.getValue(1) // In glue
2839   });
2840 
2841   SDNode *InterpP1LV =
2842     CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2843         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2844         N->getOperand(1), // Src0
2845         N->getOperand(3), // Attr
2846         N->getOperand(2), // Attrchan
2847         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2848         SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2849         N->getOperand(4), // high
2850         CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2851         CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2852         SDValue(InterpMov, 1)
2853   });
2854 
2855   CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2856 }
2857 
SelectINTRINSIC_W_CHAIN(SDNode * N)2858 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2859   unsigned IntrID = N->getConstantOperandVal(1);
2860   switch (IntrID) {
2861   case Intrinsic::amdgcn_ds_append:
2862   case Intrinsic::amdgcn_ds_consume: {
2863     if (N->getValueType(0) != MVT::i32)
2864       break;
2865     SelectDSAppendConsume(N, IntrID);
2866     return;
2867   }
2868   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2869   case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2870   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2871   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2872     SelectDSBvhStackIntrinsic(N, IntrID);
2873     return;
2874   case Intrinsic::amdgcn_init_whole_wave:
2875     CurDAG->getMachineFunction()
2876         .getInfo<SIMachineFunctionInfo>()
2877         ->setInitWholeWave();
2878     break;
2879   }
2880 
2881   SelectCode(N);
2882 }
2883 
SelectINTRINSIC_WO_CHAIN(SDNode * N)2884 void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2885   unsigned IntrID = N->getConstantOperandVal(0);
2886   unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2887   SDNode *ConvGlueNode = N->getGluedNode();
2888   if (ConvGlueNode) {
2889     // FIXME: Possibly iterate over multiple glue nodes?
2890     assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2891     ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
2892     ConvGlueNode =
2893         CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2894                                MVT::Glue, SDValue(ConvGlueNode, 0));
2895   } else {
2896     ConvGlueNode = nullptr;
2897   }
2898   switch (IntrID) {
2899   case Intrinsic::amdgcn_wqm:
2900     Opcode = AMDGPU::WQM;
2901     break;
2902   case Intrinsic::amdgcn_softwqm:
2903     Opcode = AMDGPU::SOFT_WQM;
2904     break;
2905   case Intrinsic::amdgcn_wwm:
2906   case Intrinsic::amdgcn_strict_wwm:
2907     Opcode = AMDGPU::STRICT_WWM;
2908     break;
2909   case Intrinsic::amdgcn_strict_wqm:
2910     Opcode = AMDGPU::STRICT_WQM;
2911     break;
2912   case Intrinsic::amdgcn_interp_p1_f16:
2913     SelectInterpP1F16(N);
2914     return;
2915   case Intrinsic::amdgcn_permlane16_swap:
2916   case Intrinsic::amdgcn_permlane32_swap: {
2917     if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
2918          !Subtarget->hasPermlane16Swap()) ||
2919         (IntrID == Intrinsic::amdgcn_permlane32_swap &&
2920          !Subtarget->hasPermlane32Swap())) {
2921       SelectCode(N); // Hit the default error
2922       return;
2923     }
2924 
2925     Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
2926                  ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
2927                  : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
2928 
2929     SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
2930     if (ConvGlueNode)
2931       NewOps.push_back(SDValue(ConvGlueNode, 0));
2932 
2933     bool FI = N->getConstantOperandVal(3);
2934     NewOps[2] = CurDAG->getTargetConstant(
2935         FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);
2936 
2937     CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
2938     return;
2939   }
2940   default:
2941     SelectCode(N);
2942     break;
2943   }
2944 
2945   if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2946     SDValue Src = N->getOperand(1);
2947     CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2948   }
2949 
2950   if (ConvGlueNode) {
2951     SmallVector<SDValue, 4> NewOps(N->ops());
2952     NewOps.push_back(SDValue(ConvGlueNode, 0));
2953     CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
2954   }
2955 }
2956 
SelectINTRINSIC_VOID(SDNode * N)2957 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2958   unsigned IntrID = N->getConstantOperandVal(1);
2959   switch (IntrID) {
2960   case Intrinsic::amdgcn_ds_gws_init:
2961   case Intrinsic::amdgcn_ds_gws_barrier:
2962   case Intrinsic::amdgcn_ds_gws_sema_v:
2963   case Intrinsic::amdgcn_ds_gws_sema_br:
2964   case Intrinsic::amdgcn_ds_gws_sema_p:
2965   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2966     SelectDS_GWS(N, IntrID);
2967     return;
2968   default:
2969     break;
2970   }
2971 
2972   SelectCode(N);
2973 }
2974 
SelectWAVE_ADDRESS(SDNode * N)2975 void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2976   SDValue Log2WaveSize =
2977     CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2978   CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2979                        {N->getOperand(0), Log2WaveSize});
2980 }
2981 
SelectSTACKRESTORE(SDNode * N)2982 void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2983   SDValue SrcVal = N->getOperand(1);
2984   if (SrcVal.getValueType() != MVT::i32) {
2985     SelectCode(N); // Emit default error
2986     return;
2987   }
2988 
2989   SDValue CopyVal;
2990   Register SP = TLI->getStackPointerRegisterToSaveRestore();
2991   SDLoc SL(N);
2992 
2993   if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2994     CopyVal = SrcVal.getOperand(0);
2995   } else {
2996     SDValue Log2WaveSize = CurDAG->getTargetConstant(
2997         Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2998 
2999     if (N->isDivergent()) {
3000       SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
3001                                               MVT::i32, SrcVal),
3002                        0);
3003     }
3004 
3005     CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3006                                              {SrcVal, Log2WaveSize}),
3007                       0);
3008   }
3009 
3010   SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
3011   CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
3012 }
3013 
SelectVOP3ModsImpl(SDValue In,SDValue & Src,unsigned & Mods,bool IsCanonicalizing,bool AllowAbs) const3014 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3015                                             unsigned &Mods,
3016                                             bool IsCanonicalizing,
3017                                             bool AllowAbs) const {
3018   Mods = SISrcMods::NONE;
3019   Src = In;
3020 
3021   if (Src.getOpcode() == ISD::FNEG) {
3022     Mods |= SISrcMods::NEG;
3023     Src = Src.getOperand(0);
3024   } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3025     // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3026     // denormal mode, but we're implicitly canonicalizing in a source operand.
3027     auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3028     if (LHS && LHS->isZero()) {
3029       Mods |= SISrcMods::NEG;
3030       Src = Src.getOperand(1);
3031     }
3032   }
3033 
3034   if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3035     Mods |= SISrcMods::ABS;
3036     Src = Src.getOperand(0);
3037   }
3038 
3039   return true;
3040 }
3041 
SelectVOP3Mods(SDValue In,SDValue & Src,SDValue & SrcMods) const3042 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3043                                         SDValue &SrcMods) const {
3044   unsigned Mods;
3045   if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3046                          /*AllowAbs=*/true)) {
3047     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3048     return true;
3049   }
3050 
3051   return false;
3052 }
3053 
SelectVOP3ModsNonCanonicalizing(SDValue In,SDValue & Src,SDValue & SrcMods) const3054 bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3055     SDValue In, SDValue &Src, SDValue &SrcMods) const {
3056   unsigned Mods;
3057   if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3058                          /*AllowAbs=*/true)) {
3059     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3060     return true;
3061   }
3062 
3063   return false;
3064 }
3065 
SelectVOP3BMods(SDValue In,SDValue & Src,SDValue & SrcMods) const3066 bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3067                                          SDValue &SrcMods) const {
3068   unsigned Mods;
3069   if (SelectVOP3ModsImpl(In, Src, Mods,
3070                          /*IsCanonicalizing=*/true,
3071                          /*AllowAbs=*/false)) {
3072     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3073     return true;
3074   }
3075 
3076   return false;
3077 }
3078 
SelectVOP3NoMods(SDValue In,SDValue & Src) const3079 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3080   if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3081     return false;
3082 
3083   Src = In;
3084   return true;
3085 }
3086 
SelectVINTERPModsImpl(SDValue In,SDValue & Src,SDValue & SrcMods,bool OpSel) const3087 bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3088                                                SDValue &SrcMods,
3089                                                bool OpSel) const {
3090   unsigned Mods;
3091   if (SelectVOP3ModsImpl(In, Src, Mods,
3092                          /*IsCanonicalizing=*/true,
3093                          /*AllowAbs=*/false)) {
3094     if (OpSel)
3095       Mods |= SISrcMods::OP_SEL_0;
3096     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3097     return true;
3098   }
3099 
3100   return false;
3101 }
3102 
SelectVINTERPMods(SDValue In,SDValue & Src,SDValue & SrcMods) const3103 bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3104                                            SDValue &SrcMods) const {
3105   return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3106 }
3107 
SelectVINTERPModsHi(SDValue In,SDValue & Src,SDValue & SrcMods) const3108 bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3109                                              SDValue &SrcMods) const {
3110   return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3111 }
3112 
SelectVOP3Mods0(SDValue In,SDValue & Src,SDValue & SrcMods,SDValue & Clamp,SDValue & Omod) const3113 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3114                                          SDValue &SrcMods, SDValue &Clamp,
3115                                          SDValue &Omod) const {
3116   SDLoc DL(In);
3117   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3118   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3119 
3120   return SelectVOP3Mods(In, Src, SrcMods);
3121 }
3122 
SelectVOP3BMods0(SDValue In,SDValue & Src,SDValue & SrcMods,SDValue & Clamp,SDValue & Omod) const3123 bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3124                                           SDValue &SrcMods, SDValue &Clamp,
3125                                           SDValue &Omod) const {
3126   SDLoc DL(In);
3127   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3128   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3129 
3130   return SelectVOP3BMods(In, Src, SrcMods);
3131 }
3132 
SelectVOP3OMods(SDValue In,SDValue & Src,SDValue & Clamp,SDValue & Omod) const3133 bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3134                                          SDValue &Clamp, SDValue &Omod) const {
3135   Src = In;
3136 
3137   SDLoc DL(In);
3138   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3139   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3140 
3141   return true;
3142 }
3143 
SelectVOP3PMods(SDValue In,SDValue & Src,SDValue & SrcMods,bool IsDOT) const3144 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3145                                          SDValue &SrcMods, bool IsDOT) const {
3146   unsigned Mods = SISrcMods::NONE;
3147   Src = In;
3148 
3149   // TODO: Handle G_FSUB 0 as fneg
3150   if (Src.getOpcode() == ISD::FNEG) {
3151     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3152     Src = Src.getOperand(0);
3153   }
3154 
3155   if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3156       (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3157     unsigned VecMods = Mods;
3158 
3159     SDValue Lo = stripBitcast(Src.getOperand(0));
3160     SDValue Hi = stripBitcast(Src.getOperand(1));
3161 
3162     if (Lo.getOpcode() == ISD::FNEG) {
3163       Lo = stripBitcast(Lo.getOperand(0));
3164       Mods ^= SISrcMods::NEG;
3165     }
3166 
3167     if (Hi.getOpcode() == ISD::FNEG) {
3168       Hi = stripBitcast(Hi.getOperand(0));
3169       Mods ^= SISrcMods::NEG_HI;
3170     }
3171 
3172     if (isExtractHiElt(Lo, Lo))
3173       Mods |= SISrcMods::OP_SEL_0;
3174 
3175     if (isExtractHiElt(Hi, Hi))
3176       Mods |= SISrcMods::OP_SEL_1;
3177 
3178     unsigned VecSize = Src.getValueSizeInBits();
3179     Lo = stripExtractLoElt(Lo);
3180     Hi = stripExtractLoElt(Hi);
3181 
3182     if (Lo.getValueSizeInBits() > VecSize) {
3183       Lo = CurDAG->getTargetExtractSubreg(
3184         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3185         MVT::getIntegerVT(VecSize), Lo);
3186     }
3187 
3188     if (Hi.getValueSizeInBits() > VecSize) {
3189       Hi = CurDAG->getTargetExtractSubreg(
3190         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3191         MVT::getIntegerVT(VecSize), Hi);
3192     }
3193 
3194     assert(Lo.getValueSizeInBits() <= VecSize &&
3195            Hi.getValueSizeInBits() <= VecSize);
3196 
3197     if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3198       // Really a scalar input. Just select from the low half of the register to
3199       // avoid packing.
3200 
3201       if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
3202         Src = Lo;
3203       } else {
3204         assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3205 
3206         SDLoc SL(In);
3207         SDValue Undef = SDValue(
3208           CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3209                                  Lo.getValueType()), 0);
3210         auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3211                                     : AMDGPU::SReg_64RegClassID;
3212         const SDValue Ops[] = {
3213           CurDAG->getTargetConstant(RC, SL, MVT::i32),
3214           Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3215           Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3216 
3217         Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3218                                              Src.getValueType(), Ops), 0);
3219       }
3220       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3221       return true;
3222     }
3223 
3224     if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3225       uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3226                       .bitcastToAPInt().getZExtValue();
3227       if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3228         Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3229         SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3230         return true;
3231       }
3232     }
3233 
3234     Mods = VecMods;
3235   } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3236              Src.getNumOperands() == 2) {
3237 
3238     // TODO: We should repeat the build_vector source check above for the
3239     // vector_shuffle for negates and casts of individual elements.
3240 
3241     auto *SVN = cast<ShuffleVectorSDNode>(Src);
3242     ArrayRef<int> Mask = SVN->getMask();
3243 
3244     if (Mask[0] < 2 && Mask[1] < 2) {
3245       // src1 should be undef.
3246       SDValue ShuffleSrc = SVN->getOperand(0);
3247 
3248       if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3249         ShuffleSrc = ShuffleSrc.getOperand(0);
3250         Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3251       }
3252 
3253       if (Mask[0] == 1)
3254         Mods |= SISrcMods::OP_SEL_0;
3255       if (Mask[1] == 1)
3256         Mods |= SISrcMods::OP_SEL_1;
3257 
3258       Src = ShuffleSrc;
3259       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3260       return true;
3261     }
3262   }
3263 
3264   // Packed instructions do not have abs modifiers.
3265   Mods |= SISrcMods::OP_SEL_1;
3266 
3267   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3268   return true;
3269 }
3270 
SelectVOP3PModsDOT(SDValue In,SDValue & Src,SDValue & SrcMods) const3271 bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3272                                             SDValue &SrcMods) const {
3273   return SelectVOP3PMods(In, Src, SrcMods, true);
3274 }
3275 
SelectVOP3PModsNeg(SDValue In,SDValue & Src) const3276 bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3277   const ConstantSDNode *C = cast<ConstantSDNode>(In);
3278   // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3279   // 1 promotes packed values to signed, 0 treats them as unsigned.
3280   assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3281 
3282   unsigned Mods = SISrcMods::OP_SEL_1;
3283   unsigned SrcSign = C->getZExtValue();
3284   if (SrcSign == 1)
3285     Mods ^= SISrcMods::NEG;
3286 
3287   Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3288   return true;
3289 }
3290 
SelectWMMAOpSelVOP3PMods(SDValue In,SDValue & Src) const3291 bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3292                                                   SDValue &Src) const {
3293   const ConstantSDNode *C = cast<ConstantSDNode>(In);
3294   assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3295 
3296   unsigned Mods = SISrcMods::OP_SEL_1;
3297   unsigned SrcVal = C->getZExtValue();
3298   if (SrcVal == 1)
3299     Mods |= SISrcMods::OP_SEL_0;
3300 
3301   Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3302   return true;
3303 }
3304 
buildRegSequence32(SmallVectorImpl<SDValue> & Elts,llvm::SelectionDAG * CurDAG,const SDLoc & DL)3305 static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3306                                          llvm::SelectionDAG *CurDAG,
3307                                          const SDLoc &DL) {
3308   unsigned DstRegClass;
3309   EVT DstTy;
3310   switch (Elts.size()) {
3311   case 8:
3312     DstRegClass = AMDGPU::VReg_256RegClassID;
3313     DstTy = MVT::v8i32;
3314     break;
3315   case 4:
3316     DstRegClass = AMDGPU::VReg_128RegClassID;
3317     DstTy = MVT::v4i32;
3318     break;
3319   case 2:
3320     DstRegClass = AMDGPU::VReg_64RegClassID;
3321     DstTy = MVT::v2i32;
3322     break;
3323   default:
3324     llvm_unreachable("unhandled Reg sequence size");
3325   }
3326 
3327   SmallVector<SDValue, 17> Ops;
3328   Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3329   for (unsigned i = 0; i < Elts.size(); ++i) {
3330     Ops.push_back(Elts[i]);
3331     Ops.push_back(CurDAG->getTargetConstant(
3332         SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3333   }
3334   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3335 }
3336 
buildRegSequence16(SmallVectorImpl<SDValue> & Elts,llvm::SelectionDAG * CurDAG,const SDLoc & DL)3337 static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3338                                          llvm::SelectionDAG *CurDAG,
3339                                          const SDLoc &DL) {
3340   SmallVector<SDValue, 8> PackedElts;
3341   assert("unhandled Reg sequence size" &&
3342          (Elts.size() == 8 || Elts.size() == 16));
3343 
3344   // Pack 16-bit elements in pairs into 32-bit register. If both elements are
3345   // unpacked from 32-bit source use it, otherwise pack them using v_perm.
3346   for (unsigned i = 0; i < Elts.size(); i += 2) {
3347     SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3348     SDValue HiSrc;
3349     if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3350       PackedElts.push_back(HiSrc);
3351     } else {
3352       SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3353       MachineSDNode *Packed =
3354           CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3355                                  {Elts[i + 1], Elts[i], PackLoLo});
3356       PackedElts.push_back(SDValue(Packed, 0));
3357     }
3358   }
3359 
3360   return buildRegSequence32(PackedElts, CurDAG, DL);
3361 }
3362 
buildRegSequence(SmallVectorImpl<SDValue> & Elts,llvm::SelectionDAG * CurDAG,const SDLoc & DL,unsigned ElementSize)3363 static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3364                                        llvm::SelectionDAG *CurDAG,
3365                                        const SDLoc &DL, unsigned ElementSize) {
3366   if (ElementSize == 16)
3367     return buildRegSequence16(Elts, CurDAG, DL);
3368   if (ElementSize == 32)
3369     return buildRegSequence32(Elts, CurDAG, DL);
3370   llvm_unreachable("Unhandled element size");
3371 }
3372 
selectWMMAModsNegAbs(unsigned ModOpcode,unsigned & Mods,SmallVectorImpl<SDValue> & Elts,SDValue & Src,llvm::SelectionDAG * CurDAG,const SDLoc & DL,unsigned ElementSize)3373 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3374                                  SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3375                                  llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3376                                  unsigned ElementSize) {
3377   if (ModOpcode == ISD::FNEG) {
3378     Mods |= SISrcMods::NEG;
3379     // Check if all elements also have abs modifier
3380     SmallVector<SDValue, 8> NegAbsElts;
3381     for (auto El : Elts) {
3382       if (El.getOpcode() != ISD::FABS)
3383         break;
3384       NegAbsElts.push_back(El->getOperand(0));
3385     }
3386     if (Elts.size() != NegAbsElts.size()) {
3387       // Neg
3388       Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3389     } else {
3390       // Neg and Abs
3391       Mods |= SISrcMods::NEG_HI;
3392       Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3393     }
3394   } else {
3395     assert(ModOpcode == ISD::FABS);
3396     // Abs
3397     Mods |= SISrcMods::NEG_HI;
3398     Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3399   }
3400 }
3401 
3402 // Check all f16 elements for modifiers while looking through b32 and v2b16
3403 // build vector, stop if element does not satisfy ModifierCheck.
3404 static void
checkWMMAElementsModifiersF16(BuildVectorSDNode * BV,std::function<bool (SDValue)> ModifierCheck)3405 checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3406                               std::function<bool(SDValue)> ModifierCheck) {
3407   for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3408     if (auto *F16Pair =
3409             dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3410       for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3411         SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3412         if (!ModifierCheck(ElF16))
3413           break;
3414       }
3415     }
3416   }
3417 }
3418 
SelectWMMAModsF16Neg(SDValue In,SDValue & Src,SDValue & SrcMods) const3419 bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3420                                               SDValue &SrcMods) const {
3421   Src = In;
3422   unsigned Mods = SISrcMods::OP_SEL_1;
3423 
3424   // mods are on f16 elements
3425   if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3426     SmallVector<SDValue, 8> EltsF16;
3427 
3428     checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3429       if (Element.getOpcode() != ISD::FNEG)
3430         return false;
3431       EltsF16.push_back(Element.getOperand(0));
3432       return true;
3433     });
3434 
3435     // All elements have neg modifier
3436     if (BV->getNumOperands() * 2 == EltsF16.size()) {
3437       Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3438       Mods |= SISrcMods::NEG;
3439       Mods |= SISrcMods::NEG_HI;
3440     }
3441   }
3442 
3443   // mods are on v2f16 elements
3444   if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3445     SmallVector<SDValue, 8> EltsV2F16;
3446     for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3447       SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3448       // Based on first element decide which mod we match, neg or abs
3449       if (ElV2f16.getOpcode() != ISD::FNEG)
3450         break;
3451       EltsV2F16.push_back(ElV2f16.getOperand(0));
3452     }
3453 
3454     // All pairs of elements have neg modifier
3455     if (BV->getNumOperands() == EltsV2F16.size()) {
3456       Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3457       Mods |= SISrcMods::NEG;
3458       Mods |= SISrcMods::NEG_HI;
3459     }
3460   }
3461 
3462   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3463   return true;
3464 }
3465 
SelectWMMAModsF16NegAbs(SDValue In,SDValue & Src,SDValue & SrcMods) const3466 bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3467                                                  SDValue &SrcMods) const {
3468   Src = In;
3469   unsigned Mods = SISrcMods::OP_SEL_1;
3470   unsigned ModOpcode;
3471 
3472   // mods are on f16 elements
3473   if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3474     SmallVector<SDValue, 8> EltsF16;
3475     checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3476       // Based on first element decide which mod we match, neg or abs
3477       if (EltsF16.empty())
3478         ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3479       if (ElF16.getOpcode() != ModOpcode)
3480         return false;
3481       EltsF16.push_back(ElF16.getOperand(0));
3482       return true;
3483     });
3484 
3485     // All elements have ModOpcode modifier
3486     if (BV->getNumOperands() * 2 == EltsF16.size())
3487       selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3488                            16);
3489   }
3490 
3491   // mods are on v2f16 elements
3492   if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3493     SmallVector<SDValue, 8> EltsV2F16;
3494 
3495     for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3496       SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3497       // Based on first element decide which mod we match, neg or abs
3498       if (EltsV2F16.empty())
3499         ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3500       if (ElV2f16->getOpcode() != ModOpcode)
3501         break;
3502       EltsV2F16.push_back(ElV2f16->getOperand(0));
3503     }
3504 
3505     // All elements have ModOpcode modifier
3506     if (BV->getNumOperands() == EltsV2F16.size())
3507       selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3508                            32);
3509   }
3510 
3511   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3512   return true;
3513 }
3514 
SelectWMMAModsF32NegAbs(SDValue In,SDValue & Src,SDValue & SrcMods) const3515 bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3516                                                  SDValue &SrcMods) const {
3517   Src = In;
3518   unsigned Mods = SISrcMods::OP_SEL_1;
3519   SmallVector<SDValue, 8> EltsF32;
3520 
3521   if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3522     assert(BV->getNumOperands() > 0);
3523     // Based on first element decide which mod we match, neg or abs
3524     SDValue ElF32 = stripBitcast(BV->getOperand(0));
3525     unsigned ModOpcode =
3526         (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3527     for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3528       SDValue ElF32 = stripBitcast(BV->getOperand(i));
3529       if (ElF32.getOpcode() != ModOpcode)
3530         break;
3531       EltsF32.push_back(ElF32.getOperand(0));
3532     }
3533 
3534     // All elements had ModOpcode modifier
3535     if (BV->getNumOperands() == EltsF32.size())
3536       selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3537                            32);
3538   }
3539 
3540   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3541   return true;
3542 }
3543 
SelectWMMAVISrc(SDValue In,SDValue & Src) const3544 bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3545   if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3546     BitVector UndefElements;
3547     if (SDValue Splat = BV->getSplatValue(&UndefElements))
3548       if (isInlineImmediate(Splat.getNode())) {
3549         if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3550           unsigned Imm = C->getAPIntValue().getSExtValue();
3551           Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3552           return true;
3553         }
3554         if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3555           unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3556           Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3557           return true;
3558         }
3559         llvm_unreachable("unhandled Constant node");
3560       }
3561   }
3562 
3563   // 16 bit splat
3564   SDValue SplatSrc32 = stripBitcast(In);
3565   if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3566     if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3567       SDValue SplatSrc16 = stripBitcast(Splat32);
3568       if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3569         if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3570           const SIInstrInfo *TII = Subtarget->getInstrInfo();
3571           std::optional<APInt> RawValue;
3572           if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3573             RawValue = C->getValueAPF().bitcastToAPInt();
3574           else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3575             RawValue = C->getAPIntValue();
3576 
3577           if (RawValue.has_value()) {
3578             EVT VT = In.getValueType().getScalarType();
3579             if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3580               APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3581                                    ? APFloatBase::IEEEhalf()
3582                                    : APFloatBase::BFloat(),
3583                                RawValue.value());
3584               if (TII->isInlineConstant(FloatVal)) {
3585                 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3586                                                 MVT::i16);
3587                 return true;
3588               }
3589             } else if (VT.getSimpleVT() == MVT::i16) {
3590               if (TII->isInlineConstant(RawValue.value())) {
3591                 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3592                                                 MVT::i16);
3593                 return true;
3594               }
3595             } else
3596               llvm_unreachable("unknown 16-bit type");
3597           }
3598         }
3599     }
3600 
3601   return false;
3602 }
3603 
SelectSWMMACIndex8(SDValue In,SDValue & Src,SDValue & IndexKey) const3604 bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3605                                             SDValue &IndexKey) const {
3606   unsigned Key = 0;
3607   Src = In;
3608 
3609   if (In.getOpcode() == ISD::SRL) {
3610     const llvm::SDValue &ShiftSrc = In.getOperand(0);
3611     ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3612     if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3613         ShiftAmt->getZExtValue() % 8 == 0) {
3614       Key = ShiftAmt->getZExtValue() / 8;
3615       Src = ShiftSrc;
3616     }
3617   }
3618 
3619   IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3620   return true;
3621 }
3622 
SelectSWMMACIndex16(SDValue In,SDValue & Src,SDValue & IndexKey) const3623 bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3624                                              SDValue &IndexKey) const {
3625   unsigned Key = 0;
3626   Src = In;
3627 
3628   if (In.getOpcode() == ISD::SRL) {
3629     const llvm::SDValue &ShiftSrc = In.getOperand(0);
3630     ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3631     if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3632         ShiftAmt->getZExtValue() == 16) {
3633       Key = 1;
3634       Src = ShiftSrc;
3635     }
3636   }
3637 
3638   IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3639   return true;
3640 }
3641 
SelectVOP3OpSel(SDValue In,SDValue & Src,SDValue & SrcMods) const3642 bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3643                                          SDValue &SrcMods) const {
3644   Src = In;
3645   // FIXME: Handle op_sel
3646   SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3647   return true;
3648 }
3649 
SelectVOP3OpSelMods(SDValue In,SDValue & Src,SDValue & SrcMods) const3650 bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3651                                              SDValue &SrcMods) const {
3652   // FIXME: Handle op_sel
3653   return SelectVOP3Mods(In, Src, SrcMods);
3654 }
3655 
3656 // The return value is not whether the match is possible (which it always is),
3657 // but whether or not it a conversion is really used.
SelectVOP3PMadMixModsImpl(SDValue In,SDValue & Src,unsigned & Mods) const3658 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3659                                                    unsigned &Mods) const {
3660   Mods = 0;
3661   SelectVOP3ModsImpl(In, Src, Mods);
3662 
3663   if (Src.getOpcode() == ISD::FP_EXTEND) {
3664     Src = Src.getOperand(0);
3665     assert(Src.getValueType() == MVT::f16);
3666     Src = stripBitcast(Src);
3667 
3668     // Be careful about folding modifiers if we already have an abs. fneg is
3669     // applied last, so we don't want to apply an earlier fneg.
3670     if ((Mods & SISrcMods::ABS) == 0) {
3671       unsigned ModsTmp;
3672       SelectVOP3ModsImpl(Src, Src, ModsTmp);
3673 
3674       if ((ModsTmp & SISrcMods::NEG) != 0)
3675         Mods ^= SISrcMods::NEG;
3676 
3677       if ((ModsTmp & SISrcMods::ABS) != 0)
3678         Mods |= SISrcMods::ABS;
3679     }
3680 
3681     // op_sel/op_sel_hi decide the source type and source.
3682     // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
3683     // If the sources's op_sel is set, it picks the high half of the source
3684     // register.
3685 
3686     Mods |= SISrcMods::OP_SEL_1;
3687     if (isExtractHiElt(Src, Src)) {
3688       Mods |= SISrcMods::OP_SEL_0;
3689 
3690       // TODO: Should we try to look for neg/abs here?
3691     }
3692 
3693     // Prevent unnecessary subreg COPY to VGPR_16
3694     if (Src.getOpcode() == ISD::TRUNCATE &&
3695         Src.getOperand(0).getValueType() == MVT::i32) {
3696       Src = Src.getOperand(0);
3697     }
3698     return true;
3699   }
3700 
3701   return false;
3702 }
3703 
SelectVOP3PMadMixModsExt(SDValue In,SDValue & Src,SDValue & SrcMods) const3704 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3705                                                   SDValue &SrcMods) const {
3706   unsigned Mods = 0;
3707   if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3708     return false;
3709   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3710   return true;
3711 }
3712 
SelectVOP3PMadMixMods(SDValue In,SDValue & Src,SDValue & SrcMods) const3713 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3714                                                SDValue &SrcMods) const {
3715   unsigned Mods = 0;
3716   SelectVOP3PMadMixModsImpl(In, Src, Mods);
3717   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3718   return true;
3719 }
3720 
3721 // Match BITOP3 operation and return a number of matched instructions plus
3722 // truth table.
BitOp3_Op(SDValue In,SmallVectorImpl<SDValue> & Src)3723 static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
3724                                               SmallVectorImpl<SDValue> &Src) {
3725   unsigned NumOpcodes = 0;
3726   uint8_t LHSBits, RHSBits;
3727 
3728   auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
3729     // Define truth table given Src0, Src1, Src2 bits permutations:
3730     //                          0     0     0
3731     //                          0     0     1
3732     //                          0     1     0
3733     //                          0     1     1
3734     //                          1     0     0
3735     //                          1     0     1
3736     //                          1     1     0
3737     //                          1     1     1
3738     const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3739 
3740     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
3741       if (C->isAllOnes()) {
3742         Bits = 0xff;
3743         return true;
3744       }
3745       if (C->isZero()) {
3746         Bits = 0;
3747         return true;
3748       }
3749     }
3750 
3751     for (unsigned I = 0; I < Src.size(); ++I) {
3752       // Try to find existing reused operand
3753       if (Src[I] == Op) {
3754         Bits = SrcBits[I];
3755         return true;
3756       }
3757       // Try to replace parent operator
3758       if (Src[I] == In) {
3759         Bits = SrcBits[I];
3760         Src[I] = Op;
3761         return true;
3762       }
3763     }
3764 
3765     if (Src.size() == 3) {
3766       // No room left for operands. Try one last time, there can be a 'not' of
3767       // one of our source operands. In this case we can compute the bits
3768       // without growing Src vector.
3769       if (Op.getOpcode() == ISD::XOR) {
3770         if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3771           if (C->isAllOnes()) {
3772             SDValue LHS = Op.getOperand(0);
3773             for (unsigned I = 0; I < Src.size(); ++I) {
3774               if (Src[I] == LHS) {
3775                 Bits = ~SrcBits[I];
3776                 return true;
3777               }
3778             }
3779           }
3780         }
3781       }
3782 
3783       return false;
3784     }
3785 
3786     Bits = SrcBits[Src.size()];
3787     Src.push_back(Op);
3788     return true;
3789   };
3790 
3791   switch (In.getOpcode()) {
3792   case ISD::AND:
3793   case ISD::OR:
3794   case ISD::XOR: {
3795     SDValue LHS = In.getOperand(0);
3796     SDValue RHS = In.getOperand(1);
3797 
3798     SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
3799     if (!getOperandBits(LHS, LHSBits) ||
3800         !getOperandBits(RHS, RHSBits)) {
3801       Src = Backup;
3802       return std::make_pair(0, 0);
3803     }
3804 
3805     // Recursion is naturally limited by the size of the operand vector.
3806     auto Op = BitOp3_Op(LHS, Src);
3807     if (Op.first) {
3808       NumOpcodes += Op.first;
3809       LHSBits = Op.second;
3810     }
3811 
3812     Op = BitOp3_Op(RHS, Src);
3813     if (Op.first) {
3814       NumOpcodes += Op.first;
3815       RHSBits = Op.second;
3816     }
3817     break;
3818   }
3819   default:
3820     return std::make_pair(0, 0);
3821   }
3822 
3823   uint8_t TTbl;
3824   switch (In.getOpcode()) {
3825   case ISD::AND:
3826     TTbl = LHSBits & RHSBits;
3827     break;
3828   case ISD::OR:
3829     TTbl = LHSBits | RHSBits;
3830     break;
3831   case ISD::XOR:
3832     TTbl = LHSBits ^ RHSBits;
3833     break;
3834   default:
3835     break;
3836   }
3837 
3838   return std::make_pair(NumOpcodes + 1, TTbl);
3839 }
3840 
SelectBITOP3(SDValue In,SDValue & Src0,SDValue & Src1,SDValue & Src2,SDValue & Tbl) const3841 bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
3842                                       SDValue &Src2, SDValue &Tbl) const {
3843   SmallVector<SDValue, 3> Src;
3844   uint8_t TTbl;
3845   unsigned NumOpcodes;
3846 
3847   std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
3848 
3849   // Src.empty() case can happen if all operands are all zero or all ones.
3850   // Normally it shall be optimized out before reaching this.
3851   if (NumOpcodes < 2 || Src.empty())
3852     return false;
3853 
3854   // For a uniform case threshold should be higher to account for moves between
3855   // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
3856   // and a readtfirstlane after.
3857   if (NumOpcodes < 4 && !In->isDivergent())
3858     return false;
3859 
3860   if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
3861     // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3862     // asm more readable. This cannot be modeled with AddedComplexity because
3863     // selector does not know how many operations did we match.
3864     if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
3865         (In.getOperand(0).getOpcode() == In.getOpcode() ||
3866          In.getOperand(1).getOpcode() == In.getOpcode()))
3867       return false;
3868 
3869     if (In.getOpcode() == ISD::OR &&
3870         (In.getOperand(0).getOpcode() == ISD::AND ||
3871          In.getOperand(1).getOpcode() == ISD::AND))
3872       return false;
3873   }
3874 
3875   // Last operand can be ignored, turning a ternary operation into a binary.
3876   // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3877   // 'c' with 'a' here without changing the answer. In some pathological
3878   // cases it should be possible to get an operation with a single operand
3879   // too if optimizer would not catch it.
3880   while (Src.size() < 3)
3881     Src.push_back(Src[0]);
3882 
3883   Src0 = Src[0];
3884   Src1 = Src[1];
3885   Src2 = Src[2];
3886 
3887   Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
3888   return true;
3889 }
3890 
getHi16Elt(SDValue In) const3891 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3892   if (In.isUndef())
3893     return CurDAG->getUNDEF(MVT::i32);
3894 
3895   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3896     SDLoc SL(In);
3897     return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3898   }
3899 
3900   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3901     SDLoc SL(In);
3902     return CurDAG->getConstant(
3903       C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3904   }
3905 
3906   SDValue Src;
3907   if (isExtractHiElt(In, Src))
3908     return Src;
3909 
3910   return SDValue();
3911 }
3912 
isVGPRImm(const SDNode * N) const3913 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3914   assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
3915 
3916   const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
3917   const SIInstrInfo *SII = Subtarget->getInstrInfo();
3918 
3919   unsigned Limit = 0;
3920   bool AllUsesAcceptSReg = true;
3921   for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3922     Limit < 10 && U != E; ++U, ++Limit) {
3923     const TargetRegisterClass *RC =
3924         getOperandRegClass(U->getUser(), U->getOperandNo());
3925 
3926     // If the register class is unknown, it could be an unknown
3927     // register class that needs to be an SGPR, e.g. an inline asm
3928     // constraint
3929     if (!RC || SIRI->isSGPRClass(RC))
3930       return false;
3931 
3932     if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3933       AllUsesAcceptSReg = false;
3934       SDNode *User = U->getUser();
3935       if (User->isMachineOpcode()) {
3936         unsigned Opc = User->getMachineOpcode();
3937         const MCInstrDesc &Desc = SII->get(Opc);
3938         if (Desc.isCommutable()) {
3939           unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
3940           unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3941           if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3942             unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3943             const TargetRegisterClass *CommutedRC =
3944                 getOperandRegClass(U->getUser(), CommutedOpNo);
3945             if (CommutedRC == &AMDGPU::VS_32RegClass ||
3946                 CommutedRC == &AMDGPU::VS_64RegClass)
3947               AllUsesAcceptSReg = true;
3948           }
3949         }
3950       }
3951       // If "AllUsesAcceptSReg == false" so far we haven't succeeded
3952       // commuting current user. This means have at least one use
3953       // that strictly require VGPR. Thus, we will not attempt to commute
3954       // other user instructions.
3955       if (!AllUsesAcceptSReg)
3956         break;
3957     }
3958   }
3959   return !AllUsesAcceptSReg && (Limit < 10);
3960 }
3961 
isUniformLoad(const SDNode * N) const3962 bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
3963   const auto *Ld = cast<LoadSDNode>(N);
3964 
3965   const MachineMemOperand *MMO = Ld->getMemOperand();
3966   if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO))
3967     return false;
3968 
3969   return MMO->getSize().hasValue() &&
3970          Ld->getAlign() >=
3971              Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
3972                             uint64_t(4))) &&
3973          ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3974            Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3975           (Subtarget->getScalarizeGlobalBehavior() &&
3976            Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3977            Ld->isSimple() &&
3978            static_cast<const SITargetLowering *>(getTargetLowering())
3979                ->isMemOpHasNoClobberedMemOperand(N)));
3980 }
3981 
PostprocessISelDAG()3982 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3983   const AMDGPUTargetLowering& Lowering =
3984     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3985   bool IsModified = false;
3986   do {
3987     IsModified = false;
3988 
3989     // Go over all selected nodes and try to fold them a bit more
3990     SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3991     while (Position != CurDAG->allnodes_end()) {
3992       SDNode *Node = &*Position++;
3993       MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3994       if (!MachineNode)
3995         continue;
3996 
3997       SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3998       if (ResNode != Node) {
3999         if (ResNode)
4000           ReplaceUses(Node, ResNode);
4001         IsModified = true;
4002       }
4003     }
4004     CurDAG->RemoveDeadNodes();
4005   } while (IsModified);
4006 }
4007 
AMDGPUDAGToDAGISelLegacy(TargetMachine & TM,CodeGenOptLevel OptLevel)4008 AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
4009                                                    CodeGenOptLevel OptLevel)
4010     : SelectionDAGISelLegacy(
4011           ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}
4012 
4013 char AMDGPUDAGToDAGISelLegacy::ID = 0;
4014